aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVitaly Buka <vitalybuka@google.com>2024-06-10 11:47:48 -0700
committerVitaly Buka <vitalybuka@google.com>2024-06-10 11:47:48 -0700
commit68917b378f7cd70374cb19ea2427a696d20ac8b3 (patch)
tree995077bc71b75c3eb04a584ceedd2b5d643e07cd
parentf8dc17608cf7aa14326bc70e343d802eec7f399f (diff)
parent870bfad71a5bc84102374d94812cf063552493b9 (diff)
downloadllvm-users/vitalybuka/spr/main.nfcmsan-extract-handleselectlikeinst.zip
llvm-users/vitalybuka/spr/main.nfcmsan-extract-handleselectlikeinst.tar.gz
llvm-users/vitalybuka/spr/main.nfcmsan-extract-handleselectlikeinst.tar.bz2
[𝘀𝗽𝗿] changes introduced through rebaseusers/vitalybuka/spr/main.nfcmsan-extract-handleselectlikeinst
Created using spr 1.3.4 [skip ci]
-rwxr-xr-x.ci/generate-buildkite-pipeline-premerge5
-rwxr-xr-x.ci/monolithic-linux.sh1
-rw-r--r--.github/workflows/ci-post-commit-analyzer-run.py34
-rw-r--r--.github/workflows/ci-post-commit-analyzer.yml95
-rw-r--r--bolt/include/bolt/Rewrite/DWARFRewriter.h4
-rw-r--r--bolt/lib/Profile/BoltAddressTranslation.cpp2
-rw-r--r--bolt/lib/Rewrite/DWARFRewriter.cpp61
-rw-r--r--clang-tools-extra/clang-query/QueryParser.cpp6
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp11
-rw-r--r--clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp18
-rw-r--r--clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h9
-rw-r--r--clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp25
-rw-r--r--clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp7
-rw-r--r--clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp3
-rw-r--r--clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp168
-rw-r--r--clang-tools-extra/docs/ReleaseNotes.rst11
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/gen-static-analyzer-docs.py2
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/modernize/use-designated-initializers.rst12
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/bugprone/multi-level-implicit-pointer-conversion.cpp12
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/modernize/use-designated-initializers.cpp8
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp29
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-cxx20.cpp31
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp17
-rw-r--r--clang-tools-extra/unittests/clang-tidy/DeclRefExprUtilsTest.cpp48
-rw-r--r--clang/cmake/caches/CrossWinToARMLinux.cmake45
-rw-r--r--clang/examples/PrintFunctionNames/PrintFunctionNames.cpp2
-rw-r--r--clang/include/clang/AST/ASTContext.h3
-rw-r--r--clang/include/clang/AST/TemplateBase.h2
-rw-r--r--clang/include/clang/AST/TemplateName.h2
-rw-r--r--clang/include/clang/Basic/Attr.td27
-rw-r--r--clang/include/clang/Basic/DiagnosticSemaKinds.td5
-rw-r--r--clang/include/clang/Sema/SemaHLSL.h6
-rw-r--r--clang/lib/AST/ASTContext.cpp59
-rw-r--r--clang/lib/AST/ASTDumper.cpp34
-rw-r--r--clang/lib/AST/CMakeLists.txt2
-rw-r--r--clang/lib/AST/ExprConstant.cpp27
-rw-r--r--clang/lib/AST/Interp/ByteCodeExprGen.cpp23
-rw-r--r--clang/lib/AST/Interp/EvalEmitter.cpp30
-rw-r--r--clang/lib/AST/Interp/EvaluationResult.cpp9
-rw-r--r--clang/lib/AST/Interp/Interp.h16
-rw-r--r--clang/lib/AST/Interp/Opcodes.td1
-rw-r--r--clang/lib/AST/TemplateBase.cpp9
-rw-r--r--clang/lib/AST/TemplateName.cpp11
-rw-r--r--clang/lib/Basic/Targets/AArch64.cpp109
-rw-r--r--clang/lib/Basic/Targets/AArch64.h4
-rw-r--r--clang/lib/CodeGen/CGDebugInfo.cpp32
-rw-r--r--clang/lib/CodeGen/CGExprAgg.cpp29
-rw-r--r--clang/lib/CodeGen/CGHLSLRuntime.cpp2
-rw-r--r--clang/lib/CodeGen/CGStmt.cpp3
-rw-r--r--clang/lib/Format/ContinuationIndenter.cpp5
-rw-r--r--clang/lib/Sema/SemaCUDA.cpp41
-rw-r--r--clang/lib/Sema/SemaDeclCXX.cpp36
-rw-r--r--clang/lib/Sema/SemaHLSL.cpp109
-rw-r--r--clang/lib/Sema/SemaOpenMP.cpp25
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp111
-rw-r--r--clang/test/AST/Interp/builtin-align-cxx.cpp3
-rw-r--r--clang/test/AST/Interp/c.c6
-rw-r--r--clang/test/AST/Interp/complex.cpp5
-rw-r--r--clang/test/AST/Interp/const-eval.c9
-rw-r--r--clang/test/Analysis/casts.c4
-rw-r--r--clang/test/Analysis/pointer-sub.c120
-rw-r--r--clang/test/Analysis/ptr-arith.c14
-rw-r--r--clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p3.cpp2
-rw-r--r--clang/test/CodeGen/aarch64-cpu-supports-target.c4
-rw-r--r--clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp2
-rw-r--r--clang/test/CodeGen/aarch64-targetattr.c48
-rw-r--r--clang/test/CodeGen/attr-target-version.c46
-rw-r--r--clang/test/CodeGen/instrument-objc-method.m8
-rw-r--r--clang/test/CodeGen/paren-list-agg-init.cpp44
-rw-r--r--clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp4
-rw-r--r--clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp32
-rw-r--r--clang/test/CodeGenCXX/cxx0x-initializer-references.cpp3
-rw-r--r--clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist-startend.cpp3
-rw-r--r--clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp3
-rw-r--r--clang/test/CodeGenCXX/cxx11-initializer-array-new.cpp42
-rw-r--r--clang/test/CodeGenCXX/partial-destruction.cpp38
-rw-r--r--clang/test/CodeGenCXX/temporaries.cpp5
-rw-r--r--clang/test/CodeGenCXX/value-init.cpp23
-rw-r--r--clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp17
-rw-r--r--clang/test/CodeGenObjC/arc-ternary-op.m10
-rw-r--r--clang/test/CodeGenObjC/arc.m5
-rw-r--r--clang/test/CodeGenObjCXX/arc-exceptions.mm19
-rw-r--r--clang/test/Driver/aarch64-mac-cpus.c2
-rw-r--r--clang/test/OpenMP/distribute_firstprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/distribute_lastprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp20
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp40
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp40
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp40
-rw-r--r--clang/test/OpenMP/distribute_private_codegen.cpp20
-rw-r--r--clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp40
-rw-r--r--clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp40
-rw-r--r--clang/test/OpenMP/distribute_simd_private_codegen.cpp40
-rw-r--r--clang/test/OpenMP/error_unsupport_feature.c8
-rw-r--r--clang/test/OpenMP/for_firstprivate_codegen.cpp5
-rw-r--r--clang/test/OpenMP/for_lastprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/for_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/for_reduction_codegen.cpp14
-rw-r--r--clang/test/OpenMP/for_reduction_codegen_UDR.cpp28
-rw-r--r--clang/test/OpenMP/parallel_copyin_codegen.cpp10
-rw-r--r--clang/test/OpenMP/parallel_firstprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp30
-rw-r--r--clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp10
-rw-r--r--clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp30
-rw-r--r--clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/parallel_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/parallel_reduction_codegen.cpp10
-rw-r--r--clang/test/OpenMP/sections_firstprivate_codegen.cpp5
-rw-r--r--clang/test/OpenMP/sections_lastprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/sections_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/sections_reduction_codegen.cpp10
-rw-r--r--clang/test/OpenMP/simd_private_taskloop_codegen.cpp40
-rw-r--r--clang/test/OpenMP/single_firstprivate_codegen.cpp5
-rw-r--r--clang/test/OpenMP/single_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp10
-rw-r--r--clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp10
-rw-r--r--clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp40
-rw-r--r--clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp20
-rw-r--r--clang/test/OpenMP/target_teams_distribute_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp40
-rw-r--r--clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp20
-rw-r--r--clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp10
-rw-r--r--clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp10
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp40
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp20
-rw-r--r--clang/test/OpenMP/teams_distribute_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp40
-rw-r--r--clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp20
-rw-r--r--clang/test/OpenMP/teams_firstprivate_codegen.cpp20
-rw-r--r--clang/test/OpenMP/teams_generic_loop_private_codegen.cpp10
-rw-r--r--clang/test/OpenMP/teams_private_codegen.cpp20
-rw-r--r--clang/test/OpenMP/threadprivate_codegen.cpp468
-rw-r--r--clang/test/PCH/cxx_paren_init.cpp9
-rw-r--r--clang/test/Preprocessor/aarch64-target-features.c7
-rw-r--r--clang/test/Sema/aarch64-neon-target.c4
-rw-r--r--clang/test/Sema/constexpr-void-cast.c4
-rw-r--r--clang/test/SemaCUDA/function-redclare.cu19
-rw-r--r--clang/test/SemaCXX/complex-folding.cpp61
-rw-r--r--clang/test/SemaCXX/constant-expression-cxx14.cpp2
-rw-r--r--clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp7
-rw-r--r--clang/test/SemaCXX/for-range-examples.cpp1
-rw-r--r--clang/test/SemaCXX/integer-overflow.cpp4
-rw-r--r--clang/test/SemaTemplate/cwg2398.cpp112
-rw-r--r--clang/tools/clang-repl/CMakeLists.txt5
-rw-r--r--clang/unittests/Format/FormatTest.cpp16
-rw-r--r--clang/unittests/Interpreter/InterpreterTest.cpp2
-rwxr-xr-xclang/www/cxx_dr_status.html12
-rw-r--r--compiler-rt/lib/xray/tests/unit/function_call_trie_test.cpp8
-rw-r--r--compiler-rt/lib/xray/tests/unit/profile_collector_test.cpp4
-rw-r--r--compiler-rt/lib/xray/tests/unit/segmented_array_test.cpp8
-rw-r--r--compiler-rt/lib/xray/tests/unit/test_helpers.cpp3
-rw-r--r--compiler-rt/lib/xray/xray_fdr_logging.cpp19
-rw-r--r--compiler-rt/lib/xray/xray_function_call_trie.h20
-rw-r--r--compiler-rt/lib/xray/xray_profile_collector.cpp27
-rw-r--r--compiler-rt/lib/xray/xray_profiling.cpp13
-rw-r--r--compiler-rt/lib/xray/xray_segmented_array.h3
-rw-r--r--compiler-rt/test/dfsan/release_shadow_space.c14
-rw-r--r--compiler-rt/test/tsan/custom_mutex4.cpp8
-rw-r--r--compiler-rt/test/tsan/custom_mutex5.cpp8
-rw-r--r--cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py42
-rw-r--r--flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h6
-rw-r--r--flang/include/flang/Optimizer/Transforms/Passes.h12
-rw-r--r--flang/include/flang/Optimizer/Transforms/Passes.td1
-rw-r--r--flang/include/flang/Runtime/inquiry.h5
-rw-r--r--flang/include/flang/Tools/CLOptions.inc14
-rw-r--r--flang/lib/Lower/Bridge.cpp60
-rw-r--r--flang/lib/Optimizer/Builder/HLFIRTools.cpp13
-rw-r--r--flang/lib/Optimizer/Builder/IntrinsicCall.cpp34
-rw-r--r--flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp14
-rw-r--r--flang/lib/Optimizer/Transforms/FunctionAttr.cpp20
-rw-r--r--flang/runtime/inquiry.cpp16
-rw-r--r--flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f9012
-rw-r--r--flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f9056
-rw-r--r--flang/test/Lower/HLFIR/assumed-rank-inquiries.f9046
-rw-r--r--flang/test/Lower/HLFIR/convert-variable-assumed-rank.f904
-rw-r--r--flang/test/Lower/HLFIR/select-rank.f9030
-rw-r--r--flang/test/Lower/OpenMP/function-filtering-2.f906
-rw-r--r--flang/test/Lower/loops3.f9023
-rw-r--r--flang/test/Transforms/debug-local-var-2.f9016
-rw-r--r--flang/unittests/Runtime/Inquiry.cpp47
-rw-r--r--libc/config/linux/aarch64/entrypoints.txt10
-rw-r--r--libc/config/linux/arm/entrypoints.txt1
-rw-r--r--libc/config/linux/riscv/entrypoints.txt1
-rw-r--r--libc/config/linux/x86_64/entrypoints.txt10
-rw-r--r--libc/config/windows/entrypoints.txt1
-rw-r--r--libc/docs/c23.rst2
-rw-r--r--libc/docs/math/index.rst18
-rw-r--r--libc/include/llvm-libc-macros/float16-macros.h3
-rw-r--r--libc/include/llvm-libc-types/CMakeLists.txt2
-rw-r--r--libc/spec/spec.td1
-rw-r--r--libc/spec/stdc.td16
-rw-r--r--libc/src/__support/FPUtil/NormalFloat.h13
-rw-r--r--libc/src/__support/big_int.h6
-rw-r--r--libc/src/math/CMakeLists.txt11
-rw-r--r--libc/src/math/fmul.h18
-rw-r--r--libc/src/math/frexpf16.h20
-rw-r--r--libc/src/math/generic/CMakeLists.txt145
-rw-r--r--libc/src/math/generic/fmul.cpp128
-rw-r--r--libc/src/math/generic/frexpf16.cpp19
-rw-r--r--libc/src/math/generic/ilogbf16.cpp19
-rw-r--r--libc/src/math/generic/llogbf16.cpp19
-rw-r--r--libc/src/math/generic/logbf16.cpp17
-rw-r--r--libc/src/math/generic/modff16.cpp19
-rw-r--r--libc/src/math/generic/nanf16.cpp23
-rw-r--r--libc/src/math/generic/remainderf16.cpp20
-rw-r--r--libc/src/math/generic/remquof128.cpp19
-rw-r--r--libc/src/math/generic/remquof16.cpp19
-rw-r--r--libc/src/math/ilogbf16.h20
-rw-r--r--libc/src/math/llogbf16.h20
-rw-r--r--libc/src/math/logbf16.h20
-rw-r--r--libc/src/math/modff16.h20
-rw-r--r--libc/src/math/nanf16.h20
-rw-r--r--libc/src/math/remainderf16.h20
-rw-r--r--libc/src/math/remquof128.h20
-rw-r--r--libc/src/math/remquof16.h20
-rw-r--r--libc/test/src/__support/FPUtil/CMakeLists.txt1
-rw-r--r--libc/test/src/__support/FPUtil/dyadic_float_test.cpp4
-rw-r--r--libc/test/src/__support/big_int_test.cpp14
-rw-r--r--libc/test/src/math/smoke/CMakeLists.txt158
-rw-r--r--libc/test/src/math/smoke/FMulTest.h104
-rw-r--r--libc/test/src/math/smoke/FrexpTest.h1
-rw-r--r--libc/test/src/math/smoke/ILogbTest.h18
-rw-r--r--libc/test/src/math/smoke/LogbTest.h11
-rw-r--r--libc/test/src/math/smoke/ModfTest.h11
-rw-r--r--libc/test/src/math/smoke/RemQuoTest.h2
-rw-r--r--libc/test/src/math/smoke/fmul_test.cpp13
-rw-r--r--libc/test/src/math/smoke/frexpf16_test.cpp13
-rw-r--r--libc/test/src/math/smoke/ilogbf16_test.cpp13
-rw-r--r--libc/test/src/math/smoke/llogbf16_test.cpp13
-rw-r--r--libc/test/src/math/smoke/logbf16_test.cpp13
-rw-r--r--libc/test/src/math/smoke/modff16_test.cpp13
-rw-r--r--libc/test/src/math/smoke/nanf16_test.cpp51
-rw-r--r--libc/test/src/math/smoke/remquof128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/remquof16_test.cpp13
-rw-r--r--libcxx/include/CMakeLists.txt2
-rw-r--r--libcxx/include/__chrono/exception.h129
-rw-r--r--libcxx/include/__chrono/time_zone.h85
-rw-r--r--libcxx/include/__chrono/zoned_time.h55
-rw-r--r--libcxx/include/__format/escaped_output_table.h2
-rw-r--r--libcxx/include/__format/width_estimation_table.h2
-rw-r--r--libcxx/include/__type_traits/promote.h42
-rw-r--r--libcxx/include/chrono24
-rw-r--r--libcxx/include/module.modulemap7
-rw-r--r--libcxx/modules/std/chrono.inc13
-rw-r--r--libcxx/src/CMakeLists.txt3
-rw-r--r--libcxx/src/chrono_exception.cpp22
-rw-r--r--libcxx/src/time_zone.cpp147
-rw-r--r--libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp11
-rw-r--r--libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.ambig/assert.ctor.pass.cpp53
-rw-r--r--libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.nonexist/assert.ctor.pass.cpp53
-rw-r--r--libcxx/test/libcxx/time/time.zone/time.zone.timezone/choose.pass.cpp37
-rw-r--r--libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_local.pass.cpp40
-rw-r--r--libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys.pass.cpp39
-rw-r--r--libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys_choose.pass.cpp41
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx03.csv3
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx11.csv3
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx14.csv3
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx17.csv3
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx20.csv9
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx23.csv12
-rw-r--r--libcxx/test/libcxx/transitive_includes/cxx26.csv35
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp6
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp6
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/default.pass.cpp6
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.ambig/ctor.pass.cpp171
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.ambig/types.pass.cpp50
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.nonexist/ctor.pass.cpp172
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.nonexist/types.pass.cpp50
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.local_time.pass.cpp1304
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_local.pass.cpp68
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp237
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp147
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.zonedtraits/const_time_zone_default_zone.pass.cpp36
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.zonedtraits/const_time_zone_locate_zone.pass.cpp45
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.zonedtraits/types.compile.pass.cpp33
-rwxr-xr-xlibcxx/utils/generate_escaped_output_table.py2
-rw-r--r--libcxx/utils/generate_width_estimation_table.py2
-rw-r--r--libunwind/test/floatregister.pass.cpp19
-rw-r--r--libunwind/test/forceunwind.pass.cpp20
-rw-r--r--libunwind/test/signal_unwind.pass.cpp26
-rw-r--r--libunwind/test/unwind_leaffunction.pass.cpp24
-rw-r--r--lld/COFF/Symbols.h2
-rw-r--r--lld/ELF/Arch/AArch64.cpp13
-rw-r--r--lld/ELF/InputFiles.cpp10
-rw-r--r--lld/ELF/Options.td2
-rw-r--r--lld/ELF/Relocations.cpp10
-rw-r--r--lld/ELF/SyntheticSections.cpp24
-rw-r--r--lld/ELF/SyntheticSections.h15
-rw-r--r--lld/ELF/Writer.cpp35
-rw-r--r--lld/docs/ReleaseNotes.rst3
-rw-r--r--lld/docs/ld.lld.12
-rw-r--r--lld/test/ELF/aarch64-reloc-pauth.s167
-rw-r--r--lld/test/ELF/build-id.s19
-rw-r--r--lld/test/ELF/fatlto/fatlto.test17
-rw-r--r--lldb/bindings/interface/SBCommandInterpreterRunOptionsDocstrings.i3
-rw-r--r--lldb/include/lldb/API/SBCommandInterpreterRunOptions.h8
-rw-r--r--lldb/include/lldb/Interpreter/CommandInterpreter.h16
-rw-r--r--lldb/source/API/SBBreakpoint.cpp2
-rw-r--r--lldb/source/API/SBCommandInterpreterRunOptions.cpp12
-rw-r--r--lldb/source/API/SBTarget.cpp2
-rw-r--r--lldb/source/Breakpoint/Breakpoint.cpp2
-rw-r--r--lldb/source/Breakpoint/BreakpointIDList.cpp2
-rw-r--r--lldb/source/Interpreter/CommandInterpreter.cpp14
-rw-r--r--lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp2
-rw-r--r--lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp5
-rw-r--r--lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp5
-rw-r--r--lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/Makefile8
-rw-r--r--lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/main.cpp27
-rw-r--r--lldb/test/API/functionalities/target-new-solib-notifications/Makefile26
-rw-r--r--lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py48
-rw-r--r--lldb/test/API/functionalities/target-new-solib-notifications/a.cpp3
-rw-r--r--lldb/test/API/functionalities/target-new-solib-notifications/b.cpp1
-rw-r--r--lldb/test/API/functionalities/target-new-solib-notifications/c.cpp1
-rw-r--r--lldb/test/API/functionalities/target-new-solib-notifications/d.cpp1
-rw-r--r--lldb/test/API/functionalities/target-new-solib-notifications/main.cpp22
-rw-r--r--lldb/test/API/python_api/interpreter/TestRunCommandInterpreterAPI.py59
-rw-r--r--lldb/tools/debugserver/source/JSON.cpp9
-rw-r--r--llvm/bindings/ocaml/llvm/llvm.ml5
-rw-r--r--llvm/bindings/ocaml/llvm/llvm.mli12
-rw-r--r--llvm/bindings/ocaml/llvm/llvm_ocaml.c12
-rw-r--r--llvm/cmake/config.guess8
-rw-r--r--llvm/docs/BranchWeightMetadata.rst7
-rw-r--r--llvm/docs/CompileCudaWithLLVM.rst11
-rw-r--r--llvm/docs/ReleaseNotes.rst9
-rw-r--r--llvm/docs/RemoveDIsDebugInfo.md115
-rw-r--r--llvm/docs/TableGen/BackGuide.rst2
-rw-r--r--llvm/docs/TestSuiteGuide.md3
-rw-r--r--llvm/examples/BrainF/BrainF.cpp1
-rw-r--r--llvm/examples/BrainF/BrainFDriver.cpp3
-rw-r--r--llvm/include/llvm-c/Core.h19
-rw-r--r--llvm/include/llvm/ADT/APFloat.h7
-rw-r--r--llvm/include/llvm/AsmParser/LLParser.h1
-rw-r--r--llvm/include/llvm/Frontend/Directive/DirectiveBase.td15
-rw-r--r--llvm/include/llvm/Frontend/OpenACC/ACC.td21
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMP.td109
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAMDGPU.td110
-rw-r--r--llvm/include/llvm/IR/IntrinsicsRISCV.td8
-rw-r--r--llvm/include/llvm/IR/MDBuilder.h11
-rw-r--r--llvm/include/llvm/IR/ProfDataUtils.h17
-rw-r--r--llvm/include/llvm/MC/MCAsmLayout.h20
-rw-r--r--llvm/include/llvm/MC/MCAssembler.h4
-rw-r--r--llvm/include/llvm/MC/MCFragment.h3
-rw-r--r--llvm/include/llvm/MC/MCInst.h2
-rw-r--r--llvm/include/llvm/MC/MCSection.h7
-rw-r--r--llvm/include/llvm/ProfileData/InstrProf.h27
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfReader.h19
-rw-r--r--llvm/include/llvm/ProfileData/MemProf.h29
-rw-r--r--llvm/include/llvm/ProfileData/MemProfReader.h8
-rw-r--r--llvm/include/llvm/ProfileData/SampleProfWriter.h4
-rw-r--r--llvm/include/llvm/Support/GenericLoopInfoImpl.h2
-rw-r--r--llvm/include/llvm/TableGen/DirectiveEmitter.h6
-rw-r--r--llvm/include/llvm/TargetParser/AArch64TargetParser.h111
-rw-r--r--llvm/lib/Analysis/LazyValueInfo.cpp3
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp34
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeReader.cpp2
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp3
-rw-r--r--llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp1
-rw-r--r--llvm/lib/CodeGen/ReplaceWithVeclib.cpp174
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp50
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp25
-rw-r--r--llvm/lib/IR/BasicBlock.cpp2
-rw-r--r--llvm/lib/IR/ConstantFold.cpp56
-rw-r--r--llvm/lib/IR/Core.cpp27
-rw-r--r--llvm/lib/IR/DebugProgramInstruction.cpp4
-rw-r--r--llvm/lib/IR/Function.cpp4
-rw-r--r--llvm/lib/IR/Instruction.cpp19
-rw-r--r--llvm/lib/IR/Instructions.cpp6
-rw-r--r--llvm/lib/IR/MDBuilder.cpp14
-rw-r--r--llvm/lib/IR/Metadata.cpp8
-rw-r--r--llvm/lib/IR/Module.cpp4
-rw-r--r--llvm/lib/IR/ProfDataUtils.cpp40
-rw-r--r--llvm/lib/IR/Verifier.cpp9
-rw-r--r--llvm/lib/MC/MCAssembler.cpp123
-rw-r--r--llvm/lib/MC/MCContext.cpp10
-rw-r--r--llvm/lib/MC/MCExpr.cpp4
-rw-r--r--llvm/lib/MC/MCFragment.cpp62
-rw-r--r--llvm/lib/MC/MCSection.cpp4
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp53
-rw-r--r--llvm/lib/ProfileData/InstrProfWriter.cpp43
-rw-r--r--llvm/lib/ProfileData/MemProf.cpp73
-rw-r--r--llvm/lib/ProfileData/SampleProfWriter.cpp3
-rw-r--r--llvm/lib/Support/CodeGenCoverage.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp80
-rw-r--r--llvm/lib/Target/AArch64/AArch64Processors.td9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp119
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp252
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h6
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp15
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td558
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td103
-rw-r--r--llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp78
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp8
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp11
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h2
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp78
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp5
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h11
-rw-r--r--llvm/lib/Target/Sparc/Sparc.td2
-rw-r--r--llvm/lib/Target/Sparc/SparcASITags.td2
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrAliases.td13
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.td21
-rw-r--r--llvm/lib/Target/Sparc/SparcPrefetchTags.td41
-rw-r--r--llvm/lib/Target/X86/X86InstrCompiler.td18
-rw-r--r--llvm/lib/Target/X86/X86InstrMisc.td2
-rw-r--r--llvm/lib/TargetParser/AArch64TargetParser.cpp51
-rw-r--r--llvm/lib/Transforms/IPO/SampleProfile.cpp7
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp10
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp10
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp35
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp15
-rw-r--r--llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp3
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp5
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp3
-rw-r--r--llvm/lib/Transforms/Scalar/JumpThreading.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp16
-rw-r--r--llvm/lib/Transforms/Scalar/Reassociate.cpp120
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/LoopPeel.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/LoopRotationUtils.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp53
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp13
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h15
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp31
-rw-r--r--llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll100
-rw-r--r--llvm/test/Bindings/OCaml/core.ml2
-rw-r--r--llvm/test/Bindings/llvm-c/debug_info.ll3
-rw-r--r--llvm/test/Bindings/llvm-c/debug_info_new_format.ll3
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll108
-rw-r--r--llvm/test/CodeGen/AArch64/addp-shuffle.ll32
-rw-r--r--llvm/test/CodeGen/AArch64/fcvt-fixed.ll32
-rw-r--r--llvm/test/CodeGen/AArch64/fdiv-const.ll23
-rw-r--r--llvm/test/CodeGen/AArch64/frem-power2.ll135
-rw-r--r--llvm/test/CodeGen/AArch64/neon-dotreduce.ll1122
-rw-r--r--llvm/test/CodeGen/AArch64/neon-extmul.ll451
-rw-r--r--llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll42
-rw-r--r--llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll20
-rw-r--r--llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll20
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-add.ll216
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll21
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll276
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-schedule.ll44
-rw-r--r--llvm/test/CodeGen/AMDGPU/clamp-modifier.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll19
-rw-r--r--llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll1512
-rw-r--r--llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll141
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll188
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll209
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll55
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll133
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll476
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll63
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll104
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll268
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll19
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll (renamed from llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll)37
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll (renamed from llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll)37
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll (renamed from llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll)34
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll65
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.xfail.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.xfail.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll57
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll55
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll13
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll109
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll110
-rw-r--r--llvm/test/CodeGen/AMDGPU/private-memory-r600.ll13
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll92
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll8
-rw-r--r--llvm/test/CodeGen/ARM/arm-half-promote.ll4
-rw-r--r--llvm/test/CodeGen/ARM/frem-power2.ll52
-rw-r--r--llvm/test/CodeGen/ARM/vdiv_combine.ll111
-rw-r--r--llvm/test/CodeGen/ARM/vector-store.ll859
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll1066
-rw-r--r--llvm/test/CodeGen/X86/abs.ll143
-rw-r--r--llvm/test/CodeGen/X86/atomic-fp.ll2323
-rw-r--r--llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll24
-rw-r--r--llvm/test/CodeGen/X86/change-unsafe-fp-math.ll10
-rw-r--r--llvm/test/CodeGen/X86/pr94824.ll19
-rw-r--r--llvm/test/CodeGen/X86/vshift-6.ll4
-rw-r--r--llvm/test/DebugInfo/debug_frame_symbol.ll2
-rw-r--r--llvm/test/DebugInfo/symbolize-gnu-debuglink-no-realpath.test1
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll2
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/calls-only.ll2
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll58
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/X86/sse41-intrinsics-x86.ll18
-rw-r--r--llvm/test/LTO/X86/triple-init2.ll11
-rw-r--r--llvm/test/MC/AsmParser/layout-interdependency.s12
-rw-r--r--llvm/test/MC/Disassembler/Sparc/sparc-v9-asi.txt5
-rw-r--r--llvm/test/MC/Disassembler/Sparc/sparc-v9.txt62
-rw-r--r--llvm/test/MC/ELF/layout-interdependency.s10
-rw-r--r--llvm/test/MC/ELF/relax-recompute-align.s23
-rw-r--r--llvm/test/MC/ELF/subsection-if.s4
-rw-r--r--llvm/test/MC/MachO/relax-recompute-align.s42
-rw-r--r--llvm/test/MC/RISCV/rvi-aliases-valid.s10
-rw-r--r--llvm/test/MC/Sparc/sparc-relocations.s10
-rw-r--r--llvm/test/MC/Sparc/sparc64-ctrl-instructions.s8
-rw-r--r--llvm/test/MC/Sparc/sparcv9-instructions.s152
-rw-r--r--llvm/test/Other/constant-fold-gep.ll2
-rw-r--r--llvm/test/Other/lit-unicode.txt1
-rw-r--r--llvm/test/TableGen/directive1.td23
-rw-r--r--llvm/test/TableGen/directive2.td23
-rw-r--r--llvm/test/Transforms/InstCombine/2008-05-31-AddBool.ll12
-rw-r--r--llvm/test/Transforms/InstCombine/2008-05-31-Bools.ll40
-rw-r--r--llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll661
-rw-r--r--llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll661
-rw-r--r--llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll29
-rw-r--r--llvm/test/Transforms/InstCombine/add.ll40
-rw-r--r--llvm/test/Transforms/InstCombine/and-compare.ll97
-rw-r--r--llvm/test/Transforms/InstCombine/exp2-1.ll29
-rw-r--r--llvm/test/Transforms/InstCombine/exp2-to-ldexp.ll120
-rw-r--r--llvm/test/Transforms/InstCombine/gepgep.ll2
-rw-r--r--llvm/test/Transforms/InstCombine/getelementptr.ll3
-rw-r--r--llvm/test/Transforms/InstCombine/icmp.ll31
-rw-r--r--llvm/test/Transforms/InstCombine/ldexp-zext.ll57
-rw-r--r--llvm/test/Transforms/InstCombine/pow-to-ldexp.ll168
-rw-r--r--llvm/test/Transforms/InstCombine/ptrtoint-nullgep.ll16
-rw-r--r--llvm/test/Transforms/InstCombine/sadd-with-overflow.ll2
-rw-r--r--llvm/test/Transforms/InstCombine/shift.ll80
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll1
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll9
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/cost-model.ll211
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/pr72969.ll1
-rw-r--r--llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll3
-rw-r--r--llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll50
-rw-r--r--llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll24
-rw-r--r--llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll2
-rw-r--r--llvm/test/Transforms/LowerExpectIntrinsic/basic.ll8
-rw-r--r--llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll8
-rw-r--r--llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll5
-rw-r--r--llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll4
-rw-r--r--llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll4
-rw-r--r--llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll2
-rw-r--r--llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll4
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll2
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll18
-rw-r--r--llvm/test/Transforms/Reassociate/repeats.ll45
-rw-r--r--llvm/test/Transforms/SimplifyCFG/RISCV/switch-of-powers-of-two.ll4
-rw-r--r--llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll45
-rw-r--r--llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll542
-rw-r--r--llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll23
-rw-r--r--llvm/tools/llvm-as/llvm-as.cpp7
-rw-r--r--llvm/tools/llvm-c-test/debuginfo.c19
-rw-r--r--llvm/tools/llvm-dis/llvm-dis.cpp2
-rw-r--r--llvm/tools/llvm-link/llvm-link.cpp8
-rw-r--r--llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp22
-rw-r--r--llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp2
-rw-r--r--llvm/unittests/IR/BasicBlockDbgInfoTest.cpp68
-rw-r--r--llvm/unittests/IR/DebugInfoTest.cpp73
-rw-r--r--llvm/unittests/IR/IRBuilderTest.cpp12
-rw-r--r--llvm/unittests/IR/InstructionsTest.cpp6
-rw-r--r--llvm/unittests/IR/ValueTest.cpp9
-rw-r--r--llvm/unittests/ProfileData/MemProfTest.cpp24
-rw-r--r--llvm/unittests/Support/KnownBitsTest.cpp41
-rw-r--r--llvm/unittests/Support/KnownBitsTest.h18
-rw-r--r--llvm/unittests/TargetParser/TargetParserTest.cpp8
-rw-r--r--llvm/unittests/Transforms/Utils/CloningTest.cpp5
-rw-r--r--llvm/unittests/Transforms/Utils/LocalTest.cpp211
-rw-r--r--llvm/utils/TableGen/DirectiveEmitter.cpp31
-rw-r--r--llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/libcxx/include/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/libcxx/src/BUILD.gn3
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn1
-rw-r--r--llvm/utils/lit/tests/xunit-output.py2
-rw-r--r--mlir/include/mlir/Dialect/Arith/Transforms/Passes.td9
-rw-r--r--mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.h3
-rw-r--r--mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td10
-rw-r--r--mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td26
-rw-r--r--mlir/include/mlir/Dialect/GPU/IR/GPUOps.td2
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td5
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h40
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td15
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td113
-rw-r--r--mlir/include/mlir/IR/PatternMatch.h1
-rw-r--r--mlir/include/mlir/Transforms/DialectConversion.h5
-rw-r--r--mlir/lib/Conversion/FuncToEmitC/FuncToEmitC.cpp9
-rw-r--r--mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp11
-rw-r--r--mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp244
-rw-r--r--mlir/lib/Dialect/ArmSME/Transforms/EnableArmStreaming.cpp49
-rw-r--r--mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp7
-rw-r--r--mlir/lib/Dialect/GPU/IR/GPUDialect.cpp7
-rw-r--r--mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp257
-rw-r--r--mlir/lib/Query/QueryParser.cpp6
-rw-r--r--mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp40
-rw-r--r--mlir/python/mlir/ir.py4
-rw-r--r--mlir/test/Conversion/FuncToEmitC/func-to-emitc.mlir16
-rw-r--r--mlir/test/Conversion/IndexToSPIRV/index-to-spirv.mlir (renamed from mlir/test/Conversion/IndexToSPRIV/index-to-spirv.mlir)0
-rw-r--r--mlir/test/Conversion/SCFToEmitC/for.mlir28
-rw-r--r--mlir/test/Dialect/Arith/int-range-interface.mlir2
-rw-r--r--mlir/test/Dialect/Arith/int-range-opts.mlir36
-rw-r--r--mlir/test/Dialect/ArmSME/enable-arm-streaming-invalid.mlir4
-rw-r--r--mlir/test/Dialect/ArmSME/enable-arm-streaming.mlir17
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-callop-interface.mlir19
-rw-r--r--mlir/test/Dialect/GPU/int-range-interface.mlir2
-rw-r--r--mlir/test/Dialect/Index/int-range-inference.mlir2
-rw-r--r--mlir/test/Dialect/SparseTensor/invalid.mlir63
-rw-r--r--mlir/test/Dialect/SparseTensor/roundtrip.mlir31
-rw-r--r--mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir27
-rw-r--r--mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir2
-rw-r--r--mlir/test/Interfaces/InferIntRangeInterface/infer-int-range-test-ops.mlir10
-rw-r--r--mlir/test/Target/Cpp/for.mlir18
-rw-r--r--mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp2
-rw-r--r--mlir/test/lib/Transforms/CMakeLists.txt1
-rw-r--r--mlir/test/lib/Transforms/TestIntRangeInference.cpp125
-rw-r--r--mlir/tools/mlir-opt/mlir-opt.cpp60
-rw-r--r--utils/bazel/llvm-project-overlay/libc/test/src/__support/FPUtil/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel7
644 files changed, 17782 insertions, 12129 deletions
diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index 033ab80..fd603de 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -153,7 +153,6 @@ function exclude-linux() {
for project in ${projects}; do
case ${project} in
cross-project-tests) ;; # tests failing
- lldb) ;; # tests failing
openmp) ;; # https://github.com/google/llvm-premerge-checks/issues/410
*)
echo "${project}"
@@ -170,7 +169,7 @@ function exclude-windows() {
compiler-rt) ;; # tests taking too long
openmp) ;; # TODO: having trouble with the Perl installation
libc) ;; # no Windows support
- lldb) ;; # tests failing
+ lldb) ;; # custom environment requirements (https://github.com/llvm/llvm-project/pull/94208#issuecomment-2146256857)
bolt) ;; # tests are not supported yet
*)
echo "${project}"
@@ -213,7 +212,7 @@ function check-targets() {
echo "check-unwind"
;;
lldb)
- echo "check-all" # TODO: check-lldb may not include all the LLDB tests?
+ echo "check-lldb"
;;
pstl)
echo "check-all"
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index 38d7128..b78dc59 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -39,6 +39,7 @@ targets="${2}"
echo "--- cmake"
pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt
+pip install -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt
cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
-D LLVM_ENABLE_PROJECTS="${projects}" \
-G Ninja \
diff --git a/.github/workflows/ci-post-commit-analyzer-run.py b/.github/workflows/ci-post-commit-analyzer-run.py
new file mode 100644
index 0000000..e5f52d3
--- /dev/null
+++ b/.github/workflows/ci-post-commit-analyzer-run.py
@@ -0,0 +1,34 @@
+import json
+import multiprocessing
+import os
+import re
+import subprocess
+import sys
+
+
+def run_analyzer(data):
+ os.chdir(data["directory"])
+ command = (
+ data["command"]
+ + f" --analyze --analyzer-output html -o analyzer-results -Xclang -analyzer-config -Xclang max-nodes=75000"
+ )
+ print(command)
+ subprocess.run(command, shell=True, check=True)
+
+
+def pool_error(e):
+ print("Error analyzing file:", e)
+
+
+def main():
+ db_path = sys.argv[1]
+ database = json.load(open(db_path))
+
+ with multiprocessing.Pool() as pool:
+ pool.map_async(run_analyzer, [k for k in database], error_callback=pool_error)
+ pool.close()
+ pool.join()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml
new file mode 100644
index 0000000..d614dd0
--- /dev/null
+++ b/.github/workflows/ci-post-commit-analyzer.yml
@@ -0,0 +1,95 @@
+name: Post-Commit Static Analyzer
+
+permissions:
+ contents: read
+
+on:
+ push:
+ branches:
+ - 'release/**'
+ paths:
+ - 'clang/**'
+ - 'llvm/**'
+ - '.github/workflows/ci-post-commit-analyzer.yml'
+ pull_request:
+ types:
+ - opened
+ - synchronize
+ - reopened
+ - closed
+ paths:
+ - '.github/workflows/ci-post-commit-analyzer.yml'
+ - '.github/workflows/ci-post-commit-analyzer-run.py'
+ schedule:
+ - cron: '30 0 * * *'
+
+concurrency:
+ group: >-
+ llvm-project-${{ github.workflow }}-${{ github.event_name == 'pull_request' &&
+ ( github.event.pull_request.number || github.ref) }}
+ cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
+jobs:
+ post-commit-analyzer:
+ if: >-
+ github.repository_owner == 'llvm' &&
+ github.event.action != 'closed'
+ runs-on: ubuntu-22.04
+ container:
+ image: 'ghcr.io/llvm/ci-ubuntu-22.04:latest'
+ env:
+ LLVM_VERSION: 18
+ steps:
+ - name: Checkout Source
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+
+ - name: Setup ccache
+ uses: hendrikmuhs/ccache-action@v1
+ with:
+ # A full build of llvm, clang, lld, and lldb takes about 250MB
+ # of ccache space. There's not much reason to have more than this,
+ # because we usually won't need to save cache entries from older
+ # builds. Also, there is an overall 10GB cache limit, and each
+ # run creates a new cache entry so we want to ensure that we have
+ # enough cache space for all the tests to run at once and still
+ # fit under the 10 GB limit.
+ # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174
+ max-size: 2G
+ key: post-commit-analyzer
+ variant: sccache
+
+ - name: Configure
+ run: |
+ cmake -B build -S llvm -G Ninja \
+ -DLLVM_ENABLE_ASSERTIONS=ON \
+ -DLLVM_ENABLE_PROJECTS=clang \
+ -DLLVM_BUILD_LLVM_DYLIB=ON \
+ -DLLVM_LINK_LLVM_DYLIB=ON \
+ -DCMAKE_CXX_COMPILER=clang++ \
+ -DCMAKE_C_COMPILER=clang \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
+ -DCMAKE_C_COMPILER_LAUNCHER=sccache \
+ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+ -DLLVM_INCLUDE_TESTS=OFF \
+ -DCLANG_INCLUDE_TESTS=OFF \
+ -DCMAKE_BUILD_TYPE=Release
+
+ - name: Build
+ run: |
+ # FIXME: We need to build all the generated header files in order to be able to run
+ # the analyzer on every file. Building libLLVM and libclang is probably overkill for
+ # this, but it's better than building every target.
+ ninja -v -C build libLLVM.so libclang.so
+
+ # Run the analyzer.
+ python3 .github/workflows/ci-post-commit-analyzer-run.py build/compile_commands.json
+
+ scan-build --generate-index-only build/analyzer-results
+
+ - name: Upload Results
+ uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ if: always()
+ with:
+ name: analyzer-results
+ path: 'build/analyzer-results/*'
+
diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index 8dec32de..3cc9d82 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -12,6 +12,7 @@
#include "bolt/Core/DIEBuilder.h"
#include "bolt/Core/DebugData.h"
#include "bolt/Core/DebugNames.h"
+#include "bolt/Core/GDBIndex.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/DWP/DWP.h"
@@ -131,7 +132,8 @@ private:
makeFinalLocListsSection(DWARFVersion Version);
/// Finalize type sections in the main binary.
- CUOffsetMap finalizeTypeSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer);
+ CUOffsetMap finalizeTypeSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer,
+ GDBIndex &GDBIndexSection);
/// Process and write out CUs that are passsed in.
void finalizeCompileUnits(DIEBuilder &DIEBlder, DIEStreamer &Streamer,
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index cdfca2b..519f282 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -304,7 +304,7 @@ std::error_code BoltAddressTranslation::parse(raw_ostream &OS, StringRef Buf) {
StringRef Name = Buf.slice(Offset, Offset + NameSz);
Offset = alignTo(Offset + NameSz, 4);
- if (Name.substr(0, 4) != "BOLT")
+ if (!Name.starts_with("BOLT"))
return make_error_code(llvm::errc::io_error);
Error Err(Error::success());
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 8814ebb..7b62999 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -185,6 +185,7 @@ namespace bolt {
class DIEStreamer : public DwarfStreamer {
DIEBuilder *DIEBldr;
DWARFRewriter &Rewriter;
+ GDBIndex &GDBIndexSection;
private:
/// Emit the compilation unit header for \p Unit in the debug_info
@@ -247,7 +248,7 @@ private:
const uint64_t TypeSignature = cast<DWARFTypeUnit>(Unit).getTypeHash();
DIE *TypeDIE = DIEBldr->getTypeDIE(Unit);
const DIEBuilder::DWARFUnitInfo &UI = DIEBldr->getUnitInfoByDwarfUnit(Unit);
- Rewriter.addGDBTypeUnitEntry(
+ GDBIndexSection.addGDBTypeUnitEntry(
{UI.UnitOffset, TypeSignature, TypeDIE->getOffset()});
if (Unit.getVersion() < 5) {
// Switch the section to .debug_types section.
@@ -279,11 +280,12 @@ private:
public:
DIEStreamer(DIEBuilder *DIEBldr, DWARFRewriter &Rewriter,
+ GDBIndex &GDBIndexSection,
DWARFLinkerBase::OutputFileType OutFileType,
raw_pwrite_stream &OutFile,
DWARFLinkerBase::MessageHandlerTy Warning)
: DwarfStreamer(OutFileType, OutFile, Warning), DIEBldr(DIEBldr),
- Rewriter(Rewriter){};
+ Rewriter(Rewriter), GDBIndexSection(GDBIndexSection) {};
using DwarfStreamer::emitCompileUnitHeader;
@@ -326,12 +328,11 @@ static cl::opt<bool> KeepARanges(
"keep or generate .debug_aranges section if .gdb_index is written"),
cl::Hidden, cl::cat(BoltCategory));
-static cl::opt<bool>
-DeterministicDebugInfo("deterministic-debuginfo",
- cl::desc("disables parallel execution of tasks that may produce "
- "nondeterministic debug info"),
- cl::init(true),
- cl::cat(BoltCategory));
+static cl::opt<bool> DeterministicDebugInfo(
+ "deterministic-debuginfo",
+ cl::desc("disables parallel execution of tasks that may produce "
+ "nondeterministic debug info"),
+ cl::init(true), cl::cat(BoltCategory));
static cl::opt<std::string> DwarfOutputPath(
"dwarf-output-path",
@@ -460,10 +461,11 @@ static std::optional<uint64_t> getAsAddress(const DWARFUnit &DU,
static std::unique_ptr<DIEStreamer>
createDIEStreamer(const Triple &TheTriple, raw_pwrite_stream &OutFile,
StringRef Swift5ReflectionSegmentName, DIEBuilder &DIEBldr,
- DWARFRewriter &Rewriter) {
+ DWARFRewriter &Rewriter, GDBIndex &GDBIndexSection) {
std::unique_ptr<DIEStreamer> Streamer = std::make_unique<DIEStreamer>(
- &DIEBldr, Rewriter, DWARFLinkerBase::OutputFileType::Object, OutFile,
+ &DIEBldr, Rewriter, GDBIndexSection,
+ DWARFLinkerBase::OutputFileType::Object, OutFile,
[&](const Twine &Warning, StringRef Context, const DWARFDie *) {});
Error Err = Streamer->init(TheTriple, Swift5ReflectionSegmentName);
if (Err)
@@ -484,13 +486,12 @@ emitUnit(DIEBuilder &DIEBldr, DIEStreamer &Streamer, DWARFUnit &Unit) {
return {U.UnitOffset, U.UnitLength, TypeHash};
}
-static void emitDWOBuilder(const std::string &DWOName,
- DIEBuilder &DWODIEBuilder, DWARFRewriter &Rewriter,
- DWARFUnit &SplitCU, DWARFUnit &CU,
- DWARFRewriter::DWPState &State,
- DebugLocWriter &LocWriter,
- DebugStrOffsetsWriter &StrOffstsWriter,
- DebugStrWriter &StrWriter) {
+static void
+emitDWOBuilder(const std::string &DWOName, DIEBuilder &DWODIEBuilder,
+ DWARFRewriter &Rewriter, DWARFUnit &SplitCU, DWARFUnit &CU,
+ DWARFRewriter::DWPState &State, DebugLocWriter &LocWriter,
+ DebugStrOffsetsWriter &StrOffstsWriter,
+ DebugStrWriter &StrWriter, GDBIndex &GDBIndexSection) {
// Populate debug_info and debug_abbrev for current dwo into StringRef.
DWODIEBuilder.generateAbbrevs();
DWODIEBuilder.finish();
@@ -500,8 +501,9 @@ static void emitDWOBuilder(const std::string &DWOName,
std::make_shared<raw_svector_ostream>(OutBuffer);
const object::ObjectFile *File = SplitCU.getContext().getDWARFObj().getFile();
auto TheTriple = std::make_unique<Triple>(File->makeTriple());
- std::unique_ptr<DIEStreamer> Streamer = createDIEStreamer(
- *TheTriple, *ObjOS, "DwoStreamerInitAug2", DWODIEBuilder, Rewriter);
+ std::unique_ptr<DIEStreamer> Streamer =
+ createDIEStreamer(*TheTriple, *ObjOS, "DwoStreamerInitAug2",
+ DWODIEBuilder, Rewriter, GDBIndexSection);
DWARFRewriter::UnitMetaVectorType TUMetaVector;
DWARFRewriter::UnitMeta CUMI = {0, 0, 0};
if (SplitCU.getContext().getMaxDWOVersion() >= 5) {
@@ -652,6 +654,7 @@ void DWARFRewriter::updateDebugInfo() {
DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
*StrWriter);
+ GDBIndex GDBIndexSection(BC);
DWPState State;
if (opts::WriteDWP)
initDWPState(State);
@@ -704,7 +707,8 @@ void DWARFRewriter::updateDebugInfo() {
TempRangesSectionWriter->finalizeSection();
emitDWOBuilder(DWOName, DWODIEBuilder, *this, **SplitCU, *Unit, State,
- DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter);
+ DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
+ GDBIndexSection);
}
if (Unit->getVersion() >= 5) {
@@ -729,9 +733,10 @@ void DWARFRewriter::updateDebugInfo() {
std::make_unique<raw_svector_ostream>(OutBuffer);
const object::ObjectFile *File = BC.DwCtx->getDWARFObj().getFile();
auto TheTriple = std::make_unique<Triple>(File->makeTriple());
- std::unique_ptr<DIEStreamer> Streamer =
- createDIEStreamer(*TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this);
- CUOffsetMap OffsetMap = finalizeTypeSections(DIEBlder, *Streamer);
+ std::unique_ptr<DIEStreamer> Streamer = createDIEStreamer(
+ *TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this, GDBIndexSection);
+ CUOffsetMap OffsetMap =
+ finalizeTypeSections(DIEBlder, *Streamer, GDBIndexSection);
const bool SingleThreadedMode =
opts::NoThreads || opts::DeterministicDebugInfo;
@@ -761,7 +766,8 @@ void DWARFRewriter::updateDebugInfo() {
finalizeDebugSections(DIEBlder, DebugNamesTable, *Streamer, *ObjOS,
OffsetMap);
- updateGdbIndexSection(OffsetMap, CUIndex);
+ GDBIndexSection.updateGdbIndexSection(OffsetMap, CUIndex,
+ *ARangesSectionWriter);
}
void DWARFRewriter::updateUnitDebugInfo(
@@ -1429,7 +1435,8 @@ void DWARFRewriter::updateLineTableOffsets(const MCAsmLayout &Layout) {
}
CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
- DIEStreamer &Streamer) {
+ DIEStreamer &Streamer,
+ GDBIndex &GDBIndexSection) {
// update TypeUnit DW_AT_stmt_list with new .debug_line information.
auto updateLineTable = [&](const DWARFUnit &Unit) -> void {
DIE *UnitDIE = DIEBlder.getUnitDIEbyUnit(Unit);
@@ -1449,8 +1456,8 @@ CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
std::make_shared<raw_svector_ostream>(OutBuffer);
const object::ObjectFile *File = BC.DwCtx->getDWARFObj().getFile();
auto TheTriple = std::make_unique<Triple>(File->makeTriple());
- std::unique_ptr<DIEStreamer> TypeStreamer =
- createDIEStreamer(*TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this);
+ std::unique_ptr<DIEStreamer> TypeStreamer = createDIEStreamer(
+ *TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this, GDBIndexSection);
// generate debug_info and CUMap
CUOffsetMap CUMap;
diff --git a/clang-tools-extra/clang-query/QueryParser.cpp b/clang-tools-extra/clang-query/QueryParser.cpp
index 1d0b7d9..97cb264 100644
--- a/clang-tools-extra/clang-query/QueryParser.cpp
+++ b/clang-tools-extra/clang-query/QueryParser.cpp
@@ -144,13 +144,11 @@ QueryRef QueryParser::endQuery(QueryRef Q) {
StringRef Extra = Line;
StringRef ExtraTrimmed = Extra.ltrim(" \t\v\f\r");
- if ((!ExtraTrimmed.empty() && ExtraTrimmed[0] == '\n') ||
- (ExtraTrimmed.size() >= 2 && ExtraTrimmed[0] == '\r' &&
- ExtraTrimmed[1] == '\n'))
+ if (ExtraTrimmed.starts_with('\n') || ExtraTrimmed.starts_with("\r\n"))
Q->RemainingContent = Extra;
else {
StringRef TrailingWord = lexWord();
- if (!TrailingWord.empty() && TrailingWord.front() == '#') {
+ if (TrailingWord.starts_with('#')) {
Line = Line.drop_until([](char c) { return c == '\n'; });
Line = Line.drop_while([](char c) { return c == '\n'; });
return endQuery(Q);
diff --git a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp
index 4dd3cb5..7a989b0 100644
--- a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp
@@ -48,12 +48,21 @@ AST_MATCHER(ImplicitCastExpr, isMultiLevelPointerConversion) {
return SourcePtrLevel != TargetPtrLevel;
}
+AST_MATCHER(QualType, isPointerType) {
+ const QualType Type =
+ Node.getCanonicalType().getNonReferenceType().getUnqualifiedType();
+
+ return !Type.isNull() && Type->isPointerType();
+}
+
} // namespace
void MultiLevelImplicitPointerConversionCheck::registerMatchers(
MatchFinder *Finder) {
Finder->addMatcher(
- implicitCastExpr(hasCastKind(CK_BitCast), isMultiLevelPointerConversion())
+ implicitCastExpr(hasCastKind(CK_BitCast), isMultiLevelPointerConversion(),
+ unless(hasParent(explicitCastExpr(
+ hasDestinationType(isPointerType())))))
.bind("expr"),
this);
}
diff --git a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp
index ebc5338..2a0cc40 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp
@@ -32,6 +32,14 @@ static constexpr bool RestrictToPODTypesDefault = false;
static constexpr char IgnoreMacrosName[] = "IgnoreMacros";
static constexpr bool IgnoreMacrosDefault = true;
+static constexpr char StrictCStandardComplianceName[] =
+ "StrictCStandardCompliance";
+static constexpr bool StrictCStandardComplianceDefault = true;
+
+static constexpr char StrictCppStandardComplianceName[] =
+ "StrictCppStandardCompliance";
+static constexpr bool StrictCppStandardComplianceDefault = true;
+
namespace {
struct Designators {
@@ -97,7 +105,12 @@ UseDesignatedInitializersCheck::UseDesignatedInitializersCheck(
RestrictToPODTypes(
Options.get(RestrictToPODTypesName, RestrictToPODTypesDefault)),
IgnoreMacros(
- Options.getLocalOrGlobal(IgnoreMacrosName, IgnoreMacrosDefault)) {}
+ Options.getLocalOrGlobal(IgnoreMacrosName, IgnoreMacrosDefault)),
+ StrictCStandardCompliance(Options.get(StrictCStandardComplianceName,
+ StrictCStandardComplianceDefault)),
+ StrictCppStandardCompliance(
+ Options.get(StrictCppStandardComplianceName,
+ StrictCppStandardComplianceDefault)) {}
void UseDesignatedInitializersCheck::registerMatchers(MatchFinder *Finder) {
const auto HasBaseWithFields =
@@ -179,6 +192,9 @@ void UseDesignatedInitializersCheck::storeOptions(
IgnoreSingleElementAggregates);
Options.store(Opts, RestrictToPODTypesName, RestrictToPODTypes);
Options.store(Opts, IgnoreMacrosName, IgnoreMacros);
+ Options.store(Opts, StrictCStandardComplianceName, StrictCStandardCompliance);
+ Options.store(Opts, StrictCppStandardComplianceName,
+ StrictCppStandardCompliance);
}
} // namespace clang::tidy::modernize
diff --git a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h
index 0a496f5..79095ad 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h
+++ b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h
@@ -29,10 +29,19 @@ public:
return TK_IgnoreUnlessSpelledInSource;
}
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
+ return LangOpts.CPlusPlus20 || LangOpts.C99 ||
+ (LangOpts.CPlusPlus && !StrictCppStandardCompliance) ||
+ (!LangOpts.CPlusPlus && !LangOpts.ObjC &&
+ !StrictCStandardCompliance);
+ }
+
private:
bool IgnoreSingleElementAggregates;
bool RestrictToPODTypes;
bool IgnoreMacros;
+ bool StrictCStandardCompliance;
+ bool StrictCppStandardCompliance;
};
} // namespace clang::tidy::modernize
diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp
index 9beb185..61240fa 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp
@@ -75,16 +75,16 @@ void recordRemoval(const DeclStmt &Stmt, ASTContext &Context,
}
}
-AST_MATCHER_FUNCTION_P(StatementMatcher, isConstRefReturningMethodCall,
+AST_MATCHER_FUNCTION_P(StatementMatcher,
+ isRefReturningMethodCallWithConstOverloads,
std::vector<StringRef>, ExcludedContainerTypes) {
// Match method call expressions where the `this` argument is only used as
- // const, this will be checked in `check()` part. This returned const
- // reference is highly likely to outlive the local const reference of the
- // variable being declared. The assumption is that the const reference being
- // returned either points to a global static variable or to a member of the
- // called object.
+ // const, this will be checked in `check()` part. This returned reference is
+ // highly likely to outlive the local const reference of the variable being
+ // declared. The assumption is that the reference being returned either points
+ // to a global static variable or to a member of the called object.
const auto MethodDecl =
- cxxMethodDecl(returns(hasCanonicalType(matchers::isReferenceToConst())))
+ cxxMethodDecl(returns(hasCanonicalType(referenceType())))
.bind(MethodDeclId);
const auto ReceiverExpr =
ignoringParenImpCasts(declRefExpr(to(varDecl().bind(ObjectArgId))));
@@ -121,7 +121,7 @@ AST_MATCHER_FUNCTION_P(StatementMatcher, initializerReturnsReferenceToConst,
declRefExpr(to(varDecl(hasLocalStorage()).bind(OldVarDeclId)));
return expr(
anyOf(isConstRefReturningFunctionCall(),
- isConstRefReturningMethodCall(ExcludedContainerTypes),
+ isRefReturningMethodCallWithConstOverloads(ExcludedContainerTypes),
ignoringImpCasts(OldVarDeclRef),
ignoringImpCasts(unaryOperator(hasOperatorName("&"),
hasUnaryOperand(OldVarDeclRef)))));
@@ -259,10 +259,11 @@ void UnnecessaryCopyInitialization::registerMatchers(MatchFinder *Finder) {
.bind("blockStmt");
};
- Finder->addMatcher(LocalVarCopiedFrom(anyOf(isConstRefReturningFunctionCall(),
- isConstRefReturningMethodCall(
- ExcludedContainerTypes))),
- this);
+ Finder->addMatcher(
+ LocalVarCopiedFrom(anyOf(
+ isConstRefReturningFunctionCall(),
+ isRefReturningMethodCallWithConstOverloads(ExcludedContainerTypes))),
+ this);
Finder->addMatcher(LocalVarCopiedFrom(declRefExpr(
to(varDecl(hasLocalStorage()).bind(OldVarDeclId)))),
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index 28f5ead..aa115cd 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -279,6 +279,9 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
hasParent(callExpr()),
hasSourceExpression(binaryOperator(hasAnyOperatorName("==", "!="))));
+ auto IsInCompilerGeneratedFunction = hasAncestor(namedDecl(anyOf(
+ isImplicit(), functionDecl(isDefaulted()), functionTemplateDecl())));
+
Finder->addMatcher(
traverse(TK_AsIs,
implicitCastExpr(
@@ -299,7 +302,7 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
// additional parens in replacement.
optionally(hasParent(stmt().bind("parentStmt"))),
unless(isInTemplateInstantiation()),
- unless(hasAncestor(functionTemplateDecl())))
+ unless(IsInCompilerGeneratedFunction))
.bind("implicitCastToBool")),
this);
@@ -331,7 +334,7 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
anyOf(hasParent(implicitCastExpr().bind("furtherImplicitCast")),
anything()),
unless(isInTemplateInstantiation()),
- unless(hasAncestor(functionTemplateDecl())))),
+ unless(IsInCompilerGeneratedFunction))),
this);
}
diff --git a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp
index 65fd296..64ce94e 100644
--- a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp
@@ -57,7 +57,8 @@ static void addParantheses(const BinaryOperator *BinOp,
int Precedence1 = getPrecedence(BinOp);
int Precedence2 = getPrecedence(ParentBinOp);
- if (ParentBinOp != nullptr && Precedence1 != Precedence2) {
+ if (ParentBinOp != nullptr && Precedence1 != Precedence2 && Precedence1 > 0 &&
+ Precedence2 > 0) {
const clang::SourceLocation StartLoc = BinOp->getBeginLoc();
const clang::SourceLocation EndLoc =
clang::Lexer::getLocForEndOfToken(BinOp->getEndLoc(), 0, SM, LangOpts);
diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp
index a48e45e..a6062cc 100644
--- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp
+++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp
@@ -36,6 +36,116 @@ void extractNodesByIdTo(ArrayRef<BoundNodes> Matches, StringRef ID,
Nodes.insert(Match.getNodeAs<Node>(ID));
}
+// Returns true if both types refer to the same type,
+// ignoring the const-qualifier.
+bool isSameTypeIgnoringConst(QualType A, QualType B) {
+ A = A.getCanonicalType();
+ B = B.getCanonicalType();
+ A.addConst();
+ B.addConst();
+ return A == B;
+}
+
+// Returns true if `D` and `O` have the same parameter types.
+bool hasSameParameterTypes(const CXXMethodDecl &D, const CXXMethodDecl &O) {
+ if (D.getNumParams() != O.getNumParams())
+ return false;
+ for (int I = 0, E = D.getNumParams(); I < E; ++I) {
+ if (!isSameTypeIgnoringConst(D.getParamDecl(I)->getType(),
+ O.getParamDecl(I)->getType()))
+ return false;
+ }
+ return true;
+}
+
+// If `D` has a const-qualified overload with otherwise identical
+// ref-qualifiers and parameter types, returns that overload.
+const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) {
+ assert(!D.isConst());
+
+ DeclContext::lookup_result LookupResult =
+ D.getParent()->lookup(D.getNameInfo().getName());
+ if (LookupResult.isSingleResult()) {
+ // No overload.
+ return nullptr;
+ }
+ for (const Decl *Overload : LookupResult) {
+ const auto *O = dyn_cast<CXXMethodDecl>(Overload);
+ if (O && !O->isDeleted() && O->isConst() &&
+ O->getRefQualifier() == D.getRefQualifier() &&
+ hasSameParameterTypes(D, *O))
+ return O;
+ }
+ return nullptr;
+}
+
+// Returns true if both types are pointers or references to the same type,
+// ignoring the const-qualifier.
+bool pointsToSameTypeIgnoringConst(QualType A, QualType B) {
+ assert(A->isPointerType() || A->isReferenceType());
+ assert(B->isPointerType() || B->isReferenceType());
+ return isSameTypeIgnoringConst(A->getPointeeType(), B->getPointeeType());
+}
+
+// Return true if non-const member function `M` likely does not mutate `*this`.
+//
+// Note that if the member call selects a method/operator `f` that
+// is not const-qualified, then we also consider that the object is
+// not mutated if:
+// - (A) there is a const-qualified overload `cf` of `f` that has
+// the
+// same ref-qualifiers;
+// - (B) * `f` returns a value, or
+// * if `f` returns a `T&`, `cf` returns a `const T&` (up to
+// possible aliases such as `reference` and
+// `const_reference`), or
+// * if `f` returns a `T*`, `cf` returns a `const T*` (up to
+// possible aliases).
+// - (C) the result of the call is not mutated.
+//
+// We assume that `cf` has the same semantics as `f`.
+// For example:
+// - In `std::vector<T> v; const T t = v[...];`, we consider that
+// expression `v[...]` does not mutate `v` as
+// `T& std::vector<T>::operator[]` has a const overload
+// `const T& std::vector<T>::operator[] const`, and the
+// result expression of type `T&` is only used as a `const T&`;
+// - In `std::map<K, V> m; V v = m.at(...);`, we consider
+// `m.at(...)` to be an immutable access for the same reason.
+// However:
+// - In `std::map<K, V> m; const V v = m[...];`, We consider that
+// `m[...]` mutates `m` as `V& std::map<K, V>::operator[]` does
+// not have a const overload.
+// - In `std::vector<T> v; T& t = v[...];`, we consider that
+// expression `v[...]` mutates `v` as the result is kept as a
+// mutable reference.
+//
+// This function checks (A) and (B), but the caller should make sure that the
+// object is not mutated through the return value.
+bool isLikelyShallowConst(const CXXMethodDecl &M) {
+ assert(!M.isConst());
+ // The method can mutate our variable.
+
+ // (A)
+ const CXXMethodDecl *ConstOverload = findConstOverload(M);
+ if (ConstOverload == nullptr) {
+ return false;
+ }
+
+ // (B)
+ const QualType CallTy = M.getReturnType().getCanonicalType();
+ const QualType OverloadTy = ConstOverload->getReturnType().getCanonicalType();
+ if (CallTy->isReferenceType()) {
+ return OverloadTy->isReferenceType() &&
+ pointsToSameTypeIgnoringConst(CallTy, OverloadTy);
+ }
+ if (CallTy->isPointerType()) {
+ return OverloadTy->isPointerType() &&
+ pointsToSameTypeIgnoringConst(CallTy, OverloadTy);
+ }
+ return isSameTypeIgnoringConst(CallTy, OverloadTy);
+}
+
// A matcher that matches DeclRefExprs that are used in ways such that the
// underlying declaration is not modified.
// If the declaration is of pointer type, `Indirections` specifies the level
@@ -54,16 +164,15 @@ void extractNodesByIdTo(ArrayRef<BoundNodes> Matches, StringRef ID,
// matches (A).
//
AST_MATCHER_P(DeclRefExpr, doesNotMutateObject, int, Indirections) {
- // We walk up the parents of the DeclRefExpr recursively until we end up on a
- // parent that cannot modify the underlying object. There are a few kinds of
- // expressions:
- // - Those that cannot be used to mutate the underlying object. We can stop
+ // We walk up the parents of the DeclRefExpr recursively. There are a few
+ // kinds of expressions:
+ // - Those that cannot be used to mutate the underlying variable. We can stop
// recursion there.
- // - Those that can be used to mutate the underlying object in analyzable
+ // - Those that can be used to mutate the underlying variable in analyzable
// ways (such as taking the address or accessing a subobject). We have to
// examine the parents.
// - Those that we don't know how to analyze. In that case we stop there and
- // we assume that they can mutate the underlying expression.
+ //    we assume that they can modify the underlying variable.
struct StackEntry {
StackEntry(const Expr *E, int Indirections)
@@ -90,7 +199,7 @@ AST_MATCHER_P(DeclRefExpr, doesNotMutateObject, int, Indirections) {
assert(Ty->isPointerType());
Ty = Ty->getPointeeType().getCanonicalType();
}
- if (Ty.isConstQualified())
+ if (Ty->isVoidType() || Ty.isConstQualified())
continue;
// Otherwise we have to look at the parents to see how the expression is
@@ -159,11 +268,56 @@ AST_MATCHER_P(DeclRefExpr, doesNotMutateObject, int, Indirections) {
// The method call cannot mutate our variable.
continue;
}
+ if (isLikelyShallowConst(*Method)) {
+ // We still have to check that the object is not modified through
+ // the method's return value (C).
+ const auto MemberParents = Ctx.getParents(*Member);
+ assert(MemberParents.size() == 1);
+ const auto *Call = MemberParents[0].get<CallExpr>();
+ // If `o` is an object of class type and `f` is a member function,
+ // then `o.f` has to be used as part of a call expression.
+ assert(Call != nullptr && "member function has to be called");
+ Stack.emplace_back(
+ Call,
+ Method->getReturnType().getCanonicalType()->isPointerType()
+ ? 1
+ : 0);
+ continue;
+ }
return false;
}
Stack.emplace_back(Member, 0);
continue;
}
+ if (const auto *const OpCall = dyn_cast<CXXOperatorCallExpr>(P)) {
+ // Operator calls have function call syntax. The `*this` parameter
+ // is the first parameter.
+ if (OpCall->getNumArgs() == 0 || OpCall->getArg(0) != Entry.E) {
+ return false;
+ }
+ const auto *const Method =
+ dyn_cast<CXXMethodDecl>(OpCall->getDirectCallee());
+
+ if (Method == nullptr) {
+ // This is not a member operator. Typically, a friend operator. These
+ // are handled like function calls.
+ return false;
+ }
+
+ if (Method->isConst() || Method->isStatic()) {
+ continue;
+ }
+ if (isLikelyShallowConst(*Method)) {
+ // We still have to check that the object is not modified through
+ // the operator's return value (C).
+ Stack.emplace_back(
+ OpCall,
+ Method->getReturnType().getCanonicalType()->isPointerType() ? 1
+ : 0);
+ continue;
+ }
+ return false;
+ }
if (const auto *const Op = dyn_cast<UnaryOperator>(P)) {
switch (Op->getOpcode()) {
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 277a6e7..0c0c106 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -224,6 +224,10 @@ Changes in existing checks
check by ignoring ``__func__`` macro in lambda captures, initializers of
default parameters and nested function declarations.
+- Improved :doc:`bugprone-multi-level-implicit-pointer-conversion
+ <clang-tidy/checks/bugprone/multi-level-implicit-pointer-conversion>` check
+ by ignoring implicit pointer conversions that are part of a cast expression.
+
- Improved :doc:`bugprone-non-zero-enum-to-bool-conversion
<clang-tidy/checks/bugprone/non-zero-enum-to-bool-conversion>` check by
eliminating false positives resulting from direct usage of bitwise operators
@@ -372,8 +376,10 @@ Changes in existing checks
- Improved :doc:`performance-unnecessary-copy-initialization
<clang-tidy/checks/performance/unnecessary-copy-initialization>` check by
detecting more cases of constant access. In particular, pointers can be
- analyzed, se the check now handles the common patterns
+ analyzed, so the check now handles the common patterns
`const auto e = (*vector_ptr)[i]` and `const auto e = vector_ptr->at(i);`.
+ Calls to mutable functions where there exists a `const` overload are also
+ handled.
- Improved :doc:`readability-avoid-return-with-void-value
<clang-tidy/checks/readability/avoid-return-with-void-value>` check by adding
@@ -408,7 +414,8 @@ Changes in existing checks
valid fix suggestions for ``static_cast`` without a preceding space and
fixed problem with duplicate parentheses in double implicit casts. Corrected
the fix suggestions for C23 and later by using C-style casts instead of
- ``static_cast``.
+ ``static_cast``. Fixed false positives in C++20 spaceship operator by ignoring
+ casts in implicit and defaulted functions.
- Improved :doc:`readability-redundant-inline-specifier
<clang-tidy/checks/readability/redundant-inline-specifier>` check to properly
diff --git a/clang-tools-extra/docs/clang-tidy/checks/gen-static-analyzer-docs.py b/clang-tools-extra/docs/clang-tidy/checks/gen-static-analyzer-docs.py
index 6545a39..53ecb60 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/gen-static-analyzer-docs.py
+++ b/clang-tools-extra/docs/clang-tidy/checks/gen-static-analyzer-docs.py
@@ -47,7 +47,7 @@ def get_checkers(checkers_td, checkers_rst):
parent_package_ = package["ParentPackage"]
hidden = (checker["Hidden"] != 0) or (package["Hidden"] != 0)
- while parent_package_ != None:
+ while parent_package_ is not None:
parent_package = table_entries[parent_package_["def"]]
checker_package_prefix = (
parent_package["PackageName"] + "." + checker_package_prefix
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-designated-initializers.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-designated-initializers.rst
index 22f5098..f101cfc 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-designated-initializers.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-designated-initializers.rst
@@ -37,7 +37,7 @@ declaration of ``S``.
Even when compiling in a language version older than C++20, depending on your
compiler, designated initializers are potentially supported. Therefore, the
-check is not restricted to C++20 and newer versions. Check out the options
+check is by default restricted to C99/C++20 and above. Check out the options
``-Wc99-designator`` to get support for mixed designators in initializer list in
C and ``-Wc++20-designator`` for support of designated initializers in older C++
language modes.
@@ -60,3 +60,13 @@ Options
The value `true` specifies that only Plain Old Data (POD) types shall be
checked. This makes the check applicable to even older C++ standards. The
default value is `false`.
+
+.. option:: StrictCStandardCompliance
+
+ When set to `false`, the check will not restrict itself to C99 and above.
+ The default value is `true`.
+
+.. option:: StrictCppStandardCompliance
+
+ When set to `false`, the check will not restrict itself to C++20 and above.
+ The default value is `true`.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/multi-level-implicit-pointer-conversion.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/multi-level-implicit-pointer-conversion.cpp
index 7a56242..6868f9e 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/multi-level-implicit-pointer-conversion.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/multi-level-implicit-pointer-conversion.cpp
@@ -63,3 +63,15 @@ void test()
takeSecondLevelVoidPtr(getSecondLevelVoidPtr());
}
+
+namespace PR93959 {
+ void free(void*);
+
+ void test() {
+ char **p = nullptr;
+ free(p);
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: multilevel pointer conversion from 'char **' to 'void *', please use explicit cast [bugprone-multi-level-implicit-pointer-conversion]
+ free((void *)p);
+ free(static_cast<void *>(p));
+ }
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-designated-initializers.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-designated-initializers.cpp
index 7e5c26e3..9b769ad 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-designated-initializers.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-designated-initializers.cpp
@@ -1,13 +1,13 @@
-// RUN: %check_clang_tidy -std=c++17 %s modernize-use-designated-initializers %t \
+// RUN: %check_clang_tidy -std=c++20 %s modernize-use-designated-initializers %t \
// RUN: -- \
// RUN: -- -fno-delayed-template-parsing
-// RUN: %check_clang_tidy -check-suffixes=,SINGLE-ELEMENT -std=c++17 %s modernize-use-designated-initializers %t \
+// RUN: %check_clang_tidy -check-suffixes=,SINGLE-ELEMENT -std=c++20 %s modernize-use-designated-initializers %t \
// RUN: -- -config="{CheckOptions: {modernize-use-designated-initializers.IgnoreSingleElementAggregates: false}}" \
// RUN: -- -fno-delayed-template-parsing
-// RUN: %check_clang_tidy -check-suffixes=POD -std=c++17 %s modernize-use-designated-initializers %t \
+// RUN: %check_clang_tidy -check-suffixes=POD -std=c++20 %s modernize-use-designated-initializers %t \
// RUN: -- -config="{CheckOptions: {modernize-use-designated-initializers.RestrictToPODTypes: true}}" \
// RUN: -- -fno-delayed-template-parsing
-// RUN: %check_clang_tidy -check-suffixes=,MACROS -std=c++17 %s modernize-use-designated-initializers %t \
+// RUN: %check_clang_tidy -check-suffixes=,MACROS -std=c++20 %s modernize-use-designated-initializers %t \
// RUN: -- -config="{CheckOptions: {modernize-use-designated-initializers.IgnoreMacros: false}}" \
// RUN: -- -fno-delayed-template-parsing
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp
index 92625cc..f259552 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp
@@ -32,6 +32,9 @@ struct ExpensiveToCopyType {
template <typename T>
struct Container {
+ using reference = T&;
+ using const_reference = const T&;
+
bool empty() const;
const T& operator[](int) const;
const T& operator[](int);
@@ -42,8 +45,8 @@ struct Container {
void nonConstMethod();
bool constMethod() const;
- const T& at(int) const;
- T& at(int);
+ reference at(int) const;
+ const_reference at(int);
};
@@ -207,6 +210,28 @@ void PositiveOperatorCallConstValueParam(const Container<ExpensiveToCopyType> C)
VarCopyConstructed.constMethod();
}
+void PositiveOperatorValueParam(Container<ExpensiveToCopyType> C) {
+ const auto AutoAssigned = C[42];
+ // CHECK-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoAssigned'
+ // CHECK-FIXES: const auto& AutoAssigned = C[42];
+ AutoAssigned.constMethod();
+
+ const auto AutoCopyConstructed(C[42]);
+ // CHECK-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoCopyConstructed'
+ // CHECK-FIXES: const auto& AutoCopyConstructed(C[42]);
+ AutoCopyConstructed.constMethod();
+
+ const ExpensiveToCopyType VarAssigned = C.at(42);
+ // CHECK-MESSAGES: [[@LINE-1]]:29: warning: the const qualified variable 'VarAssigned'
+ // CHECK-FIXES: const ExpensiveToCopyType& VarAssigned = C.at(42);
+ VarAssigned.constMethod();
+
+ const ExpensiveToCopyType VarCopyConstructed(C.at(42));
+ // CHECK-MESSAGES: [[@LINE-1]]:29: warning: the const qualified variable 'VarCopyConstructed'
+ // CHECK-FIXES: const ExpensiveToCopyType& VarCopyConstructed(C.at(42));
+ VarCopyConstructed.constMethod();
+}
+
void PositiveOperatorCallConstValueParamAlias(const ExpensiveToCopyContainerAlias C) {
const auto AutoAssigned = C[42];
// CHECK-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoAssigned'
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-cxx20.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-cxx20.cpp
new file mode 100644
index 0000000..13aa5c5
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-cxx20.cpp
@@ -0,0 +1,31 @@
+// RUN: %check_clang_tidy -std=c++20 %s readability-implicit-bool-conversion %t
+
+namespace std {
+struct strong_ordering {
+ int n;
+ constexpr operator int() const { return n; }
+ static const strong_ordering equal, greater, less;
+};
+constexpr strong_ordering strong_ordering::equal = {0};
+constexpr strong_ordering strong_ordering::greater = {1};
+constexpr strong_ordering strong_ordering::less = {-1};
+} // namespace std
+
+namespace PR93409 {
+ struct X
+ {
+ auto operator<=>(const X&) const = default;
+ bool m_b;
+ };
+
+ struct Y
+ {
+ auto operator<=>(const Y&) const = default;
+ X m_x;
+ };
+
+ bool compare(const Y& y1, const Y& y2)
+ {
+ return y1 == y2 || y1 < y2 || y1 > y2;
+ }
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp
index a6045c0..4face0b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp
@@ -140,3 +140,20 @@ void f(){
//CHECK-MESSAGES: :[[@LINE+1]]:13: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
int v = FUN5(0 + 1);
}
+
+namespace PR92516 {
+ void f(int i) {
+ int j, k;
+ for (j = i + 1, k = 0; j < 1; ++j) {}
+ }
+
+ void f2(int i) {
+ int j;
+ for (j = i + 1; j < 1; ++j) {}
+ }
+
+ void f3(int i) {
+ int j;
+ for (j = i + 1, 2; j < 1; ++j) {}
+ }
+}
diff --git a/clang-tools-extra/unittests/clang-tidy/DeclRefExprUtilsTest.cpp b/clang-tools-extra/unittests/clang-tidy/DeclRefExprUtilsTest.cpp
index 3d9f51e..064e04c 100644
--- a/clang-tools-extra/unittests/clang-tidy/DeclRefExprUtilsTest.cpp
+++ b/clang-tools-extra/unittests/clang-tidy/DeclRefExprUtilsTest.cpp
@@ -46,6 +46,7 @@ template <int Indirections> void RunTest(StringRef Snippet) {
StringRef CommonCode = R"(
struct ConstTag{};
struct NonConstTag{};
+ struct Tag1{};
struct S {
void constMethod() const;
@@ -59,6 +60,13 @@ template <int Indirections> void RunTest(StringRef Snippet) {
void operator[](int);
void operator[](int) const;
+ int& at(int);
+ const int& at(int) const;
+ const int& at(Tag1);
+
+ int& weird_overload();
+ const double& weird_overload() const;
+
bool operator==(const S&) const;
int int_member;
@@ -161,9 +169,11 @@ TEST(ConstReferenceDeclRefExprsTest, ConstRefVar) {
useIntConstRef(/*const*/target.int_member);
useIntPtr(/*const*/target.ptr_member);
useIntConstPtr(&/*const*/target.int_member);
+ (void)/*const*/target.at(3);
const S& const_target_ref = /*const*/target;
const S* const_target_ptr = &/*const*/target;
+ (void)/*const*/target.at(3);
}
)");
}
@@ -187,7 +197,7 @@ TEST(ConstReferenceDeclRefExprsTest, ValueVar) {
/*const*/target.staticMethod();
target.nonConstMethod();
/*const*/target(ConstTag{});
- target[42];
+ /*const*/target[42];
/*const*/target(ConstTag{});
target(NonConstTag{});
useRef(target);
@@ -211,6 +221,14 @@ TEST(ConstReferenceDeclRefExprsTest, ValueVar) {
const S& const_target_ref = /*const*/target;
const S* const_target_ptr = &/*const*/target;
S* target_ptr = &target;
+
+ (void)/*const*/target.at(3);
+ ++target.at(3);
+ const int civ = /*const*/target.at(3);
+ const int& cir = /*const*/target.at(3);
+ int& ir = target.at(3);
+ target.at(Tag1{});
+ target.weird_overload();
}
)");
}
@@ -227,7 +245,7 @@ TEST(ConstReferenceDeclRefExprsTest, RefVar) {
/*const*/target.staticMethod();
target.nonConstMethod();
/*const*/target(ConstTag{});
- target[42];
+ /*const*/target[42];
useConstRef((/*const*/target));
(/*const*/target).constMethod();
(void)(/*const*/target == /*const*/target);
@@ -249,6 +267,14 @@ TEST(ConstReferenceDeclRefExprsTest, RefVar) {
const S& const_target_ref = /*const*/target;
const S* const_target_ptr = &/*const*/target;
S* target_ptr = &target;
+
+ (void)/*const*/target.at(3);
+ ++target.at(3);
+ const int civ = /*const*/target.at(3);
+ const int& cir = /*const*/target.at(3);
+ int& ir = target.at(3);
+ target.at(Tag1{});
+ target.weird_overload();
}
)");
}
@@ -266,8 +292,8 @@ TEST(ConstReferenceDeclRefExprsTest, PtrVar) {
/*const*/target->staticMethod();
target->nonConstMethod();
(*/*const*/target)(ConstTag{});
- (*target)[42];
- target->operator[](42);
+ (*/*const*/target)[42];
+ /*const*/target->operator[](42);
useConstRef((*/*const*/target));
(/*const*/target)->constMethod();
(void)(*/*const*/target == */*const*/target);
@@ -284,7 +310,15 @@ TEST(ConstReferenceDeclRefExprsTest, PtrVar) {
const S& const_target_ref = */*const*/target;
const S* const_target_ptr = /*const*/target;
- S* target_ptr = target; // FIXME: we could chect const usage of `target_ptr`.
+ S* target_ptr = target; // FIXME: we could check const usage of `target_ptr`
+
+ (void)/*const*/target->at(3);
+ ++target->at(3);
+ const int civ = /*const*/target->at(3);
+ const int& cir = /*const*/target->at(3);
+ int& ir = target->at(3);
+ target->at(Tag1{});
+ target->weird_overload();
}
)");
}
@@ -319,6 +353,10 @@ TEST(ConstReferenceDeclRefExprsTest, ConstPtrVar) {
const S& const_target_ref = */*const*/target;
const S* const_target_ptr = /*const*/target;
+
+ (void)/*const*/target->at(3);
+ const int civ = /*const*/target->at(3);
+ const int& cir = /*const*/target->at(3);
}
)");
}
diff --git a/clang/cmake/caches/CrossWinToARMLinux.cmake b/clang/cmake/caches/CrossWinToARMLinux.cmake
index 6826d01..e4d0a0c 100644
--- a/clang/cmake/caches/CrossWinToARMLinux.cmake
+++ b/clang/cmake/caches/CrossWinToARMLinux.cmake
@@ -6,21 +6,23 @@
# on Windows platform.
#
# NOTE: the build requires a development ARM Linux root filesystem to use
-# proper target platform depended library and header files:
-# - create <full-path-to-clang-configs> directory and put the clang configuration
-# file named <TOOLCHAIN_TARGET_TRIPLE>.cfg into it.
-# - add the `--sysroot=<path-to-develop-arm-linux-root-fs>` argument into
-# this configuration file.
-# - add other necessary target depended clang arguments there,
-# such as '-mcpu=cortex-a78' & etc.
+# proper target platform dependent library and header files.
+#
+# The build generates a proper clang configuration file with stored
+# --sysroot argument for specified target triple. Also it is possible
+# to specify configuration path via CMake arguments, such as
+# -DCLANG_CONFIG_FILE_USER_DIR=<full-path-to-clang-configs>
+# and/or
+# -DCLANG_CONFIG_FILE_SYSTEM_DIR=<full-path-to-clang-configs>
#
# See more details here: https://clang.llvm.org/docs/UsersManual.html#configuration-files
#
# Configure:
# cmake -G Ninja ^
# -DTOOLCHAIN_TARGET_TRIPLE=aarch64-unknown-linux-gnu ^
+# -DTOOLCHAIN_TARGET_SYSROOTFS=<path-to-develop-arm-linux-root-fs> ^
+# -DTOOLCHAIN_SHARED_LIBS=OFF ^
# -DCMAKE_INSTALL_PREFIX=../install ^
-# -DCLANG_CONFIG_FILE_USER_DIR=<full-path-to-clang-configs> ^
# -DCMAKE_CXX_FLAGS="-D__OPTIMIZE__" ^
# -DREMOTE_TEST_HOST="<hostname>" ^
# -DREMOTE_TEST_USER="<ssh_user_name>" ^
@@ -81,6 +83,20 @@ endif()
message(STATUS "Toolchain target triple: ${TOOLCHAIN_TARGET_TRIPLE}")
+if (DEFINED TOOLCHAIN_TARGET_SYSROOTFS)
+ message(STATUS "Toolchain target sysroot: ${TOOLCHAIN_TARGET_SYSROOTFS}")
+ # Store the --sysroot argument for the compiler-rt test flags.
+ set(sysroot_flags --sysroot='${TOOLCHAIN_TARGET_SYSROOTFS}')
+ # Generate the clang configuration file for the specified target triple
+ # and store --sysroot in this file.
+ file(WRITE "${CMAKE_BINARY_DIR}/bin/${TOOLCHAIN_TARGET_TRIPLE}.cfg" ${sysroot_flags})
+endif()
+
+# Build the shared libraries for libc++/libc++abi/libunwind.
+if (NOT DEFINED TOOLCHAIN_SHARED_LIBS)
+ set(TOOLCHAIN_SHARED_LIBS OFF)
+endif()
+
if (NOT DEFINED LLVM_TARGETS_TO_BUILD)
if ("${TOOLCHAIN_TARGET_TRIPLE}" MATCHES "^(armv|arm32)+")
set(LLVM_TARGETS_TO_BUILD "ARM" CACHE STRING "")
@@ -183,20 +199,21 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_CAN_EXECUTE_TESTS
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_USE_BUILTINS_LIBRARY ON CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_CXX_LIBRARY libcxx CACHE STRING "")
-# Tell Clang to seach C++ headers alongside with the just-built binaries for the C++ compiler-rt tests.
-set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_TEST_COMPILER_CFLAGS "--stdlib=libc++" CACHE STRING "")
-
+# The compiler-rt tests disable the clang configuration files during the execution by setting CLANG_NO_DEFAULT_CONFIG=1
+# and drops the --sysroot from there. Provide it explicitly via the test flags here if a target sysroot has been specified.
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_TEST_COMPILER_CFLAGS "--stdlib=libc++ ${sysroot_flags}" CACHE STRING "")
+
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_USE_COMPILER_RT ON CACHE BOOL "")
-set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_ENABLE_SHARED OFF CACHE BOOL "")
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_ENABLE_SHARED ${TOOLCHAIN_SHARED_LIBS} CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_ENABLE_STATIC_UNWINDER ON CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_USE_COMPILER_RT ON CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS OFF CACHE BOOL "")
-set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_ENABLE_SHARED OFF CACHE BOOL "")
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_ENABLE_SHARED ${TOOLCHAIN_SHARED_LIBS} CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_USE_COMPILER_RT ON CACHE BOOL "")
-set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "")
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_SHARED ${TOOLCHAIN_SHARED_LIBS} CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ABI_VERSION ${LIBCXX_ABI_VERSION} CACHE STRING "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_CXX_ABI "libcxxabi" CACHE STRING "") #!!!
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "")
diff --git a/clang/examples/PrintFunctionNames/PrintFunctionNames.cpp b/clang/examples/PrintFunctionNames/PrintFunctionNames.cpp
index 6509a64..b2b785b 100644
--- a/clang/examples/PrintFunctionNames/PrintFunctionNames.cpp
+++ b/clang/examples/PrintFunctionNames/PrintFunctionNames.cpp
@@ -72,7 +72,7 @@ public:
*sema.LateParsedTemplateMap.find(FD)->second;
sema.LateTemplateParser(sema.OpaqueParser, LPT);
llvm::errs() << "late-parsed-decl: \"" << FD->getNameAsString() << "\"\n";
- }
+ }
}
};
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index a1d1d1c..8bce4812 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -3203,9 +3203,6 @@ public:
/// valid feature names.
ParsedTargetAttr filterFunctionTargetAttrs(const TargetAttr *TD) const;
- std::vector<std::string>
- filterFunctionTargetVersionAttrs(const TargetVersionAttr *TV) const;
-
void getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
const FunctionDecl *) const;
void getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
diff --git a/clang/include/clang/AST/TemplateBase.h b/clang/include/clang/AST/TemplateBase.h
index fea2c8c..0eaa4b0 100644
--- a/clang/include/clang/AST/TemplateBase.h
+++ b/clang/include/clang/AST/TemplateBase.h
@@ -459,7 +459,7 @@ public:
bool IncludeType) const;
/// Debugging aid that dumps the template argument.
- void dump(raw_ostream &Out) const;
+ void dump(raw_ostream &Out, const ASTContext &Context) const;
/// Debugging aid that dumps the template argument to standard error.
void dump() const;
diff --git a/clang/include/clang/AST/TemplateName.h b/clang/include/clang/AST/TemplateName.h
index 489fccb..988a55a 100644
--- a/clang/include/clang/AST/TemplateName.h
+++ b/clang/include/clang/AST/TemplateName.h
@@ -340,7 +340,7 @@ public:
Qualified Qual = Qualified::AsWritten) const;
/// Debugging aid that dumps the template name.
- void dump(raw_ostream &OS) const;
+ void dump(raw_ostream &OS, const ASTContext &Context) const;
/// Debugging aid that dumps the template name to standard
/// error.
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 17d9a71..b70b0c8 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4470,37 +4470,20 @@ def HLSLShader : InheritableAttr {
let Subjects = SubjectList<[HLSLEntry]>;
let LangOpts = [HLSL];
let Args = [
- EnumArgument<"Type", "ShaderType", /*is_string=*/true,
+ EnumArgument<"Type", "llvm::Triple::EnvironmentType", /*is_string=*/true,
["pixel", "vertex", "geometry", "hull", "domain", "compute",
"raygeneration", "intersection", "anyhit", "closesthit",
"miss", "callable", "mesh", "amplification"],
["Pixel", "Vertex", "Geometry", "Hull", "Domain", "Compute",
"RayGeneration", "Intersection", "AnyHit", "ClosestHit",
- "Miss", "Callable", "Mesh", "Amplification"]>
+ "Miss", "Callable", "Mesh", "Amplification"],
+ /*opt=*/0, /*fake=*/0, /*isExternalType=*/1>
];
let Documentation = [HLSLSV_ShaderTypeAttrDocs];
let AdditionalMembers =
[{
- static const unsigned ShaderTypeMaxValue = (unsigned)HLSLShaderAttr::Amplification;
-
- static llvm::Triple::EnvironmentType getTypeAsEnvironment(HLSLShaderAttr::ShaderType ShaderType) {
- switch (ShaderType) {
- case HLSLShaderAttr::Pixel: return llvm::Triple::Pixel;
- case HLSLShaderAttr::Vertex: return llvm::Triple::Vertex;
- case HLSLShaderAttr::Geometry: return llvm::Triple::Geometry;
- case HLSLShaderAttr::Hull: return llvm::Triple::Hull;
- case HLSLShaderAttr::Domain: return llvm::Triple::Domain;
- case HLSLShaderAttr::Compute: return llvm::Triple::Compute;
- case HLSLShaderAttr::RayGeneration: return llvm::Triple::RayGeneration;
- case HLSLShaderAttr::Intersection: return llvm::Triple::Intersection;
- case HLSLShaderAttr::AnyHit: return llvm::Triple::AnyHit;
- case HLSLShaderAttr::ClosestHit: return llvm::Triple::ClosestHit;
- case HLSLShaderAttr::Miss: return llvm::Triple::Miss;
- case HLSLShaderAttr::Callable: return llvm::Triple::Callable;
- case HLSLShaderAttr::Mesh: return llvm::Triple::Mesh;
- case HLSLShaderAttr::Amplification: return llvm::Triple::Amplification;
- }
- llvm_unreachable("unknown enumeration value");
+ static bool isValidShaderType(llvm::Triple::EnvironmentType ShaderType) {
+ return ShaderType >= llvm::Triple::Pixel && ShaderType <= llvm::Triple::Amplification;
}
}];
}
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index de3aa4b..193eae3 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9015,6 +9015,11 @@ def err_cuda_ovl_target : Error<
"cannot overload %select{__device__|__global__|__host__|__host__ __device__}2 function %3">;
def note_cuda_ovl_candidate_target_mismatch : Note<
"candidate template ignored: target attributes do not match">;
+def warn_offload_incompatible_redeclare : Warning<
+ "target-attribute based function overloads are not supported by NVCC and will be treated as a function redeclaration: "
+ "new declaration is %select{__device__|__global__|__host__|__host__ __device__}0 function, "
+ "old declaration is %select{__device__|__global__|__host__|__host__ __device__}1 function">,
+ InGroup<DiagGroup<"nvcc-compat">>, DefaultIgnore;
def err_cuda_device_builtin_surftex_cls_template : Error<
"illegal device builtin %select{surface|texture}0 reference "
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index e145f5e..0e41a72 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -39,7 +39,7 @@ public:
const AttributeCommonInfo &AL, int X,
int Y, int Z);
HLSLShaderAttr *mergeShaderAttr(Decl *D, const AttributeCommonInfo &AL,
- HLSLShaderAttr::ShaderType ShaderType);
+ llvm::Triple::EnvironmentType ShaderType);
HLSLParamModifierAttr *
mergeParamModifierAttr(Decl *D, const AttributeCommonInfo &AL,
HLSLParamModifierAttr::Spelling Spelling);
@@ -48,8 +48,8 @@ public:
void CheckSemanticAnnotation(FunctionDecl *EntryPoint, const Decl *Param,
const HLSLAnnotationAttr *AnnotationAttr);
void DiagnoseAttrStageMismatch(
- const Attr *A, HLSLShaderAttr::ShaderType Stage,
- std::initializer_list<HLSLShaderAttr::ShaderType> AllowedStages);
+ const Attr *A, llvm::Triple::EnvironmentType Stage,
+ std::initializer_list<llvm::Triple::EnvironmentType> AllowedStages);
void DiagnoseAvailabilityViolations(TranslationUnitDecl *TU);
void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL);
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index bf74e56..cd76b8a 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -87,6 +87,7 @@
#include "llvm/Support/MD5.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/TargetParser/Triple.h"
#include <algorithm>
#include <cassert>
@@ -13663,17 +13664,20 @@ QualType ASTContext::getCorrespondingSignedFixedPointType(QualType Ty) const {
}
}
-std::vector<std::string> ASTContext::filterFunctionTargetVersionAttrs(
- const TargetVersionAttr *TV) const {
- assert(TV != nullptr);
- llvm::SmallVector<StringRef, 8> Feats;
- std::vector<std::string> ResFeats;
- TV->getFeatures(Feats);
- for (auto &Feature : Feats)
- if (Target->validateCpuSupports(Feature.str()))
- // Use '?' to mark features that came from TargetVersion.
- ResFeats.push_back("?" + Feature.str());
- return ResFeats;
+// Given a list of FMV features, return a concatenated list of the
+// corresponding backend features (which may contain duplicates).
+static std::vector<std::string> getFMVBackendFeaturesFor(
+ const llvm::SmallVectorImpl<StringRef> &FMVFeatStrings) {
+ std::vector<std::string> BackendFeats;
+ for (StringRef F : FMVFeatStrings) {
+ if (auto FMVExt = llvm::AArch64::parseArchExtension(F)) {
+ SmallVector<StringRef, 8> Feats;
+ FMVExt->DependentFeatures.split(Feats, ',', -1, false);
+ for (StringRef F : Feats)
+ BackendFeats.push_back(F.str());
+ }
+ }
+ return BackendFeats;
}
ParsedTargetAttr
@@ -13708,10 +13712,12 @@ void ASTContext::getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
// Make a copy of the features as passed on the command line into the
// beginning of the additional features from the function to override.
- ParsedAttr.Features.insert(
- ParsedAttr.Features.begin(),
- Target->getTargetOpts().FeaturesAsWritten.begin(),
- Target->getTargetOpts().FeaturesAsWritten.end());
+ // AArch64 handles command line option features in parseTargetAttr().
+ if (!Target->getTriple().isAArch64())
+ ParsedAttr.Features.insert(
+ ParsedAttr.Features.begin(),
+ Target->getTargetOpts().FeaturesAsWritten.begin(),
+ Target->getTargetOpts().FeaturesAsWritten.end());
if (ParsedAttr.CPU != "" && Target->isValidCPUName(ParsedAttr.CPU))
TargetCPU = ParsedAttr.CPU;
@@ -13732,32 +13738,31 @@ void ASTContext::getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
Target->getTargetOpts().FeaturesAsWritten.end());
Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
} else if (const auto *TC = FD->getAttr<TargetClonesAttr>()) {
- std::vector<std::string> Features;
if (Target->getTriple().isAArch64()) {
- // TargetClones for AArch64
llvm::SmallVector<StringRef, 8> Feats;
TC->getFeatures(Feats, GD.getMultiVersionIndex());
- for (StringRef Feat : Feats)
- if (Target->validateCpuSupports(Feat.str()))
- // Use '?' to mark features that came from AArch64 TargetClones.
- Features.push_back("?" + Feat.str());
+ std::vector<std::string> Features = getFMVBackendFeaturesFor(Feats);
Features.insert(Features.begin(),
Target->getTargetOpts().FeaturesAsWritten.begin(),
Target->getTargetOpts().FeaturesAsWritten.end());
+ Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
} else {
+ std::vector<std::string> Features;
StringRef VersionStr = TC->getFeatureStr(GD.getMultiVersionIndex());
if (VersionStr.starts_with("arch="))
TargetCPU = VersionStr.drop_front(sizeof("arch=") - 1);
else if (VersionStr != "default")
Features.push_back((StringRef{"+"} + VersionStr).str());
+ Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
}
- Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
} else if (const auto *TV = FD->getAttr<TargetVersionAttr>()) {
- std::vector<std::string> Feats = filterFunctionTargetVersionAttrs(TV);
- Feats.insert(Feats.begin(),
- Target->getTargetOpts().FeaturesAsWritten.begin(),
- Target->getTargetOpts().FeaturesAsWritten.end());
- Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Feats);
+ llvm::SmallVector<StringRef, 8> Feats;
+ TV->getFeatures(Feats);
+ std::vector<std::string> Features = getFMVBackendFeaturesFor(Feats);
+ Features.insert(Features.begin(),
+ Target->getTargetOpts().FeaturesAsWritten.begin(),
+ Target->getTargetOpts().FeaturesAsWritten.end());
+ Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
} else {
FeatureMap = Target->getTargetOpts().FeatureMap;
}
diff --git a/clang/lib/AST/ASTDumper.cpp b/clang/lib/AST/ASTDumper.cpp
index c8973fd..f060388 100644
--- a/clang/lib/AST/ASTDumper.cpp
+++ b/clang/lib/AST/ASTDumper.cpp
@@ -360,3 +360,37 @@ LLVM_DUMP_METHOD void ConceptReference::dump(raw_ostream &OS) const {
ASTDumper P(OS, Ctx, Ctx.getDiagnostics().getShowColors());
P.Visit(this);
}
+
+//===----------------------------------------------------------------------===//
+// TemplateName method implementations
+//===----------------------------------------------------------------------===//
+
+// FIXME: These are actually using the TemplateArgument dumper, through
+// an implicit conversion. The dump will claim this is a template argument,
+// which is misleading.
+
+LLVM_DUMP_METHOD void TemplateName::dump() const {
+ ASTDumper Dumper(llvm::errs(), /*ShowColors=*/false);
+ Dumper.Visit(*this);
+}
+
+LLVM_DUMP_METHOD void TemplateName::dump(llvm::raw_ostream &OS,
+ const ASTContext &Context) const {
+ ASTDumper Dumper(OS, Context, Context.getDiagnostics().getShowColors());
+ Dumper.Visit(*this);
+}
+
+//===----------------------------------------------------------------------===//
+// TemplateArgument method implementations
+//===----------------------------------------------------------------------===//
+
+LLVM_DUMP_METHOD void TemplateArgument::dump() const {
+ ASTDumper Dumper(llvm::errs(), /*ShowColors=*/false);
+ Dumper.Visit(*this);
+}
+
+LLVM_DUMP_METHOD void TemplateArgument::dump(llvm::raw_ostream &OS,
+ const ASTContext &Context) const {
+ ASTDumper Dumper(OS, Context, Context.getDiagnostics().getShowColors());
+ Dumper.Visit(*this);
+}
diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt
index a5d3dac..0328666 100644
--- a/clang/lib/AST/CMakeLists.txt
+++ b/clang/lib/AST/CMakeLists.txt
@@ -139,4 +139,6 @@ add_clang_library(clangAST
omp_gen
ClangDriverOptions
intrinsics_gen
+ # These generated headers are included transitively.
+ AArch64TargetParserTableGen
)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index f1aa19e..86fb396 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -15209,11 +15209,21 @@ bool ComplexExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) {
APFloat &ResI = Result.getComplexFloatImag();
if (LHSReal) {
assert(!RHSReal && "Cannot have two real operands for a complex op!");
- ResR = A * C;
- ResI = A * D;
+ ResR = A;
+ ResI = A;
+ // ResR = A * C;
+ // ResI = A * D;
+ if (!handleFloatFloatBinOp(Info, E, ResR, BO_Mul, C) ||
+ !handleFloatFloatBinOp(Info, E, ResI, BO_Mul, D))
+ return false;
} else if (RHSReal) {
- ResR = C * A;
- ResI = C * B;
+ // ResR = C * A;
+ // ResI = C * B;
+ ResR = C;
+ ResI = C;
+ if (!handleFloatFloatBinOp(Info, E, ResR, BO_Mul, A) ||
+ !handleFloatFloatBinOp(Info, E, ResI, BO_Mul, B))
+ return false;
} else {
// In the fully general case, we need to handle NaNs and infinities
// robustly.
@@ -15289,8 +15299,13 @@ bool ComplexExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) {
APFloat &ResR = Result.getComplexFloatReal();
APFloat &ResI = Result.getComplexFloatImag();
if (RHSReal) {
- ResR = A / C;
- ResI = B / C;
+ ResR = A;
+ ResI = B;
+ // ResR = A / C;
+ // ResI = B / C;
+ if (!handleFloatFloatBinOp(Info, E, ResR, BO_Div, C) ||
+ !handleFloatFloatBinOp(Info, E, ResI, BO_Div, C))
+ return false;
} else {
if (LHSReal) {
// No real optimizations we can do here, stub out with zero.
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index ff2b51e..6654a27 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -325,8 +325,11 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
assert(isPtrType(*FromT));
assert(isPtrType(*ToT));
- if (FromT == ToT)
+ if (FromT == ToT) {
+ if (SubExpr->getType()->isVoidPointerType())
+ return this->visit(SubExpr) && this->emitVoidPtrCast(CE);
return this->delegate(SubExpr);
+ }
if (!this->visit(SubExpr))
return false;
@@ -1834,6 +1837,9 @@ bool ByteCodeExprGen<Emitter>::VisitCompoundAssignOperator(
std::optional<PrimType> RT = classify(RHS->getType());
std::optional<PrimType> ResultT = classify(E->getType());
+ if (!Ctx.getLangOpts().CPlusPlus14)
+ return this->visit(RHS) && this->visit(LHS) && this->emitError(E);
+
if (!LT || !RT || !ResultT || !LHSComputationT)
return false;
@@ -3827,6 +3833,21 @@ bool ByteCodeExprGen<Emitter>::VisitComplexUnaryOperator(
// we sometimes have to do the lvalue-to-rvalue conversion here manually.
return this->emitArrayElemPop(classifyPrim(E->getType()), 1, E);
+ case UO_Not: // ~x
+ if (!this->visit(SubExpr))
+ return false;
+ // Negate the imaginary component.
+ if (!this->emitArrayElem(ElemT, 1, E))
+ return false;
+ if (!this->emitNeg(ElemT, E))
+ return false;
+ if (!this->emitInitElem(ElemT, 1, E))
+ return false;
+ return DiscardResult ? this->emitPopPtr(E) : true;
+
+ case UO_Extension:
+ return this->delegate(SubExpr);
+
default:
return this->emitInvalid(E);
}
diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp
index f6191d8..025b46b 100644
--- a/clang/lib/AST/Interp/EvalEmitter.cpp
+++ b/clang/lib/AST/Interp/EvalEmitter.cpp
@@ -40,7 +40,7 @@ void EvalEmitter::cleanup() { S.cleanup(); }
EvaluationResult EvalEmitter::interpretExpr(const Expr *E,
bool ConvertResultToRValue) {
S.setEvalLocation(E->getExprLoc());
- this->ConvertResultToRValue = ConvertResultToRValue;
+ this->ConvertResultToRValue = ConvertResultToRValue && !isa<ConstantExpr>(E);
this->CheckFullyInitialized = isa<ConstantExpr>(E);
EvalResult.setSource(E);
@@ -56,10 +56,14 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E,
EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD,
bool CheckFullyInitialized) {
this->CheckFullyInitialized = CheckFullyInitialized;
- this->ConvertResultToRValue =
- VD->getAnyInitializer() &&
- (VD->getAnyInitializer()->getType()->isAnyComplexType() ||
- VD->getAnyInitializer()->getType()->isVectorType());
+
+ if (const Expr *Init = VD->getAnyInitializer()) {
+ QualType T = VD->getType();
+ this->ConvertResultToRValue = !Init->isGLValue() && !T->isPointerType() &&
+ !T->isObjCObjectPointerType();
+ } else
+ this->ConvertResultToRValue = false;
+
EvalResult.setSource(VD);
if (!this->visitDecl(VD) && EvalResult.empty())
@@ -138,6 +142,10 @@ template <> bool EvalEmitter::emitRet<PT_Ptr>(const SourceInfo &Info) {
return true;
const Pointer &Ptr = S.Stk.pop<Pointer>();
+
+ if (CheckFullyInitialized && !EvalResult.checkFullyInitialized(S, Ptr))
+ return false;
+
// Implicitly convert lvalue to rvalue, if requested.
if (ConvertResultToRValue) {
if (std::optional<APValue> V = Ptr.toRValue(Ctx)) {
@@ -146,17 +154,7 @@ template <> bool EvalEmitter::emitRet<PT_Ptr>(const SourceInfo &Info) {
return false;
}
} else {
- if (CheckFullyInitialized) {
- if (!EvalResult.checkFullyInitialized(S, Ptr))
- return false;
-
- std::optional<APValue> RValueResult = Ptr.toRValue(Ctx);
- if (!RValueResult)
- return false;
- EvalResult.setValue(*RValueResult);
- } else {
- EvalResult.setValue(Ptr.toAPValue());
- }
+ EvalResult.setValue(Ptr.toAPValue());
}
return true;
diff --git a/clang/lib/AST/Interp/EvaluationResult.cpp b/clang/lib/AST/Interp/EvaluationResult.cpp
index 2997723..387e3dc 100644
--- a/clang/lib/AST/Interp/EvaluationResult.cpp
+++ b/clang/lib/AST/Interp/EvaluationResult.cpp
@@ -151,9 +151,12 @@ bool EvaluationResult::checkFullyInitialized(InterpState &S,
if (const Record *R = Ptr.getRecord())
return CheckFieldsInitialized(S, InitLoc, Ptr, R);
- const auto *CAT =
- cast<ConstantArrayType>(Ptr.getType()->getAsArrayTypeUnsafe());
- return CheckArrayInitialized(S, InitLoc, Ptr, CAT);
+
+ if (const auto *CAT = dyn_cast_if_present<ConstantArrayType>(
+ Ptr.getType()->getAsArrayTypeUnsafe()))
+ return CheckArrayInitialized(S, InitLoc, Ptr, CAT);
+
+ return true;
}
} // namespace interp
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index f63711d..0ad710c 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1937,6 +1937,9 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
bool CastPointerIntegral(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
+ if (Ptr.isDummy())
+ return false;
+
const SourceInfo &E = S.Current->getSource(OpPC);
S.CCEDiag(E, diag::note_constexpr_invalid_cast)
<< 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
@@ -1949,6 +1952,9 @@ static inline bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC,
uint32_t BitWidth) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
+ if (Ptr.isDummy())
+ return false;
+
const SourceInfo &E = S.Current->getSource(OpPC);
S.CCEDiag(E, diag::note_constexpr_invalid_cast)
<< 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
@@ -1962,6 +1968,9 @@ static inline bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC,
uint32_t BitWidth) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
+ if (Ptr.isDummy())
+ return false;
+
const SourceInfo &E = S.Current->getSource(OpPC);
S.CCEDiag(E, diag::note_constexpr_invalid_cast)
<< 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
@@ -1971,6 +1980,13 @@ static inline bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC,
return true;
}
+static inline bool VoidPtrCast(InterpState &S, CodePtr OpPC) {
+ const SourceInfo &E = S.Current->getSource(OpPC);
+ S.CCEDiag(E, diag::note_constexpr_invalid_cast)
+ << 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// Zero, Nullptr
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index a5ac820..ac5426c 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -665,6 +665,7 @@ def CastPointerIntegralAPS : Opcode {
let HasGroup = 0;
let Args = [ArgUint32];
}
+def VoidPtrCast : Opcode;
def DecayPtr : Opcode {
let Types = [PtrTypeClass, PtrTypeClass];
diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp
index 46f7b79..2e6839e 100644
--- a/clang/lib/AST/TemplateBase.cpp
+++ b/clang/lib/AST/TemplateBase.cpp
@@ -577,15 +577,6 @@ void TemplateArgument::print(const PrintingPolicy &Policy, raw_ostream &Out,
}
}
-void TemplateArgument::dump(raw_ostream &Out) const {
- LangOptions LO; // FIXME! see also TemplateName::dump().
- LO.CPlusPlus = true;
- LO.Bool = true;
- print(PrintingPolicy(LO), Out, /*IncludeType*/ true);
-}
-
-LLVM_DUMP_METHOD void TemplateArgument::dump() const { dump(llvm::errs()); }
-
//===----------------------------------------------------------------------===//
// TemplateArgumentLoc Implementation
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/TemplateName.cpp b/clang/lib/AST/TemplateName.cpp
index 4fc25cb..11544db 100644
--- a/clang/lib/AST/TemplateName.cpp
+++ b/clang/lib/AST/TemplateName.cpp
@@ -360,14 +360,3 @@ const StreamingDiagnostic &clang::operator<<(const StreamingDiagnostic &DB,
OS.flush();
return DB << NameStr;
}
-
-void TemplateName::dump(raw_ostream &OS) const {
- LangOptions LO; // FIXME!
- LO.CPlusPlus = true;
- LO.Bool = true;
- print(OS, PrintingPolicy(LO));
-}
-
-LLVM_DUMP_METHOD void TemplateName::dump() const {
- dump(llvm::errs());
-}
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 0db16d4..6fba5ff 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -286,7 +286,6 @@ void AArch64TargetInfo::getTargetDefinesARMV84A(const LangOptions &Opts,
void AArch64TargetInfo::getTargetDefinesARMV85A(const LangOptions &Opts,
MacroBuilder &Builder) const {
Builder.defineMacro("__ARM_FEATURE_FRINT", "1");
- Builder.defineMacro("__ARM_FEATURE_BTI", "1");
// Also include the Armv8.4 defines
getTargetDefinesARMV84A(Opts, Builder);
}
@@ -499,6 +498,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
if (HasPAuthLR)
Builder.defineMacro("__ARM_FEATURE_PAUTH_LR", "1");
+ if (HasBTI)
+ Builder.defineMacro("__ARM_FEATURE_BTI", "1");
+
if (HasUnalignedAccess)
Builder.defineMacro("__ARM_FEATURE_UNALIGNED", "1");
@@ -1050,57 +1052,18 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
return true;
}
-bool AArch64TargetInfo::initFeatureMap(
- llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
- const std::vector<std::string> &FeaturesVec) const {
- std::vector<std::string> UpdatedFeaturesVec;
- // Parse the CPU and add any implied features.
- std::optional<llvm::AArch64::CpuInfo> CpuInfo = llvm::AArch64::parseCpu(CPU);
- if (CpuInfo) {
- auto Exts = CpuInfo->getImpliedExtensions();
- std::vector<StringRef> CPUFeats;
- llvm::AArch64::getExtensionFeatures(Exts, CPUFeats);
- for (auto F : CPUFeats) {
- assert((F[0] == '+' || F[0] == '-') && "Expected +/- in target feature!");
- UpdatedFeaturesVec.push_back(F.str());
- }
- }
-
- // Process target and dependent features. This is done in two loops collecting
- // them into UpdatedFeaturesVec: first to add dependent '+'features, second to
- // add target '+/-'features that can later disable some of features added on
- // the first loop. Function Multi Versioning features begin with '?'.
- for (const auto &Feature : FeaturesVec)
- if (((Feature[0] == '?' || Feature[0] == '+')) &&
- AArch64TargetInfo::doesFeatureAffectCodeGen(Feature.substr(1))) {
- StringRef DepFeatures =
- AArch64TargetInfo::getFeatureDependencies(Feature.substr(1));
- SmallVector<StringRef, 1> AttrFeatures;
- DepFeatures.split(AttrFeatures, ",");
- for (auto F : AttrFeatures)
- UpdatedFeaturesVec.push_back(F.str());
- }
- for (const auto &Feature : FeaturesVec)
- if (Feature[0] != '?') {
- std::string UpdatedFeature = Feature;
- if (Feature[0] == '+') {
- std::optional<llvm::AArch64::ExtensionInfo> Extension =
- llvm::AArch64::parseArchExtension(Feature.substr(1));
- if (Extension)
- UpdatedFeature = Extension->Feature.str();
- }
- UpdatedFeaturesVec.push_back(UpdatedFeature);
- }
-
- return TargetInfo::initFeatureMap(Features, Diags, CPU, UpdatedFeaturesVec);
-}
-
// Parse AArch64 Target attributes, which are a comma separated list of:
// "arch=<arch>" - parsed to features as per -march=..
// "cpu=<cpu>" - parsed to features as per -mcpu=.., with CPU set to <cpu>
// "tune=<cpu>" - TuneCPU set to <cpu>
// "feature", "no-feature" - Add (or remove) feature.
// "+feature", "+nofeature" - Add (or remove) feature.
+//
+// A feature may correspond to an Extension (anything with a corresponding
+// AEK_), in which case an ExtensionSet is used to parse it and expand its
+// dependencies. Otherwise the feature is passed through (e.g. +v8.1a,
+// +outline-atomics, -fmv, etc). Features coming from the command line are
+// already parsed, therefore their dependencies do not need expansion.
ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
ParsedTargetAttr Ret;
if (Features == "default")
@@ -1110,23 +1073,26 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
bool FoundArch = false;
auto SplitAndAddFeatures = [](StringRef FeatString,
- std::vector<std::string> &Features) {
+ std::vector<std::string> &Features,
+ llvm::AArch64::ExtensionSet &FeatureBits) {
SmallVector<StringRef, 8> SplitFeatures;
FeatString.split(SplitFeatures, StringRef("+"), -1, false);
for (StringRef Feature : SplitFeatures) {
- StringRef FeatureName = llvm::AArch64::getArchExtFeature(Feature);
- if (!FeatureName.empty())
- Features.push_back(FeatureName.str());
+ if (FeatureBits.parseModifier(Feature, /* AllowNoDashForm = */ true))
+ continue;
+ // Pass through features that are not extensions, e.g. +v8.1a,
+ // +outline-atomics, -fmv, etc.
+ if (Feature.starts_with("no"))
+ Features.push_back("-" + Feature.drop_front(2).str());
else
- // Pushing the original feature string to give a sema error later on
- // when they get checked.
- if (Feature.starts_with("no"))
- Features.push_back("-" + Feature.drop_front(2).str());
- else
- Features.push_back("+" + Feature.str());
+ Features.push_back("+" + Feature.str());
}
};
+ llvm::AArch64::ExtensionSet FeatureBits;
+ // Reconstruct the bitset from the command line option features.
+ FeatureBits.reconstructFromParsedFeatures(getTargetOpts().FeaturesAsWritten);
+
for (auto &Feature : AttrFeatures) {
Feature = Feature.trim();
if (Feature.starts_with("fpmath="))
@@ -1149,9 +1115,9 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
// Ret.Features.
if (!AI)
continue;
- Ret.Features.push_back(AI->ArchFeature.str());
+ FeatureBits.addArchDefaults(*AI);
// Add any extra features, after the +
- SplitAndAddFeatures(Split.second, Ret.Features);
+ SplitAndAddFeatures(Split.second, Ret.Features, FeatureBits);
} else if (Feature.starts_with("cpu=")) {
if (!Ret.CPU.empty())
Ret.Duplicate = "cpu=";
@@ -1161,7 +1127,10 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
std::pair<StringRef, StringRef> Split =
Feature.split("=").second.trim().split("+");
Ret.CPU = Split.first;
- SplitAndAddFeatures(Split.second, Ret.Features);
+ if (auto CpuInfo = llvm::AArch64::parseCpu(Ret.CPU)) {
+ FeatureBits.addCPUDefaults(*CpuInfo);
+ SplitAndAddFeatures(Split.second, Ret.Features, FeatureBits);
+ }
}
} else if (Feature.starts_with("tune=")) {
if (!Ret.Tune.empty())
@@ -1169,25 +1138,19 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
else
Ret.Tune = Feature.split("=").second.trim();
} else if (Feature.starts_with("+")) {
- SplitAndAddFeatures(Feature, Ret.Features);
- } else if (Feature.starts_with("no-")) {
- StringRef FeatureName =
- llvm::AArch64::getArchExtFeature(Feature.split("-").second);
- if (!FeatureName.empty())
- Ret.Features.push_back("-" + FeatureName.drop_front(1).str());
- else
- Ret.Features.push_back("-" + Feature.split("-").second.str());
+ SplitAndAddFeatures(Feature, Ret.Features, FeatureBits);
} else {
- // Try parsing the string to the internal target feature name. If it is
- // invalid, add the original string (which could already be an internal
- // name). These should be checked later by isValidFeatureName.
- StringRef FeatureName = llvm::AArch64::getArchExtFeature(Feature);
- if (!FeatureName.empty())
- Ret.Features.push_back(FeatureName.str());
+ if (FeatureBits.parseModifier(Feature, /* AllowNoDashForm = */ true))
+ continue;
+ // Pass through features that are not extensions, e.g. +v8.1a,
+ // +outline-atomics, -fmv, etc.
+ if (Feature.starts_with("no-"))
+ Ret.Features.push_back("-" + Feature.drop_front(3).str());
else
Ret.Features.push_back("+" + Feature.str());
}
}
+ FeatureBits.toLLVMFeatureList(Ret.Features);
return Ret;
}
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 12fb502..696553e 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -107,10 +107,6 @@ public:
unsigned multiVersionSortPriority(StringRef Name) const override;
unsigned multiVersionFeatureCost() const override;
- bool
- initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
- StringRef CPU,
- const std::vector<std::string> &FeaturesVec) const override;
bool useFP16ConversionIntrinsics() const override {
return false;
}
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 681a475..11e2d54 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -5766,28 +5766,16 @@ void CGDebugInfo::EmitPseudoVariable(CGBuilderTy &Builder,
// it is loaded upon use, so we identify such pattern here.
if (llvm::LoadInst *Load = dyn_cast<llvm::LoadInst>(Value)) {
llvm::Value *Var = Load->getPointerOperand();
- if (llvm::Metadata *MDValue = llvm::ValueAsMetadata::getIfExists(Var)) {
- if (llvm::Value *DbgValue = llvm::MetadataAsValue::getIfExists(
- CGM.getLLVMContext(), MDValue)) {
- for (llvm::User *U : DbgValue->users()) {
- if (llvm::CallInst *DbgDeclare = dyn_cast<llvm::CallInst>(U)) {
- if (DbgDeclare->getCalledFunction()->getIntrinsicID() ==
- llvm::Intrinsic::dbg_declare &&
- DbgDeclare->getArgOperand(0) == DbgValue) {
- // There can be implicit type cast applied on a variable if it is
- // an opaque ptr, in this case its debug info may not match the
- // actual type of object being used as in the next instruction, so
- // we will need to emit a pseudo variable for type-casted value.
- llvm::DILocalVariable *MDNode = cast<llvm::DILocalVariable>(
- cast<llvm::MetadataAsValue>(DbgDeclare->getOperand(1))
- ->getMetadata());
- if (MDNode->getType() == Type)
- return;
- }
- }
- }
- }
- }
+ // There can be implicit type cast applied on a variable if it is an opaque
+ // ptr, in this case its debug info may not match the actual type of object
+ // being used as in the next instruction, so we will need to emit a pseudo
+ // variable for type-casted value.
+ auto DeclareTypeMatches = [&](auto *DbgDeclare) {
+ return DbgDeclare->getVariable()->getType() == Type;
+ };
+ if (any_of(llvm::findDbgDeclares(Var), DeclareTypeMatches) ||
+ any_of(llvm::findDVRDeclares(Var), DeclareTypeMatches))
+ return;
}
// Find the correct location to insert a sequence of instructions to
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 5b2039a..b2a5cee 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -513,15 +513,6 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
QualType elementType =
CGF.getContext().getAsArrayType(ArrayQTy)->getElementType();
-
- // DestPtr is an array*. Construct an elementType* by drilling
- // down a level.
- llvm::Value *zero = llvm::ConstantInt::get(CGF.SizeTy, 0);
- llvm::Value *indices[] = { zero, zero };
- llvm::Value *begin = Builder.CreateInBoundsGEP(DestPtr.getElementType(),
- DestPtr.emitRawPointer(CGF),
- indices, "arrayinit.begin");
-
CharUnits elementSize = CGF.getContext().getTypeSizeInChars(elementType);
CharUnits elementAlign =
DestPtr.getAlignment().alignmentOfArrayElement(elementSize);
@@ -562,6 +553,7 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
Address endOfInit = Address::invalid();
CodeGenFunction::CleanupDeactivationScope deactivation(CGF);
+ llvm::Value *begin = DestPtr.emitRawPointer(CGF);
if (dtorKind) {
CodeGenFunction::AllocaTrackerRAII allocaTracker(CGF);
// In principle we could tell the cleanup where we are more
@@ -585,19 +577,13 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
llvm::Value *one = llvm::ConstantInt::get(CGF.SizeTy, 1);
- // The 'current element to initialize'. The invariants on this
- // variable are complicated. Essentially, after each iteration of
- // the loop, it points to the last initialized element, except
- // that it points to the beginning of the array before any
- // elements have been initialized.
- llvm::Value *element = begin;
-
// Emit the explicit initializers.
for (uint64_t i = 0; i != NumInitElements; ++i) {
- // Advance to the next element.
+ llvm::Value *element = begin;
if (i > 0) {
- element = Builder.CreateInBoundsGEP(
- llvmElementType, element, one, "arrayinit.element");
+ element = Builder.CreateInBoundsGEP(llvmElementType, begin,
+ llvm::ConstantInt::get(CGF.SizeTy, i),
+ "arrayinit.element");
// Tell the cleanup that it needs to destroy up to this
// element. TODO: some of these stores can be trivially
@@ -624,9 +610,12 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
// do { *array++ = filler; } while (array != end);
// Advance to the start of the rest of the array.
+ llvm::Value *element = begin;
if (NumInitElements) {
element = Builder.CreateInBoundsGEP(
- llvmElementType, element, one, "arrayinit.start");
+ llvmElementType, element,
+ llvm::ConstantInt::get(CGF.SizeTy, NumInitElements),
+ "arrayinit.start");
if (endOfInit.isValid()) Builder.CreateStore(element, endOfInit);
}
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 5e6a3dd..55ba21a 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -313,7 +313,7 @@ void clang::CodeGen::CGHLSLRuntime::setHLSLEntryAttributes(
assert(ShaderAttr && "All entry functions must have a HLSLShaderAttr");
const StringRef ShaderAttrKindStr = "hlsl.shader";
Fn->addFnAttr(ShaderAttrKindStr,
- ShaderAttr->ConvertShaderTypeToStr(ShaderAttr->getType()));
+ llvm::Triple::getEnvironmentTypeName(ShaderAttr->getType()));
if (HLSLNumThreadsAttr *NumThreadsAttr = FD->getAttr<HLSLNumThreadsAttr>()) {
const StringRef NumThreadsKindStr = "hlsl.numthreads";
std::string NumThreadsStr =
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 41ac511..39222c0 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -414,7 +414,8 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
CGM.ErrorUnsupported(S, "OpenMP dispatch directive");
break;
case Stmt::OMPScopeDirectiveClass:
- llvm_unreachable("scope not supported with FE outlining");
+ CGM.ErrorUnsupported(S, "scope with FE outlining");
+ break;
case Stmt::OMPMaskedDirectiveClass:
EmitOMPMaskedDirective(cast<OMPMaskedDirective>(*S));
break;
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index be684ac..b073604 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -1257,6 +1257,11 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) {
}
return CurrentState.Indent;
}
+ if (Current.is(TT_TrailingReturnArrow) &&
+ Previous.isOneOf(tok::kw_noexcept, tok::kw_mutable, tok::kw_constexpr,
+ tok::kw_consteval, tok::kw_static, TT_AttributeSquare)) {
+ return ContinuationIndent;
+ }
if ((Current.isOneOf(tok::r_brace, tok::r_square) ||
(Current.is(tok::greater) && (Style.isProto() || Style.isTableGen()))) &&
State.Stack.size() > 1) {
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 80ea43d..580b987 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -1018,24 +1018,33 @@ void SemaCUDA::checkTargetOverload(FunctionDecl *NewFD,
// HD/global functions "exist" in some sense on both the host and device, so
// should have the same implementation on both sides.
if (NewTarget != OldTarget &&
- ((NewTarget == CUDAFunctionTarget::HostDevice &&
- !(getLangOpts().OffloadImplicitHostDeviceTemplates &&
- isImplicitHostDeviceFunction(NewFD) &&
- OldTarget == CUDAFunctionTarget::Device)) ||
- (OldTarget == CUDAFunctionTarget::HostDevice &&
- !(getLangOpts().OffloadImplicitHostDeviceTemplates &&
- isImplicitHostDeviceFunction(OldFD) &&
- NewTarget == CUDAFunctionTarget::Device)) ||
- (NewTarget == CUDAFunctionTarget::Global) ||
- (OldTarget == CUDAFunctionTarget::Global)) &&
!SemaRef.IsOverload(NewFD, OldFD, /* UseMemberUsingDeclRules = */ false,
/* ConsiderCudaAttrs = */ false)) {
- Diag(NewFD->getLocation(), diag::err_cuda_ovl_target)
- << llvm::to_underlying(NewTarget) << NewFD->getDeclName()
- << llvm::to_underlying(OldTarget) << OldFD;
- Diag(OldFD->getLocation(), diag::note_previous_declaration);
- NewFD->setInvalidDecl();
- break;
+ if ((NewTarget == CUDAFunctionTarget::HostDevice &&
+ !(getLangOpts().OffloadImplicitHostDeviceTemplates &&
+ isImplicitHostDeviceFunction(NewFD) &&
+ OldTarget == CUDAFunctionTarget::Device)) ||
+ (OldTarget == CUDAFunctionTarget::HostDevice &&
+ !(getLangOpts().OffloadImplicitHostDeviceTemplates &&
+ isImplicitHostDeviceFunction(OldFD) &&
+ NewTarget == CUDAFunctionTarget::Device)) ||
+ (NewTarget == CUDAFunctionTarget::Global) ||
+ (OldTarget == CUDAFunctionTarget::Global)) {
+ Diag(NewFD->getLocation(), diag::err_cuda_ovl_target)
+ << llvm::to_underlying(NewTarget) << NewFD->getDeclName()
+ << llvm::to_underlying(OldTarget) << OldFD;
+ Diag(OldFD->getLocation(), diag::note_previous_declaration);
+ NewFD->setInvalidDecl();
+ break;
+ }
+ if ((NewTarget == CUDAFunctionTarget::Host &&
+ OldTarget == CUDAFunctionTarget::Device) ||
+ (NewTarget == CUDAFunctionTarget::Device &&
+ OldTarget == CUDAFunctionTarget::Host)) {
+ Diag(NewFD->getLocation(), diag::warn_offload_incompatible_redeclare)
+ << llvm::to_underlying(NewTarget) << llvm::to_underlying(OldTarget);
+ Diag(OldFD->getLocation(), diag::note_previous_declaration);
+ }
}
}
}
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index e224e79..37f0df2 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -1806,6 +1806,7 @@ static unsigned getRecordDiagFromTagKind(TagTypeKind Tag) {
static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl,
Stmt *Body,
Sema::CheckConstexprKind Kind);
+static bool CheckConstexprMissingReturn(Sema &SemaRef, const FunctionDecl *Dcl);
// Check whether a function declaration satisfies the requirements of a
// constexpr function definition or a constexpr constructor definition. If so,
@@ -2411,20 +2412,9 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl,
}
} else {
if (ReturnStmts.empty()) {
- // C++1y doesn't require constexpr functions to contain a 'return'
- // statement. We still do, unless the return type might be void, because
- // otherwise if there's no return statement, the function cannot
- // be used in a core constant expression.
- bool OK = SemaRef.getLangOpts().CPlusPlus14 &&
- (Dcl->getReturnType()->isVoidType() ||
- Dcl->getReturnType()->isDependentType());
switch (Kind) {
case Sema::CheckConstexprKind::Diagnose:
- SemaRef.Diag(Dcl->getLocation(),
- OK ? diag::warn_cxx11_compat_constexpr_body_no_return
- : diag::err_constexpr_body_no_return)
- << Dcl->isConsteval();
- if (!OK)
+ if (!CheckConstexprMissingReturn(SemaRef, Dcl))
return false;
break;
@@ -2494,6 +2484,28 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl,
return true;
}
+static bool CheckConstexprMissingReturn(Sema &SemaRef,
+ const FunctionDecl *Dcl) {
+ bool IsVoidOrDependentType = Dcl->getReturnType()->isVoidType() ||
+ Dcl->getReturnType()->isDependentType();
+ // Skip emitting a missing return error diagnostic for non-void functions
+ // since C++23 no longer mandates constexpr functions to yield constant
+ // expressions.
+ if (SemaRef.getLangOpts().CPlusPlus23 && !IsVoidOrDependentType)
+ return true;
+
+ // C++14 doesn't require constexpr functions to contain a 'return'
+ // statement. We still do, unless the return type might be void, because
+ // otherwise if there's no return statement, the function cannot
+ // be used in a core constant expression.
+ bool OK = SemaRef.getLangOpts().CPlusPlus14 && IsVoidOrDependentType;
+ SemaRef.Diag(Dcl->getLocation(),
+ OK ? diag::warn_cxx11_compat_constexpr_body_no_return
+ : diag::err_constexpr_body_no_return)
+ << Dcl->isConsteval();
+ return OK;
+}
+
bool Sema::CheckImmediateEscalatingFunctionDefinition(
FunctionDecl *FD, const sema::FunctionScopeInfo *FSI) {
if (!getLangOpts().CPlusPlus20 || !FD->isImmediateEscalating())
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 0a2face..144cdcc 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -146,7 +146,7 @@ HLSLNumThreadsAttr *SemaHLSL::mergeNumThreadsAttr(Decl *D,
HLSLShaderAttr *
SemaHLSL::mergeShaderAttr(Decl *D, const AttributeCommonInfo &AL,
- HLSLShaderAttr::ShaderType ShaderType) {
+ llvm::Triple::EnvironmentType ShaderType) {
if (HLSLShaderAttr *NT = D->getAttr<HLSLShaderAttr>()) {
if (NT->getType() != ShaderType) {
Diag(NT->getLocation(), diag::err_hlsl_attribute_param_mismatch) << AL;
@@ -184,13 +184,12 @@ void SemaHLSL::ActOnTopLevelFunction(FunctionDecl *FD) {
if (FD->getName() != TargetInfo.getTargetOpts().HLSLEntry)
return;
- StringRef Env = TargetInfo.getTriple().getEnvironmentName();
- HLSLShaderAttr::ShaderType ShaderType;
- if (HLSLShaderAttr::ConvertStrToShaderType(Env, ShaderType)) {
+ llvm::Triple::EnvironmentType Env = TargetInfo.getTriple().getEnvironment();
+ if (HLSLShaderAttr::isValidShaderType(Env) && Env != llvm::Triple::Library) {
if (const auto *Shader = FD->getAttr<HLSLShaderAttr>()) {
// The entry point is already annotated - check that it matches the
// triple.
- if (Shader->getType() != ShaderType) {
+ if (Shader->getType() != Env) {
Diag(Shader->getLocation(), diag::err_hlsl_entry_shader_attr_mismatch)
<< Shader;
FD->setInvalidDecl();
@@ -198,11 +197,11 @@ void SemaHLSL::ActOnTopLevelFunction(FunctionDecl *FD) {
} else {
// Implicitly add the shader attribute if the entry function isn't
// explicitly annotated.
- FD->addAttr(HLSLShaderAttr::CreateImplicit(getASTContext(), ShaderType,
+ FD->addAttr(HLSLShaderAttr::CreateImplicit(getASTContext(), Env,
FD->getBeginLoc()));
}
} else {
- switch (TargetInfo.getTriple().getEnvironment()) {
+ switch (Env) {
case llvm::Triple::UnknownEnvironment:
case llvm::Triple::Library:
break;
@@ -215,38 +214,40 @@ void SemaHLSL::ActOnTopLevelFunction(FunctionDecl *FD) {
void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) {
const auto *ShaderAttr = FD->getAttr<HLSLShaderAttr>();
assert(ShaderAttr && "Entry point has no shader attribute");
- HLSLShaderAttr::ShaderType ST = ShaderAttr->getType();
+ llvm::Triple::EnvironmentType ST = ShaderAttr->getType();
switch (ST) {
- case HLSLShaderAttr::Pixel:
- case HLSLShaderAttr::Vertex:
- case HLSLShaderAttr::Geometry:
- case HLSLShaderAttr::Hull:
- case HLSLShaderAttr::Domain:
- case HLSLShaderAttr::RayGeneration:
- case HLSLShaderAttr::Intersection:
- case HLSLShaderAttr::AnyHit:
- case HLSLShaderAttr::ClosestHit:
- case HLSLShaderAttr::Miss:
- case HLSLShaderAttr::Callable:
+ case llvm::Triple::Pixel:
+ case llvm::Triple::Vertex:
+ case llvm::Triple::Geometry:
+ case llvm::Triple::Hull:
+ case llvm::Triple::Domain:
+ case llvm::Triple::RayGeneration:
+ case llvm::Triple::Intersection:
+ case llvm::Triple::AnyHit:
+ case llvm::Triple::ClosestHit:
+ case llvm::Triple::Miss:
+ case llvm::Triple::Callable:
if (const auto *NT = FD->getAttr<HLSLNumThreadsAttr>()) {
DiagnoseAttrStageMismatch(NT, ST,
- {HLSLShaderAttr::Compute,
- HLSLShaderAttr::Amplification,
- HLSLShaderAttr::Mesh});
+ {llvm::Triple::Compute,
+ llvm::Triple::Amplification,
+ llvm::Triple::Mesh});
FD->setInvalidDecl();
}
break;
- case HLSLShaderAttr::Compute:
- case HLSLShaderAttr::Amplification:
- case HLSLShaderAttr::Mesh:
+ case llvm::Triple::Compute:
+ case llvm::Triple::Amplification:
+ case llvm::Triple::Mesh:
if (!FD->hasAttr<HLSLNumThreadsAttr>()) {
Diag(FD->getLocation(), diag::err_hlsl_missing_numthreads)
- << HLSLShaderAttr::ConvertShaderTypeToStr(ST);
+ << llvm::Triple::getEnvironmentTypeName(ST);
FD->setInvalidDecl();
}
break;
+ default:
+ llvm_unreachable("Unhandled environment in triple");
}
for (ParmVarDecl *Param : FD->parameters()) {
@@ -268,14 +269,14 @@ void SemaHLSL::CheckSemanticAnnotation(
const HLSLAnnotationAttr *AnnotationAttr) {
auto *ShaderAttr = EntryPoint->getAttr<HLSLShaderAttr>();
assert(ShaderAttr && "Entry point has no shader attribute");
- HLSLShaderAttr::ShaderType ST = ShaderAttr->getType();
+ llvm::Triple::EnvironmentType ST = ShaderAttr->getType();
switch (AnnotationAttr->getKind()) {
case attr::HLSLSV_DispatchThreadID:
case attr::HLSLSV_GroupIndex:
- if (ST == HLSLShaderAttr::Compute)
+ if (ST == llvm::Triple::Compute)
return;
- DiagnoseAttrStageMismatch(AnnotationAttr, ST, {HLSLShaderAttr::Compute});
+ DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Compute});
break;
default:
llvm_unreachable("Unknown HLSLAnnotationAttr");
@@ -283,16 +284,16 @@ void SemaHLSL::CheckSemanticAnnotation(
}
void SemaHLSL::DiagnoseAttrStageMismatch(
- const Attr *A, HLSLShaderAttr::ShaderType Stage,
- std::initializer_list<HLSLShaderAttr::ShaderType> AllowedStages) {
+ const Attr *A, llvm::Triple::EnvironmentType Stage,
+ std::initializer_list<llvm::Triple::EnvironmentType> AllowedStages) {
SmallVector<StringRef, 8> StageStrings;
llvm::transform(AllowedStages, std::back_inserter(StageStrings),
- [](HLSLShaderAttr::ShaderType ST) {
+ [](llvm::Triple::EnvironmentType ST) {
return StringRef(
- HLSLShaderAttr::ConvertShaderTypeToStr(ST));
+ HLSLShaderAttr::ConvertEnvironmentTypeToStr(ST));
});
Diag(A->getLoc(), diag::err_hlsl_attr_unsupported_in_stage)
- << A << HLSLShaderAttr::ConvertShaderTypeToStr(Stage)
+ << A << llvm::Triple::getEnvironmentTypeName(Stage)
<< (AllowedStages.size() != 1) << join(StageStrings, ", ");
}
@@ -430,8 +431,8 @@ void SemaHLSL::handleShaderAttr(Decl *D, const ParsedAttr &AL) {
if (!SemaRef.checkStringLiteralArgumentAttr(AL, 0, Str, &ArgLoc))
return;
- HLSLShaderAttr::ShaderType ShaderType;
- if (!HLSLShaderAttr::ConvertStrToShaderType(Str, ShaderType)) {
+ llvm::Triple::EnvironmentType ShaderType;
+ if (!HLSLShaderAttr::ConvertStrToEnvironmentType(Str, ShaderType)) {
Diag(AL.getLoc(), diag::warn_attribute_type_not_supported)
<< AL << Str << ArgLoc;
return;
@@ -549,16 +550,22 @@ class DiagnoseHLSLAvailability
//
// Maps FunctionDecl to an unsigned number that represents the set of shader
// environments the function has been scanned for.
- // Since HLSLShaderAttr::ShaderType enum is generated from Attr.td and is
- // defined without any assigned values, it is guaranteed to be numbered
- // sequentially from 0 up and we can use it to 'index' individual bits
- // in the set.
+ // The llvm::Triple::EnvironmentType enum values for shader stages guaranteed
+ // to be numbered from llvm::Triple::Pixel to llvm::Triple::Amplification
+ // (verified by static_asserts in Triple.cpp), we can use it to index
+ // individual bits in the set, as long as we shift the values to start with 0
+ // by subtracting the value of llvm::Triple::Pixel first.
+ //
// The N'th bit in the set will be set if the function has been scanned
- // in shader environment whose ShaderType integer value equals N.
+ // in shader environment whose llvm::Triple::EnvironmentType integer value
+ // equals (llvm::Triple::Pixel + N).
+ //
// For example, if a function has been scanned in compute and pixel stage
- // environment, the value will be 0x21 (100001 binary) because
- // (int)HLSLShaderAttr::ShaderType::Pixel == 1 and
- // (int)HLSLShaderAttr::ShaderType::Compute == 5.
+ // environment, the value will be 0x21 (100001 binary) because:
+ //
+ // (int)(llvm::Triple::Pixel - llvm::Triple::Pixel) == 0
+ // (int)(llvm::Triple::Compute - llvm::Triple::Pixel) == 5
+ //
// A FunctionDecl is mapped to 0 (or not included in the map) if it has not
// been scanned in any environment.
llvm::DenseMap<const FunctionDecl *, unsigned> ScannedDecls;
@@ -574,12 +581,16 @@ class DiagnoseHLSLAvailability
bool ReportOnlyShaderStageIssues;
// Helper methods for dealing with current stage context / environment
- void SetShaderStageContext(HLSLShaderAttr::ShaderType ShaderType) {
+ void SetShaderStageContext(llvm::Triple::EnvironmentType ShaderType) {
static_assert(sizeof(unsigned) >= 4);
- assert((unsigned)ShaderType < 31); // 31 is reserved for "unknown"
-
- CurrentShaderEnvironment = HLSLShaderAttr::getTypeAsEnvironment(ShaderType);
- CurrentShaderStageBit = (1 << ShaderType);
+ assert(HLSLShaderAttr::isValidShaderType(ShaderType));
+ assert((unsigned)(ShaderType - llvm::Triple::Pixel) < 31 &&
+ "ShaderType is too big for this bitmap"); // 31 is reserved for
+ // "unknown"
+
+ unsigned bitmapIndex = ShaderType - llvm::Triple::Pixel;
+ CurrentShaderEnvironment = ShaderType;
+ CurrentShaderStageBit = (1 << bitmapIndex);
}
void SetUnknownShaderStageContext() {
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 6e68153..5c759ae 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -6198,18 +6198,17 @@ public:
// unless the assume-no-nested-parallelism flag has been specified.
// OpenMP API runtime library calls do not inhibit parallel loop
// translation, regardless of the assume-no-nested-parallelism.
- if (C) {
- bool IsOpenMPAPI = false;
- auto *FD = dyn_cast_or_null<FunctionDecl>(C->getCalleeDecl());
- if (FD) {
- std::string Name = FD->getNameInfo().getAsString();
- IsOpenMPAPI = Name.find("omp_") == 0;
- }
- TeamsLoopCanBeParallelFor =
- IsOpenMPAPI || SemaRef.getLangOpts().OpenMPNoNestedParallelism;
- if (!TeamsLoopCanBeParallelFor)
- return;
- }
+ bool IsOpenMPAPI = false;
+ auto *FD = dyn_cast_or_null<FunctionDecl>(C->getCalleeDecl());
+ if (FD) {
+ std::string Name = FD->getNameInfo().getAsString();
+ IsOpenMPAPI = Name.find("omp_") == 0;
+ }
+ TeamsLoopCanBeParallelFor =
+ IsOpenMPAPI || SemaRef.getLangOpts().OpenMPNoNestedParallelism;
+ if (!TeamsLoopCanBeParallelFor)
+ return;
+
for (const Stmt *Child : C->children())
if (Child)
Visit(Child);
@@ -24331,7 +24330,7 @@ SemaOpenMP::ActOnOpenMPHasDeviceAddrClause(ArrayRef<Expr *> VarList,
OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause(
Expr *Allocator, ArrayRef<Expr *> VarList, SourceLocation StartLoc,
- SourceLocation ColonLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
+ SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc) {
if (Allocator) {
// OpenMP [2.11.4 allocate Clause, Description]
// allocator is an expression of omp_allocator_handle_t type.
diff --git a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
index 2438cf3..b735341 100644
--- a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
@@ -17,6 +17,7 @@
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/CheckerManager.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
#include "llvm/ADT/StringRef.h"
using namespace clang;
@@ -26,16 +27,88 @@ namespace {
class PointerSubChecker
: public Checker< check::PreStmt<BinaryOperator> > {
const BugType BT{this, "Pointer subtraction"};
+ const llvm::StringLiteral Msg_MemRegionDifferent =
+ "Subtraction of two pointers that do not point into the same array "
+ "is undefined behavior.";
+ const llvm::StringLiteral Msg_LargeArrayIndex =
+ "Using an array index greater than the array size at pointer subtraction "
+ "is undefined behavior.";
+ const llvm::StringLiteral Msg_NegativeArrayIndex =
+ "Using a negative array index at pointer subtraction "
+ "is undefined behavior.";
+ const llvm::StringLiteral Msg_BadVarIndex =
+ "Indexing the address of a variable with other than 1 at this place "
+ "is undefined behavior.";
+
+ bool checkArrayBounds(CheckerContext &C, const Expr *E,
+ const ElementRegion *ElemReg,
+ const MemRegion *Reg) const;
+ void reportBug(CheckerContext &C, const Expr *E,
+ const llvm::StringLiteral &Msg) const;
public:
void checkPreStmt(const BinaryOperator *B, CheckerContext &C) const;
};
}
+bool PointerSubChecker::checkArrayBounds(CheckerContext &C, const Expr *E,
+ const ElementRegion *ElemReg,
+ const MemRegion *Reg) const {
+ if (!ElemReg)
+ return true;
+
+ ProgramStateRef State = C.getState();
+ const MemRegion *SuperReg = ElemReg->getSuperRegion();
+ SValBuilder &SVB = C.getSValBuilder();
+
+ if (SuperReg == Reg) {
+ if (const llvm::APSInt *I = SVB.getKnownValue(State, ElemReg->getIndex());
+ I && (!I->isOne() && !I->isZero()))
+ reportBug(C, E, Msg_BadVarIndex);
+ return false;
+ }
+
+ DefinedOrUnknownSVal ElemCount =
+ getDynamicElementCount(State, SuperReg, SVB, ElemReg->getElementType());
+ auto IndexTooLarge = SVB.evalBinOp(C.getState(), BO_GT, ElemReg->getIndex(),
+ ElemCount, SVB.getConditionType())
+ .getAs<DefinedOrUnknownSVal>();
+ if (IndexTooLarge) {
+ ProgramStateRef S1, S2;
+ std::tie(S1, S2) = C.getState()->assume(*IndexTooLarge);
+ if (S1 && !S2) {
+ reportBug(C, E, Msg_LargeArrayIndex);
+ return false;
+ }
+ }
+ auto IndexTooSmall = SVB.evalBinOp(State, BO_LT, ElemReg->getIndex(),
+ SVB.makeZeroVal(SVB.getArrayIndexType()),
+ SVB.getConditionType())
+ .getAs<DefinedOrUnknownSVal>();
+ if (IndexTooSmall) {
+ ProgramStateRef S1, S2;
+ std::tie(S1, S2) = State->assume(*IndexTooSmall);
+ if (S1 && !S2) {
+ reportBug(C, E, Msg_NegativeArrayIndex);
+ return false;
+ }
+ }
+ return true;
+}
+
+void PointerSubChecker::reportBug(CheckerContext &C, const Expr *E,
+ const llvm::StringLiteral &Msg) const {
+ if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
+ R->addRange(E->getSourceRange());
+ C.emitReport(std::move(R));
+ }
+}
+
void PointerSubChecker::checkPreStmt(const BinaryOperator *B,
CheckerContext &C) const {
// When doing pointer subtraction, if the two pointers do not point to the
- // same memory chunk, emit a warning.
+ // same array, emit a warning.
if (B->getOpcode() != BO_Sub)
return;
@@ -44,28 +117,36 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B,
const MemRegion *LR = LV.getAsRegion();
const MemRegion *RR = RV.getAsRegion();
-
- if (!(LR && RR))
+ if (!LR || !RR)
return;
- const MemRegion *BaseLR = LR->getBaseRegion();
- const MemRegion *BaseRR = RR->getBaseRegion();
+ // Allow subtraction of identical pointers.
+ if (LR == RR)
+ return;
- if (BaseLR == BaseRR)
+ // No warning if one operand is unknown.
+ if (isa<SymbolicRegion>(LR) || isa<SymbolicRegion>(RR))
return;
- // Allow arithmetic on different symbolic regions.
- if (isa<SymbolicRegion>(BaseLR) || isa<SymbolicRegion>(BaseRR))
+ const auto *ElemLR = dyn_cast<ElementRegion>(LR);
+ const auto *ElemRR = dyn_cast<ElementRegion>(RR);
+
+ if (!checkArrayBounds(C, B->getLHS(), ElemLR, RR))
+ return;
+ if (!checkArrayBounds(C, B->getRHS(), ElemRR, LR))
return;
- if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
- constexpr llvm::StringLiteral Msg =
- "Subtraction of two pointers that do not point to the same memory "
- "chunk may cause incorrect result.";
- auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
- R->addRange(B->getSourceRange());
- C.emitReport(std::move(R));
+ if (ElemLR && ElemRR) {
+ const MemRegion *SuperLR = ElemLR->getSuperRegion();
+ const MemRegion *SuperRR = ElemRR->getSuperRegion();
+ if (SuperLR == SuperRR)
+ return;
+ // Allow arithmetic on different symbolic regions.
+ if (isa<SymbolicRegion>(SuperLR) || isa<SymbolicRegion>(SuperRR))
+ return;
}
+
+ reportBug(C, B, Msg_MemRegionDifferent);
}
void ento::registerPointerSubChecker(CheckerManager &mgr) {
diff --git a/clang/test/AST/Interp/builtin-align-cxx.cpp b/clang/test/AST/Interp/builtin-align-cxx.cpp
index c410395..a1edf30 100644
--- a/clang/test/AST/Interp/builtin-align-cxx.cpp
+++ b/clang/test/AST/Interp/builtin-align-cxx.cpp
@@ -202,8 +202,7 @@ static_assert(__builtin_align_down(&align32array[7], 4) == &align32array[4], "")
static_assert(__builtin_align_down(&align32array[8], 4) == &align32array[8], "");
// Achieving the same thing using casts to uintptr_t is not allowed:
-static_assert((char *)((__UINTPTR_TYPE__)&align32array[7] & ~3) == &align32array[4], ""); // both-error{{not an integral constant expression}} \
- // expected-note {{cast that performs the conversions of a reinterpret_cast is not allowed in a constant expression}}
+static_assert((char *)((__UINTPTR_TYPE__)&align32array[7] & ~3) == &align32array[4], ""); // both-error{{not an integral constant expression}}
static_assert(__builtin_align_down(&align32array[1], 4) == &align32array[0], "");
static_assert(__builtin_align_down(&align32array[1], 64) == &align32array[0], ""); // both-error{{not an integral constant expression}}
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index f4c7bf1..1cc450e 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -66,12 +66,10 @@ _Static_assert((&a - 100) != 0, ""); // pedantic-ref-warning {{is a GNU extensio
// pedantic-ref-note {{-100 of non-array}} \
// pedantic-expected-note {{-100 of non-array}}
/// extern variable of a composite type.
-/// FIXME: The 'this conversion is not allowed' note is missing in the new interpreter.
extern struct Test50S Test50;
_Static_assert(&Test50 != (void*)0, ""); // all-warning {{always true}} \
- // pedantic-ref-warning {{is a GNU extension}} \
- // pedantic-ref-note {{this conversion is not allowed in a constant expression}} \
- // pedantic-expected-warning {{is a GNU extension}}
+ // pedantic-warning {{is a GNU extension}} \
+ // pedantic-note {{this conversion is not allowed in a constant expression}}
struct y {int x,y;};
int a2[(intptr_t)&((struct y*)0)->y]; // all-warning {{folded to constant array}}
diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp
index 09cb620..003f33e 100644
--- a/clang/test/AST/Interp/complex.cpp
+++ b/clang/test/AST/Interp/complex.cpp
@@ -115,6 +115,11 @@ static_assert(__imag(Doubles[2]) == 0.0, "");
static_assert(__real(Doubles[3]) == 0.0, "");
static_assert(__imag(Doubles[3]) == 0.0, "");
+static_assert(~(0.5 + 1.5j) == (0.5 + -1.5j), "");
+
+static_assert(__extension__ __imag(A) == 0, "");
+static_assert(__imag(__extension__ A) == 0, "");
+
void func(void) {
__complex__ int arr;
_Complex int result;
diff --git a/clang/test/AST/Interp/const-eval.c b/clang/test/AST/Interp/const-eval.c
index 72c0833..eab14c0 100644
--- a/clang/test/AST/Interp/const-eval.c
+++ b/clang/test/AST/Interp/const-eval.c
@@ -140,15 +140,8 @@ EVAL_EXPR(47, &x < &x + 1 ? 1 : -1)
EVAL_EXPR(48, &x != &x - 1 ? 1 : -1)
EVAL_EXPR(49, &x < &x - 100 ? 1 : -1) // ref-error {{not an integer constant expression}}
-/// FIXME: Rejecting this is correct, BUT when converting the innermost pointer
-/// to an integer, we do not preserve the information where it came from. So when we later
-/// create a pointer from it, it also doesn't have that information, which means
-/// hasSameBase() for those two pointers will return false. And in those cases, we emit
-/// the diagnostic:
-/// comparison between '&Test50' and '&(631578)' has unspecified value
extern struct Test50S Test50;
-EVAL_EXPR(50, &Test50 < (struct Test50S*)((unsigned long)&Test50 + 10)) // both-error {{not an integer constant expression}} \
- // expected-note {{comparison between}}
+EVAL_EXPR(50, &Test50 < (struct Test50S*)((unsigned long)&Test50 + 10)) // both-error {{not an integer constant expression}}
EVAL_EXPR(51, 0 != (float)1e99)
diff --git a/clang/test/Analysis/casts.c b/clang/test/Analysis/casts.c
index 30cd74b..7dad4ed 100644
--- a/clang/test/Analysis/casts.c
+++ b/clang/test/Analysis/casts.c
@@ -138,7 +138,9 @@ void multiDimensionalArrayPointerCasts(void) {
clang_analyzer_eval(y1 == y2); // expected-warning{{TRUE}}
// FIXME: should be FALSE (i.e. equal pointers).
+ // FIXME: pointer subtraction warning might be incorrect
clang_analyzer_eval(y1 - y2); // expected-warning{{UNKNOWN}}
+ // expected-warning@-1{{Subtraction of two pointers that do not point into the same array is undefined behavior}}
// FIXME: should be TRUE (i.e. same symbol).
clang_analyzer_eval(*y1 == *y2); // expected-warning{{UNKNOWN}}
@@ -147,7 +149,9 @@ void multiDimensionalArrayPointerCasts(void) {
clang_analyzer_eval(y1 == y3); // expected-warning{{TRUE}}
// FIXME: should be FALSE (i.e. equal pointers).
+ // FIXME: pointer subtraction warning might be incorrect
clang_analyzer_eval(y1 - y3); // expected-warning{{UNKNOWN}}
+ // expected-warning@-1{{Subtraction of two pointers that do not point into the same array is undefined behavior}}
// FIXME: should be TRUE (i.e. same symbol).
clang_analyzer_eval(*y1 == *y3); // expected-warning{{UNKNOWN}}
diff --git a/clang/test/Analysis/pointer-sub.c b/clang/test/Analysis/pointer-sub.c
new file mode 100644
index 0000000..9a44654
--- /dev/null
+++ b/clang/test/Analysis/pointer-sub.c
@@ -0,0 +1,120 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.PointerSub -verify %s
+
+void f1(void) {
+ int x, y, z[10];
+ int d = &y - &x; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}}
+ d = z - &y; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}}
+ d = &x - &x; // no-warning (subtraction of any two identical pointers is allowed)
+ d = (long *)&x - (long *)&x;
+ d = (&x + 1) - &x; // no-warning ('&x' is like a single-element array)
+ d = &x - (&x + 1); // no-warning
+ d = (&x + 0) - &x; // no-warning
+ d = (&x - 1) - &x; // expected-warning{{Indexing the address of a variable with other than 1 at this place is undefined behavior}}
+ d = (&x + 2) - &x; // expected-warning{{Indexing the address of a variable with other than 1 at this place is undefined behavior}}
+
+ d = (z + 9) - z; // no-warning (pointers to same array)
+ d = (z + 10) - z; // no-warning (pointer to "one after the end")
+ d = (z + 11) - z; // expected-warning{{Using an array index greater than the array size at pointer subtraction is undefined behavior}}
+ d = (z - 1) - z; // expected-warning{{Using a negative array index at pointer subtraction is undefined behavior}}
+}
+
+void f2(void) {
+ int a[10], b[10], c;
+ int *p = &a[2];
+ int *q = &a[8];
+ int d = q - p; // no-warning (pointers into the same array)
+
+ q = &b[3];
+ d = q - p; // expected-warning{{Subtraction of two pointers that}}
+
+ q = a + 10;
+ d = q - p; // no warning (use of pointer to one after the end is allowed)
+ q = a + 11;
+ d = q - a; // expected-warning{{Using an array index greater than the array size at pointer subtraction is undefined behavior}}
+
+ d = &a[4] - a; // no-warning
+ d = &a[2] - p; // no-warning
+ d = &c - p; // expected-warning{{Subtraction of two pointers that}}
+
+ d = (int *)((char *)(&a[4]) + sizeof(int)) - &a[4]; // no-warning (pointers into the same array data)
+ d = (int *)((char *)(&a[4]) + 1) - &a[4]; // expected-warning{{Subtraction of two pointers that}}
+}
+
+void f3(void) {
+ int a[3][4];
+ int d;
+
+ d = &(a[2]) - &(a[1]);
+ d = a[2] - a[1]; // expected-warning{{Subtraction of two pointers that}}
+ d = a[1] - a[1];
+ d = &(a[1][2]) - &(a[1][0]);
+ d = &(a[1][2]) - &(a[0][0]); // expected-warning{{Subtraction of two pointers that}}
+
+ d = (int *)((char *)(&a[2][2]) + sizeof(int)) - &a[2][2]; // expected-warning{{Subtraction of two pointers that}}
+ d = (int *)((char *)(&a[2][2]) + 1) - &a[2][2]; // expected-warning{{Subtraction of two pointers that}}
+ d = (int (*)[4])((char *)&a[2] + sizeof(int (*)[4])) - &a[2]; // expected-warning{{Subtraction of two pointers that}}
+ d = (int (*)[4])((char *)&a[2] + 1) - &a[2]; // expected-warning{{Subtraction of two pointers that}}
+}
+
+void f4(void) {
+ int n = 4, m = 3;
+ int a[n][m];
+ int (*p)[m] = a; // p == &a[0]
+ p += 1; // p == &a[1]
+
+ // FIXME: This is a known problem with -Wpointer-arith (https://github.com/llvm/llvm-project/issues/28328)
+ int d = p - a; // d == 1 // expected-warning{{subtraction of pointers to type 'int[m]' of zero size has undefined behavior}}
+
+ // FIXME: This is a known problem with -Wpointer-arith (https://github.com/llvm/llvm-project/issues/28328)
+ d = &(a[2]) - &(a[1]); // expected-warning{{subtraction of pointers to type 'int[m]' of zero size has undefined behavior}}
+
+ d = a[2] - a[1]; // expected-warning{{Subtraction of two pointers that}}
+}
+
+typedef struct {
+ int a;
+ int b;
+ int c[10];
+ int d[10];
+} S;
+
+void f5(void) {
+ S s;
+ int y;
+ int d;
+
+ d = &s.b - &s.a; // expected-warning{{Subtraction of two pointers that}}
+ d = &s.c[0] - &s.a; // expected-warning{{Subtraction of two pointers that}}
+ d = &s.b - &y; // expected-warning{{Subtraction of two pointers that}}
+ d = &s.c[3] - &s.c[2];
+ d = &s.d[3] - &s.c[2]; // expected-warning{{Subtraction of two pointers that}}
+ d = s.d - s.c; // expected-warning{{Subtraction of two pointers that}}
+
+ S sa[10];
+ d = &sa[2] - &sa[1];
+ d = &sa[2].a - &sa[1].b; // expected-warning{{Subtraction of two pointers that}}
+}
+
+void f6(void) {
+ long long l;
+ char *a1 = (char *)&l;
+ int d = a1[3] - l;
+
+ long long la1[3];
+ long long la2[3];
+ char *pla1 = (char *)la1;
+ char *pla2 = (char *)la2;
+ d = pla1[1] - pla1[0];
+ d = (long long *)&pla1[1] - &l; // expected-warning{{Subtraction of two pointers that}}
+ d = &pla2[3] - &pla1[3]; // expected-warning{{Subtraction of two pointers that}}
+}
+
+void f7(int *p) {
+ int a[10];
+ int d = &a[10] - p; // no-warning ('p' is unknown, even if it cannot point into 'a')
+}
+
+void f8(int n) {
+ int a[10];
+ int d = a[n] - a[0];
+}
diff --git a/clang/test/Analysis/ptr-arith.c b/clang/test/Analysis/ptr-arith.c
index 40c81887..f99dfab 100644
--- a/clang/test/Analysis/ptr-arith.c
+++ b/clang/test/Analysis/ptr-arith.c
@@ -1,5 +1,5 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.FixedAddr,alpha.core.PointerArithm,alpha.core.PointerSub,debug.ExprInspection -Wno-pointer-to-int-cast -verify -triple x86_64-apple-darwin9 -Wno-tautological-pointer-compare -analyzer-config eagerly-assume=false %s
-// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.FixedAddr,alpha.core.PointerArithm,alpha.core.PointerSub,debug.ExprInspection -Wno-pointer-to-int-cast -verify -triple i686-apple-darwin9 -Wno-tautological-pointer-compare -analyzer-config eagerly-assume=false %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.FixedAddr,alpha.core.PointerArithm,debug.ExprInspection -Wno-pointer-to-int-cast -verify -triple x86_64-apple-darwin9 -Wno-tautological-pointer-compare -analyzer-config eagerly-assume=false %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.core.FixedAddr,alpha.core.PointerArithm,debug.ExprInspection -Wno-pointer-to-int-cast -verify -triple i686-apple-darwin9 -Wno-tautological-pointer-compare -analyzer-config eagerly-assume=false %s
void clang_analyzer_eval(int);
void clang_analyzer_dump(int);
@@ -35,16 +35,6 @@ domain_port (const char *domain_b, const char *domain_e,
return port;
}
-void f3(void) {
- int x, y;
- int d = &y - &x; // expected-warning{{Subtraction of two pointers that do not point to the same memory chunk may cause incorrect result}}
-
- int a[10];
- int *p = &a[2];
- int *q = &a[8];
- d = q-p; // no-warning
-}
-
void f4(void) {
int *p;
p = (int*) 0x10000; // expected-warning{{Using a fixed address is not portable because that address will probably not be valid in all environments or platforms}}
diff --git a/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p3.cpp b/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p3.cpp
index 4416c82..51990ee 100644
--- a/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p3.cpp
+++ b/clang/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p3.cpp
@@ -212,7 +212,7 @@ constexpr int ClassDecl3() {
return 0;
}
-constexpr int NoReturn() {} // expected-error {{no return statement in constexpr function}}
+constexpr int NoReturn() {} // beforecxx23-error {{no return statement in constexpr function}}
constexpr int MultiReturn() {
return 0; // beforecxx14-note {{return statement}}
return 0; // beforecxx14-warning {{multiple return statements in constexpr function}}
diff --git a/clang/test/CodeGen/aarch64-cpu-supports-target.c b/clang/test/CodeGen/aarch64-cpu-supports-target.c
index e023944..28187bcf 100644
--- a/clang/test/CodeGen/aarch64-cpu-supports-target.c
+++ b/clang/test/CodeGen/aarch64-cpu-supports-target.c
@@ -48,5 +48,5 @@ int test_versions() {
return code();
}
// CHECK: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon" }
-// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
+// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
index af8933d..9885ac4 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 \
// RUN: -disable-O0-optnone -Werror -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg \
// RUN: | opt -S -passes=inline \
diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c
index 3e7a209..644e6a6 100644
--- a/clang/test/CodeGen/aarch64-targetattr.c
+++ b/clang/test/CodeGen/aarch64-targetattr.c
@@ -58,58 +58,50 @@ void v1msve() {}
// CHECK-LABEL: @plussve() #12
__attribute__((target("+sve")))
void plussve() {}
-// CHECK-LABEL: @plussveplussve2() #13
+// CHECK-LABEL: @plussveplussve2() #12
__attribute__((target("+sve+nosve2")))
void plussveplussve2() {}
-// CHECK-LABEL: @plussveminusnosve2() #13
+// CHECK-LABEL: @plussveminusnosve2() #12
__attribute__((target("sve,no-sve2")))
void plussveminusnosve2() {}
-// CHECK-LABEL: @plusfp16() #14
+// CHECK-LABEL: @plusfp16() #13
__attribute__((target("+fp16")))
void plusfp16() {}
-// CHECK-LABEL: @all() #15
+// CHECK-LABEL: @all() #14
__attribute__((target("cpu=neoverse-n1,tune=cortex-a710,arch=armv8.6-a+sve2")))
void all() {}
-// CHECK-LABEL: @allplusbranchprotection() #16
+// CHECK-LABEL: @allplusbranchprotection() #15
__attribute__((target("cpu=neoverse-n1,tune=cortex-a710,arch=armv8.6-a+sve2,branch-protection=standard")))
void allplusbranchprotection() {}
-// These tests check that the user facing and internal llvm name are both accepted.
-// CHECK-LABEL: @plusnoneon() #17
-__attribute__((target("+noneon")))
-void plusnoneon() {}
-// CHECK-LABEL: @plusnosimd() #17
+// CHECK-LABEL: @plusnosimd() #16
__attribute__((target("+nosimd")))
void plusnosimd() {}
-// CHECK-LABEL: @noneon() #17
-__attribute__((target("no-neon")))
-void noneon() {}
-// CHECK-LABEL: @nosimd() #17
+// CHECK-LABEL: @nosimd() #16
__attribute__((target("no-simd")))
void nosimd() {}
// This isn't part of the standard interface, but test that -arch features should not apply anything else.
-// CHECK-LABEL: @minusarch() #18
+// CHECK-LABEL: @minusarch() #17
__attribute__((target("no-v9.3a")))
void minusarch() {}
// CHECK: attributes #0 = { {{.*}} "target-features"="+crc,+fp-armv8,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" }
// CHECK: attributes #1 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" }
// CHECK: attributes #2 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" }
-// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" }
-// CHECK: attributes #4 = { {{.*}} "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm" }
+// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" }
+// CHECK: attributes #4 = { {{.*}} "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" }
// CHECK: attributes #5 = { {{.*}} "tune-cpu"="cortex-a710" }
// CHECK: attributes #6 = { {{.*}} "target-cpu"="generic" }
// CHECK: attributes #7 = { {{.*}} "tune-cpu"="generic" }
-// CHECK: attributes #8 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs" "tune-cpu"="cortex-a710" }
-// CHECK: attributes #9 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" "tune-cpu"="cortex-a710" }
-// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2" }
-// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,-sve" }
-// CHECK: attributes #12 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
-// CHECK: attributes #13 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-sve2" }
-// CHECK: attributes #14 = { {{.*}} "target-features"="+fullfp16" }
-// CHECK: attributes #15 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" }
-// CHECK: attributes #16 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" }
-// CHECK: attributes #17 = { {{.*}} "target-features"="-neon" }
-// CHECK: attributes #18 = { {{.*}} "target-features"="-v9.3a" }
+// CHECK: attributes #8 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+v8.1a,+v8.2a,+v8a" "tune-cpu"="cortex-a710" }
+// CHECK: attributes #9 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+sve" "tune-cpu"="cortex-a710" }
+// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
+// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,-sve" }
+// CHECK: attributes #12 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+sve" }
+// CHECK: attributes #13 = { {{.*}} "target-features"="+fp-armv8,+fullfp16" }
+// CHECK: attributes #14 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" }
+// CHECK: attributes #15 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" }
+// CHECK-NOT: attributes #16 = {{.*}} "target-features"
+// CHECK: attributes #17 = { {{.*}} "target-features"="-v9.3a" }
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index 35977113..75f8734 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -1129,42 +1129,42 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
// CHECK-NOFMV-NEXT: ret i32 0
//
//.
-// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp16fml,+fullfp16,+neon,+rand,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp-armv8,+fp16fml,+fullfp16,+neon,+rand,-v9.5a" }
// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+altnzcv,+bf16,+flagm,+sme,+sme-i16i64,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+sha2,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+ls64,+neon,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp16fml,+fullfp16,+neon,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+sha2,-v9.5a" }
+// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+ls64,+neon,-v9.5a" }
+// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fp16fml,+fullfp16,+neon,-v9.5a" }
+// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,-v9.5a" }
// CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,-fp-armv8,-v9.5a" }
// CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,-fp-armv8,-v9.5a" }
// CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme2,-fp-armv8,-v9.5a" }
// CHECK: attributes #[[ATTR9:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" }
// CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,-fp-armv8,-v9.5a" }
// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,-v9.5a" }
// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb,-fp-armv8,-v9.5a" }
// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+neon,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon,-v9.5a" }
+// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fullfp16,+neon,+rdm,+sme,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" }
+// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" }
+// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" }
+// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" }
+// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
+// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+neon,+rdm,+sme,-v9.5a" }
+// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-v9.5a" }
+// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
// CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+jsconv,+neon,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+fp-armv8,+jsconv,+neon,-v9.5a" }
// CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
+// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-v9.5a" }
+// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-v9.5a" }
+// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-v9.5a" }
// CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+mte,+rcpc,+rcpc3,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+sm4,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+rdm,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4,-v9.5a" }
+// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+rdm,-v9.5a" }
//.
// CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
// CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
diff --git a/clang/test/CodeGen/instrument-objc-method.m b/clang/test/CodeGen/instrument-objc-method.m
index cfc0a0a..2c9d1fc 100644
--- a/clang/test/CodeGen/instrument-objc-method.m
+++ b/clang/test/CodeGen/instrument-objc-method.m
@@ -11,16 +11,16 @@
+ (void)initialize {
}
-// PREINLINE: declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-// BARE: @"\01+[ObjCClass load]"{{\(.*\)}} #2
+// BARE: @"\01+[ObjCClass load]"{{\(.*\)}} #1
+ (void)load __attribute__((no_instrument_function)) {
}
-// PREINLINE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #2
-// BARE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #2
+// PREINLINE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #1
+// BARE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #1
- (void)dealloc __attribute__((no_instrument_function)) {
}
+// PREINLINE: declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
// PREINLINE: attributes #0 = { {{.*}}"instrument-function-entry"="__cyg_profile_func_enter"
// PREINLINE-NOT: attributes #0 = { {{.*}}"instrument-function-entry"="__cyg_profile_func_enter_bare"
// PREINLINE-NOT: attributes #2 = { {{.*}}"__cyg_profile_func_enter"
diff --git a/clang/test/CodeGen/paren-list-agg-init.cpp b/clang/test/CodeGen/paren-list-agg-init.cpp
index 94d4243..88b1834 100644
--- a/clang/test/CodeGen/paren-list-agg-init.cpp
+++ b/clang/test/CodeGen/paren-list-agg-init.cpp
@@ -271,14 +271,13 @@ const int* foo10() {
// CHECK-NEXT: [[ARR_2:%.*]] = alloca [4 x i32], align 16
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
-// CHECK-NEXT: [[ARRINIT_BEGIN:%.*]] = getelementptr inbounds [4 x i32], ptr [[ARR_2]], i64 0, i64 0
// CHECK-NEXT: [[TMP_0:%.*]] = load i32, ptr [[A_ADDR]], align 4
-// CHECK-NEXT: store i32 [[TMP_0]], ptr [[ARRINIT_BEGIN]], align 4
-// CHECK-NEXT: [[ARRINIT_ELEM:%.*]] = getelementptr inbounds i32, ptr [[ARRINIT_BEGIN]], i64 1
+// CHECK-NEXT: store i32 [[TMP_0]], ptr [[ARR_2]], align 4
+// CHECK-NEXT: [[ARRINIT_ELEM:%.*]] = getelementptr inbounds i32, ptr [[ARR_2]], i64 1
// CHECK-NEXT: [[TMP_1:%.*]] = load i32, ptr [[B_ADDR]], align 4
// CHECK-NEXT: store i32 [[TMP_1]], ptr [[ARRINIT_ELEM]], align 4
-// CHECK-NEXT: [[ARRINIT_START:%.*]] = getelementptr inbounds i32, ptr [[ARRINIT_ELEM]], i64 1
-// CHECK-NEXT: [[ARRINIT_END:%.*]] = getelementptr inbounds i32, ptr [[ARRINIT_BEGIN]], i64 4
+// CHECK-NEXT: [[ARRINIT_START:%.*]] = getelementptr inbounds i32, ptr [[ARR_2]], i64 2
+// CHECK-NEXT: [[ARRINIT_END:%.*]] = getelementptr inbounds i32, ptr [[ARR_2]], i64 4
// CHECK-NEXT: br label [[ARRINIT_BODY:%.*]]
// CHECK: [[ARRINIT_CUR:%.*]] = phi ptr [ [[ARRINIT_START]], %entry ], [ [[ARRINIT_NEXT:%.*]], [[ARRINIT_BODY]] ]
// CHECK-NEXT: store i32 0, ptr [[ARRINIT_CUR]], align 4
@@ -297,10 +296,9 @@ void foo11(int a, int b) {
// CHECK-NEXT: [[ARR_3:%.*]] = alloca [2 x i32], align 4
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4
-// CHECK-NEXT: [[ARRINIT_BEGIN:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARR_3]], i64 0, i64 0
// CHECK-NEXT: [[TMP_0:%.*]] = load i32, ptr [[A_ADDR]], align 4
-// CHECK-NEXT: store i32 [[TMP_0]], ptr [[ARRINIT_BEGIN]], align 4
-// CHECK-NEXT: [[ARRINIT_ELEMENT:%.*]] = getelementptr inbounds i32, ptr [[ARRINIT_BEGIN]], i64 1
+// CHECK-NEXT: store i32 [[TMP_0]], ptr [[ARR_3]], align 4
+// CHECK-NEXT: [[ARRINIT_ELEMENT:%.*]] = getelementptr inbounds i32, ptr [[ARR_3]], i64 1
// CHECK-NEXT: [[TMP_1:%.*]] = load i32, ptr [[B_ADDR]], align 4
// CHECK-NEXT: store i32 [[TMP_1]], ptr [[ARRINIT_ELEMENT]], align 4
// CHECK-NEXT: ret void
@@ -336,8 +334,7 @@ const int* foo15() {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[ARR_6:%.*arr6.*]] = alloca ptr, align 8
// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [1 x i32], align 4
-// CHECK-NEXT: [[ARRINIT_BEGIN:%.*]] = getelementptr inbounds [1 x i32], ptr [[REF_TMP]], i64 0, i64 0
-// CHECK-NEXT: store i32 3, ptr [[ARRINIT_BEGIN]], align 4
+// CHECK-NEXT: store i32 3, ptr [[REF_TMP]], align 4
// CHECK-NEXT: store ptr [[REF_TMP]], ptr [[ARR_6]], align 8
// CHECK-NEXT: ret void
void foo16() {
@@ -348,10 +345,9 @@ void foo16() {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[ARR_7:%.*arr7.*]] = alloca ptr, align 8
// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [2 x i32], align 4
-// CHECK-NEXT: [[ARRINIT_BEGIN:%.*]] = getelementptr inbounds [2 x i32], ptr [[REF_TMP]], i64 0, i64 0
-// CHECK-NEXT: store i32 4, ptr [[ARRINIT_BEGIN]], align 4
-// CHECK-NEXT: [[ARRINIT_START:%.*]] = getelementptr inbounds i32, ptr [[ARRINIT_BEGIN]], i64 1
-// CHECK-NEXT: [[ARRINIT_END:%.*]] = getelementptr inbounds i32, ptr [[ARRINIT_BEGIN]], i64 2
+// CHECK-NEXT: store i32 4, ptr [[REF_TMP]], align 4
+// CHECK-NEXT: [[ARRINIT_START:%.*]] = getelementptr inbounds i32, ptr [[REF_TMP]], i64 1
+// CHECK-NEXT: [[ARRINIT_END:%.*]] = getelementptr inbounds i32, ptr [[REF_TMP]], i64 2
// CHECK-NEXT: br label [[ARRINIT_BODY]]
// CHECK: [[ARRINIT_CUR:%.*]] = phi ptr [ [[ARRINIT_START]], %entry ], [ [[ARRINIT_NEXT:%.*]], [[ARRINIT_BODY]] ]
// CHECK-NEXT: store i32 0, ptr [[ARRINIT_CUR]], align 4
@@ -533,14 +529,12 @@ namespace gh68198 {
// CHECK-NEXT: entry
// CHECK-NEXT: [[ARR_10:%.*arr9.*]] = alloca ptr, align 8
// CHECK-NEXT: [[CALL_PTR]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 16)
- // CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x i32], ptr [[CALL]], i64 0, i64 0
- // CHECK-NEXT: store i32 1, ptr [[ARRAYINIT_BEGIN]], align 4
- // CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds i32, ptr [[ARRAYINIT_BEGIN]], i64 1
+ // CHECK-NEXT: store i32 1, ptr [[CALL]], align 4
+ // CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 1
// CHECK-NEXT: store i32 2, ptr [[ARRAYINIT_ELEMENT]], align 4
// CHECK-NEXT: [[ARRAY_EXP_NEXT:%.*]] = getelementptr inbounds [2 x i32], ptr %call, i64 1
- // CHECK-NEXT: [[ARRAYINIT_BEGIN1:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAY_EXP_NEXT]], i64 0, i64 0
- // CHECK-NEXT: store i32 3, ptr [[ARRAYINIT_BEGIN1]], align 4
- // CHECK-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds i32, ptr [[ARRAYINIT_BEGIN1]], i64 1
+ // CHECK-NEXT: store i32 3, ptr [[ARRAY_EXP_NEXT]], align 4
+ // CHECK-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds i32, ptr [[ARRAY_EXP_NEXT]], i64 1
// CHECK-NEXT: store i32 4, ptr [[ARRAYINIT_ELEMENT2]], align 4
// CHECK-NEXT: [[ARRAY_EXP_NEXT3:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAY_EXP_NEXT]], i64 1
// CHECK-NEXT: store ptr [[CALL_PTR]], ptr [[ARR_10]], align 8
@@ -553,14 +547,12 @@ namespace gh68198 {
// CHECK-NEXT: entry
// CHECK-NEXT: [[ARR_10:%.*arr10.*]] = alloca ptr, align 8
// CHECK-NEXT: [[CALL_PTR]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 32)
- // CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x i32], ptr [[CALL]], i64 0, i64 0
- // CHECK-NEXT: store i32 5, ptr [[ARRAYINIT_BEGIN]], align 4
- // CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds i32, ptr [[ARRAYINIT_BEGIN]], i64 1
+ // CHECK-NEXT: store i32 5, ptr [[CALL]], align 4
+ // CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 1
// CHECK-NEXT: store i32 6, ptr [[ARRAYINIT_ELEMENT]], align 4
// CHECK-NEXT: [[ARRAY_EXP_NEXT:%.*]] = getelementptr inbounds [2 x i32], ptr %call, i64 1
- // CHECK-NEXT: [[ARRAYINIT_BEGIN1:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAY_EXP_NEXT]], i64 0, i64 0
- // CHECK-NEXT: store i32 7, ptr [[ARRAYINIT_BEGIN1]], align 4
- // CHECK-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds i32, ptr [[ARRAYINIT_BEGIN1]], i64 1
+ // CHECK-NEXT: store i32 7, ptr [[ARRAY_EXP_NEXT]], align 4
+ // CHECK-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds i32, ptr [[ARRAY_EXP_NEXT]], i64 1
// CHECK-NEXT: store i32 8, ptr [[ARRAYINIT_ELEMENT2]], align 4
// CHECK-NEXT: [[ARRAY_EXP_NEXT3:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAY_EXP_NEXT]], i64 1
// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ARRAY_EXP_NEXT3]], i8 0, i64 16, i1 false)
diff --git a/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp b/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp
index fb236ae..81d9334 100644
--- a/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp
+++ b/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp
@@ -221,5 +221,5 @@ void tu2(int &i) {
}
}
-// CHECK: [[BW_LIKELY]] = !{!"branch_weights", i32 2000, i32 1}
-// CHECK: [[BW_UNLIKELY]] = !{!"branch_weights", i32 1, i32 2000}
+// CHECK: [[BW_LIKELY]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
+// CHECK: [[BW_UNLIKELY]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
diff --git a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
index ac466ee5..4eafa72 100644
--- a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
+++ b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
@@ -158,16 +158,15 @@ void ArrayInit() {
// CHECK-LABEL: define dso_local void @_Z9ArrayInitv()
// CHECK: %arrayinit.endOfInit = alloca ptr, align 8
// CHECK: %cleanup.dest.slot = alloca i32, align 4
- // CHECK: %arrayinit.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i64 0, i64 0
- // CHECK: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit, align 8
+ // CHECK: store ptr %arr, ptr %arrayinit.endOfInit, align 8
Printy arr[4] = {
Printy("a"),
- // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str)
- // CHECK: [[ARRAYINIT_ELEMENT1:%.+]] = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1
+ // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arr, ptr noundef @.str)
+ // CHECK: [[ARRAYINIT_ELEMENT1:%.+]] = getelementptr inbounds %struct.Printy, ptr %arr, i64 1
// CHECK: store ptr [[ARRAYINIT_ELEMENT1]], ptr %arrayinit.endOfInit, align 8
Printy("b"),
// CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT1]], ptr noundef @.str.1)
- // CHECK: [[ARRAYINIT_ELEMENT2:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAYINIT_ELEMENT1]], i64 1
+ // CHECK: [[ARRAYINIT_ELEMENT2:%.+]] = getelementptr inbounds %struct.Printy, ptr %arr, i64 2
// CHECK: store ptr [[ARRAYINIT_ELEMENT2]], ptr %arrayinit.endOfInit, align 8
({
// CHECK: br i1 {{.*}}, label %if.then, label %if.end
@@ -180,7 +179,7 @@ void ArrayInit() {
// CHECK: if.end:
Printy("c");
// CHECK-NEXT: call void @_ZN6PrintyC1EPKc
- // CHECK-NEXT: %arrayinit.element2 = getelementptr inbounds %struct.Printy, ptr %arrayinit.element1, i64 1
+ // CHECK-NEXT: %arrayinit.element2 = getelementptr inbounds %struct.Printy, ptr %arr, i64 3
// CHECK-NEXT: store ptr %arrayinit.element2, ptr %arrayinit.endOfInit, align 8
}),
({
@@ -212,14 +211,14 @@ void ArrayInit() {
// CHECK: cleanup:
// CHECK-NEXT: %1 = load ptr, ptr %arrayinit.endOfInit, align 8
- // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin, %1
+ // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr %arr, %1
// CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2:.+]]
// CHECK: [[ARRAY_DESTROY_BODY2]]:
// CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %1, %cleanup ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY2]] ]
// CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
// CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
- // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %arrayinit.begin
+ // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %arr
// CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE2]], label %[[ARRAY_DESTROY_BODY2]]
// CHECK: [[ARRAY_DESTROY_DONE2]]:
@@ -238,8 +237,7 @@ void ArraySubobjects() {
// CHECK: call void @_ZN6PrintyC1EPKc
// CHECK: call void @_ZN6PrintyC1EPKc
{Printy("a"),
- // CHECK: [[ARRAYINIT_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy]
- // CHECK: store ptr [[ARRAYINIT_BEGIN]], ptr %arrayinit.endOfInit, align 8
+ // CHECK: store ptr %arr2, ptr %arrayinit.endOfInit, align 8
// CHECK: call void @_ZN6PrintyC1EPKc
// CHECK: [[ARRAYINIT_ELEMENT:%.+]] = getelementptr inbounds %struct.Printy
// CHECK: store ptr [[ARRAYINIT_ELEMENT]], ptr %arrayinit.endOfInit, align 8
@@ -248,7 +246,7 @@ void ArraySubobjects() {
return;
// CHECK: if.then:
// CHECK-NEXT: [[V0:%.+]] = load ptr, ptr %arrayinit.endOfInit, align 8
- // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr [[ARRAYINIT_BEGIN]], [[V0]]
+ // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr %arr2, [[V0]]
// CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE:.+]], label %[[ARRAY_DESTROY_BODY:.+]]
}
Printy("b");
@@ -268,7 +266,7 @@ void ArraySubobjects() {
// CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %0, %if.then ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY]] ]
// CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
// CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
- // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, [[ARRAYINIT_BEGIN]]
+ // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %arr2
// CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE]], label %[[ARRAY_DESTROY_BODY]]
// CHECK: [[ARRAY_DESTROY_DONE]]
@@ -277,11 +275,11 @@ void ArraySubobjects() {
// CHECK-NEXT: br label %[[ARRAY_DESTROY_BODY2:.+]]
// CHECK: [[ARRAY_DESTROY_BODY2]]:
- // CHECK-NEXT: %arraydestroy.elementPast5 = phi ptr [ %1, %[[ARRAY_DESTROY_DONE]] ], [ %arraydestroy.element6, %[[ARRAY_DESTROY_BODY2]] ]
- // CHECK-NEXT: %arraydestroy.element6 = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast5, i64 -1
- // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element6)
- // CHECK-NEXT: %arraydestroy.done7 = icmp eq ptr %arraydestroy.element6, [[ARRAY_BEGIN]]
- // CHECK-NEXT: br i1 %arraydestroy.done7, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2]]
+ // CHECK-NEXT: %arraydestroy.elementPast4 = phi ptr [ %1, %[[ARRAY_DESTROY_DONE]] ], [ %arraydestroy.element5, %[[ARRAY_DESTROY_BODY2]] ]
+ // CHECK-NEXT: %arraydestroy.element5 = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast4, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element5)
+ // CHECK-NEXT: %arraydestroy.done6 = icmp eq ptr %arraydestroy.element5, [[ARRAY_BEGIN]]
+ // CHECK-NEXT: br i1 %arraydestroy.done6, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2]]
// CHECK: [[ARRAY_DESTROY_DONE2]]:
diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-references.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-references.cpp
index b19ca1d..419525d 100644
--- a/clang/test/CodeGenCXX/cxx0x-initializer-references.cpp
+++ b/clang/test/CodeGenCXX/cxx0x-initializer-references.cpp
@@ -52,11 +52,10 @@ namespace reference {
// CHECK-NEXT: store ptr %{{.*}}, ptr %{{.*}}, align
const A &ra1{1, i};
- // CHECK-NEXT: getelementptr inbounds [3 x i32], ptr %{{.*}}, i{{32|64}} 0, i{{32|64}} 0
// CHECK-NEXT: store i32 1
// CHECK-NEXT: getelementptr inbounds i32, ptr %{{.*}}, i{{32|64}} 1
// CHECK-NEXT: store i32 2
- // CHECK-NEXT: getelementptr inbounds i32, ptr %{{.*}}, i{{32|64}} 1
+ // CHECK-NEXT: getelementptr inbounds i32, ptr %{{.*}}, i{{32|64}} 2
// CHECK-NEXT: %[[I2:.*]] = load i32, ptr
// CHECK-NEXT: store i32 %[[I2]]
// CHECK-NEXT: store ptr %{{.*}}, ptr %{{.*}}, align
diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist-startend.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist-startend.cpp
index d9e4c5d..e2d5661 100644
--- a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist-startend.cpp
+++ b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist-startend.cpp
@@ -40,8 +40,7 @@ void fn1(int i) {
// CHECK-LABEL: define{{.*}} void @_Z3fn1i
// temporary array
// CHECK: [[array:%[^ ]+]] = alloca [3 x i32]
- // CHECK: getelementptr inbounds [3 x i32], ptr [[array]], i{{32|64}} 0
- // CHECK-NEXT: store i32 1, ptr
+ // CHECK: store i32 1, ptr
// CHECK-NEXT: getelementptr
// CHECK-NEXT: store
// CHECK-NEXT: getelementptr
diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
index aa2f078..3d0cf96 100644
--- a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
+++ b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
@@ -122,8 +122,7 @@ void fn1(int i) {
// X86: [[array:%[^ ]+]] = alloca [3 x i32]
// AMDGCN: [[alloca:%[^ ]+]] = alloca [3 x i32], align 4, addrspace(5)
// AMDGCN: [[array:%[^ ]+]] ={{.*}} addrspacecast ptr addrspace(5) [[alloca]] to ptr
- // CHECK: getelementptr inbounds [3 x i32], ptr [[array]], i{{32|64}} 0
- // CHECK-NEXT: store i32 1, ptr
+ // CHECK: store i32 1, ptr
// CHECK-NEXT: getelementptr
// CHECK-NEXT: store
// CHECK-NEXT: getelementptr
diff --git a/clang/test/CodeGenCXX/cxx11-initializer-array-new.cpp b/clang/test/CodeGenCXX/cxx11-initializer-array-new.cpp
index 48ee019..d86c712 100644
--- a/clang/test/CodeGenCXX/cxx11-initializer-array-new.cpp
+++ b/clang/test/CodeGenCXX/cxx11-initializer-array-new.cpp
@@ -16,22 +16,20 @@ void *p = new S[2][3]{ { 1, 2, 3 }, { 4, 5, 6 } };
// { 1, 2, 3 }
//
//
-// CHECK: %[[S_0_0:.*]] = getelementptr inbounds [3 x %[[S:.*]]], ptr %[[START_AS_i8]], i64 0, i64 0
-// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_0_0]], i32 noundef 1)
-// CHECK: %[[S_0_1:.*]] = getelementptr inbounds %[[S]], ptr %[[S_0_0]], i64 1
+// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[START_AS_i8]], i32 noundef 1)
+// CHECK: %[[S_0_1:.*]] = getelementptr inbounds %[[S:.+]], ptr %[[START_AS_i8]], i64 1
// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_0_1]], i32 noundef 2)
-// CHECK: %[[S_0_2:.*]] = getelementptr inbounds %[[S]], ptr %[[S_0_1]], i64 1
+// CHECK: %[[S_0_2:.*]] = getelementptr inbounds %[[S]], ptr %[[START_AS_i8]], i64 2
// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_0_2]], i32 noundef 3)
//
// { 4, 5, 6 }
//
// CHECK: %[[S_1:.*]] = getelementptr inbounds [3 x %[[S]]], ptr %[[START_AS_i8]], i64 1
//
-// CHECK: %[[S_1_0:.*]] = getelementptr inbounds [3 x %[[S]]], ptr %[[S_1]], i64 0, i64 0
-// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_1_0]], i32 noundef 4)
-// CHECK: %[[S_1_1:.*]] = getelementptr inbounds %[[S]], ptr %[[S_1_0]], i64 1
+// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_1]], i32 noundef 4)
+// CHECK: %[[S_1_1:.*]] = getelementptr inbounds %[[S]], ptr %[[S_1]], i64 1
// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_1_1]], i32 noundef 5)
-// CHECK: %[[S_1_2:.*]] = getelementptr inbounds %[[S]], ptr %[[S_1_1]], i64 1
+// CHECK: %[[S_1_2:.*]] = getelementptr inbounds %[[S]], ptr %[[S_1]], i64 2
// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_1_2]], i32 noundef 6)
//
// CHECK-NOT: br i1
@@ -57,22 +55,20 @@ void *q = new S[n][3]{ { 1, 2, 3 }, { 4, 5, 6 } };
// { 1, 2, 3 }
//
//
-// CHECK: %[[S_0_0:.*]] = getelementptr inbounds [3 x %[[S]]], ptr %[[START_AS_i8]], i64 0, i64 0
-// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_0_0]], i32 noundef 1)
-// CHECK: %[[S_0_1:.*]] = getelementptr inbounds %[[S]], ptr %[[S_0_0]], i64 1
+// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[START_AS_i8]], i32 noundef 1)
+// CHECK: %[[S_0_1:.*]] = getelementptr inbounds %[[S]], ptr %[[START_AS_i8]], i64 1
// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_0_1]], i32 noundef 2)
-// CHECK: %[[S_0_2:.*]] = getelementptr inbounds %[[S]], ptr %[[S_0_1]], i64 1
+// CHECK: %[[S_0_2:.*]] = getelementptr inbounds %[[S]], ptr %[[START_AS_i8]], i64 2
// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_0_2]], i32 noundef 3)
//
// { 4, 5, 6 }
//
// CHECK: %[[S_1:.*]] = getelementptr inbounds [3 x %[[S]]], ptr %[[START_AS_i8]], i64 1
//
-// CHECK: %[[S_1_0:.*]] = getelementptr inbounds [3 x %[[S]]], ptr %[[S_1]], i64 0, i64 0
-// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_1_0]], i32 noundef 4)
-// CHECK: %[[S_1_1:.*]] = getelementptr inbounds %[[S]], ptr %[[S_1_0]], i64 1
+// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_1]], i32 noundef 4)
+// CHECK: %[[S_1_1:.*]] = getelementptr inbounds %[[S]], ptr %[[S_1]], i64 1
// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_1_1]], i32 noundef 5)
-// CHECK: %[[S_1_2:.*]] = getelementptr inbounds %[[S]], ptr %[[S_1_1]], i64 1
+// CHECK: %[[S_1_2:.*]] = getelementptr inbounds %[[S]], ptr %[[S_1]], i64 2
// CHECK: call void @_ZN1SC1Ei(ptr {{[^,]*}} %[[S_1_2]], i32 noundef 6)
//
// And the rest.
@@ -114,13 +110,12 @@ void *r = new T[n][3]{ { 1, 2, 3 }, { 4, 5, 6 } };
// { 1, 2, 3 }
//
//
-// CHECK: %[[T_0_0:.*]] = getelementptr inbounds [3 x %[[T:.*]]], ptr %[[ALLOC]], i64 0, i64 0
-// CHECK: %[[T_0_0_0:.*]] = getelementptr inbounds %[[T]], ptr %[[T_0_0]], i32 0, i32 0
+// CHECK: %[[T_0_0_0:.*]] = getelementptr inbounds %[[T:.+]], ptr %[[ALLOC]], i32 0, i32 0
// CHECK: store i32 1, ptr %[[T_0_0_0]]
-// CHECK: %[[T_0_1:.*]] = getelementptr inbounds %[[T]], ptr %[[T_0_0]], i64 1
+// CHECK: %[[T_0_1:.*]] = getelementptr inbounds %[[T]], ptr %[[ALLOC]], i64 1
// CHECK: %[[T_0_1_0:.*]] = getelementptr inbounds %[[T]], ptr %[[T_0_1]], i32 0, i32 0
// CHECK: store i32 2, ptr %[[T_0_1_0]]
-// CHECK: %[[T_0_2:.*]] = getelementptr inbounds %[[T]], ptr %[[T_0_1]], i64 1
+// CHECK: %[[T_0_2:.*]] = getelementptr inbounds %[[T]], ptr %[[ALLOC]], i64 2
// CHECK: %[[T_0_2_0:.*]] = getelementptr inbounds %[[T]], ptr %[[T_0_2]], i32 0, i32 0
// CHECK: store i32 3, ptr %[[T_0_2_0]]
//
@@ -128,13 +123,12 @@ void *r = new T[n][3]{ { 1, 2, 3 }, { 4, 5, 6 } };
//
// CHECK: %[[T_1:.*]] = getelementptr inbounds [3 x %[[T]]], ptr %[[ALLOC]], i64 1
//
-// CHECK: %[[T_1_0:.*]] = getelementptr inbounds [3 x %[[T]]], ptr %[[T_1]], i64 0, i64 0
-// CHECK: %[[T_1_0_0:.*]] = getelementptr inbounds %[[T]], ptr %[[T_1_0]], i32 0, i32 0
+// CHECK: %[[T_1_0_0:.*]] = getelementptr inbounds %[[T]], ptr %[[T_1]], i32 0, i32 0
// CHECK: store i32 4, ptr %[[T_1_0_0]]
-// CHECK: %[[T_1_1:.*]] = getelementptr inbounds %[[T]], ptr %[[T_1_0]], i64 1
+// CHECK: %[[T_1_1:.*]] = getelementptr inbounds %[[T]], ptr %[[T_1]], i64 1
// CHECK: %[[T_1_1_0:.*]] = getelementptr inbounds %[[T]], ptr %[[T_1_1]], i32 0, i32 0
// CHECK: store i32 5, ptr %[[T_1_1_0]]
-// CHECK: %[[T_1_2:.*]] = getelementptr inbounds %[[T]], ptr %[[T_1_1]], i64 1
+// CHECK: %[[T_1_2:.*]] = getelementptr inbounds %[[T]], ptr %[[T_1]], i64 2
// CHECK: %[[T_1_2_0:.*]] = getelementptr inbounds %[[T]], ptr %[[T_1_2]], i32 0, i32 0
// CHECK: store i32 6, ptr %[[T_1_2_0]]
//
diff --git a/clang/test/CodeGenCXX/partial-destruction.cpp b/clang/test/CodeGenCXX/partial-destruction.cpp
index 840e438..8ceb4b9 100644
--- a/clang/test/CodeGenCXX/partial-destruction.cpp
+++ b/clang/test/CodeGenCXX/partial-destruction.cpp
@@ -20,15 +20,14 @@ namespace test0 {
// CHECK-NEXT: [[SEL:%.*]] = alloca i32
// Initialize.
- // CHECK-NEXT: [[E_BEGIN:%.*]] = getelementptr inbounds [10 x [[A]]], ptr [[AS]], i64 0, i64 0
- // CHECK-NEXT: store ptr [[E_BEGIN]], ptr [[ENDVAR]]
- // CHECK-NEXT: invoke void @_ZN5test01AC1Ei(ptr {{[^,]*}} [[E_BEGIN]], i32 noundef 5)
- // CHECK: [[E1:%.*]] = getelementptr inbounds [[A]], ptr [[E_BEGIN]], i64 1
+ // CHECK-NEXT: store ptr [[AS]], ptr [[ENDVAR]]
+ // CHECK-NEXT: invoke void @_ZN5test01AC1Ei(ptr {{[^,]*}} [[AS]], i32 noundef 5)
+ // CHECK: [[E1:%.*]] = getelementptr inbounds [[A]], ptr [[AS]], i64 1
// CHECK-NEXT: store ptr [[E1]], ptr [[ENDVAR]]
// CHECK-NEXT: invoke void @_ZN5test01AC1Ei(ptr {{[^,]*}} [[E1]], i32 noundef 7)
- // CHECK: [[E2:%.*]] = getelementptr inbounds [[A]], ptr [[E1]], i64 1
+ // CHECK: [[E2:%.*]] = getelementptr inbounds [[A]], ptr [[AS]], i64 2
// CHECK-NEXT: store ptr [[E2]], ptr [[ENDVAR]]
- // CHECK-NEXT: [[E_END:%.*]] = getelementptr inbounds [[A]], ptr [[E_BEGIN]], i64 10
+ // CHECK-NEXT: [[E_END:%.*]] = getelementptr inbounds [[A]], ptr [[AS]], i64 10
// CHECK-NEXT: br label
// CHECK: [[E_CUR:%.*]] = phi ptr [ [[E2]], {{%.*}} ], [ [[E_NEXT:%.*]], {{%.*}} ]
// CHECK-NEXT: invoke void @_ZN5test01AC1Ev(ptr {{[^,]*}} [[E_CUR]])
@@ -56,13 +55,13 @@ namespace test0 {
// CHECK: landingpad { ptr, i32 }
// CHECK-NEXT: cleanup
// CHECK: [[PARTIAL_END:%.*]] = load ptr, ptr [[ENDVAR]]
- // CHECK-NEXT: [[T0:%.*]] = icmp eq ptr [[E_BEGIN]], [[PARTIAL_END]]
+ // CHECK-NEXT: [[T0:%.*]] = icmp eq ptr [[AS]], [[PARTIAL_END]]
// CHECK-NEXT: br i1 [[T0]],
// CHECK: [[E_AFTER:%.*]] = phi ptr [ [[PARTIAL_END]], {{%.*}} ], [ [[E_CUR:%.*]], {{%.*}} ]
// CHECK-NEXT: [[E_CUR]] = getelementptr inbounds [[A]], ptr [[E_AFTER]], i64 -1
// CHECKv03-NEXT: invoke void @_ZN5test01AD1Ev(ptr {{[^,]*}} [[E_CUR]])
// CHECKv11-NEXT: call void @_ZN5test01AD1Ev(ptr {{[^,]*}} [[E_CUR]])
- // CHECK: [[T0:%.*]] = icmp eq ptr [[E_CUR]], [[E_BEGIN]]
+ // CHECK: [[T0:%.*]] = icmp eq ptr [[E_CUR]], [[AS]]
// CHECK-NEXT: br i1 [[T0]],
// Primary EH destructor.
@@ -189,25 +188,22 @@ namespace test4 {
}
// CHECK-LABEL: define{{.*}} void @_ZN5test44testEv()
// CHECK: [[ARRAY:%.*]] = alloca [2 x [3 x [[A:%.*]]]], align
-// CHECK: [[A0:%.*]] = getelementptr inbounds [2 x [3 x [[A]]]], ptr [[ARRAY]], i64 0, i64 0
-// CHECK-NEXT: store ptr [[A0]],
-// CHECK-NEXT: [[A00:%.*]] = getelementptr inbounds [3 x [[A]]], ptr [[A0]], i64 0, i64 0
-// CHECK-NEXT: store ptr [[A00]],
-// CHECK-NEXT: invoke void @_ZN5test41AC1Ej(ptr {{[^,]*}} [[A00]], i32 noundef 0)
-// CHECK: [[A01:%.*]] = getelementptr inbounds [[A]], ptr [[A00]], i64 1
+// CHECK: store ptr [[ARRAY]],
+// CHECK-NEXT: store ptr [[ARRAY]],
+// CHECK-NEXT: invoke void @_ZN5test41AC1Ej(ptr {{[^,]*}} [[ARRAY]], i32 noundef 0)
+// CHECK: [[A01:%.*]] = getelementptr inbounds [[A]], ptr [[ARRAY]], i64 1
// CHECK-NEXT: store ptr [[A01]],
// CHECK-NEXT: invoke void @_ZN5test41AC1Ej(ptr {{[^,]*}} [[A01]], i32 noundef 1)
-// CHECK: [[A02:%.*]] = getelementptr inbounds [[A]], ptr [[A01]], i64 1
+// CHECK: [[A02:%.*]] = getelementptr inbounds [[A]], ptr [[ARRAY]], i64 2
// CHECK-NEXT: store ptr [[A02]],
// CHECK-NEXT: invoke void @_ZN5test41AC1Ej(ptr {{[^,]*}} [[A02]], i32 noundef 2)
-// CHECK: [[A1:%.*]] = getelementptr inbounds [3 x [[A]]], ptr [[A0]], i64 1
+// CHECK: [[A1:%.*]] = getelementptr inbounds [3 x [[A]]], ptr [[ARRAY]], i64 1
// CHECK-NEXT: store ptr [[A1]],
-// CHECK-NEXT: [[A10:%.*]] = getelementptr inbounds [3 x [[A]]], ptr [[A1]], i64 0, i64 0
-// CHECK-NEXT: store ptr [[A10]],
-// CHECK-NEXT: invoke void @_ZN5test41AC1Ej(ptr {{[^,]*}} [[A10]], i32 noundef 3)
-// CHECK: [[A11:%.*]] = getelementptr inbounds [[A]], ptr [[A10]], i64 1
+// CHECK-NEXT: store ptr [[A1]],
+// CHECK-NEXT: invoke void @_ZN5test41AC1Ej(ptr {{[^,]*}} [[A1]], i32 noundef 3)
+// CHECK: [[A11:%.*]] = getelementptr inbounds [[A]], ptr [[A1]], i64 1
// CHECK-NEXT: store ptr [[A11]],
// CHECK-NEXT: invoke void @_ZN5test41AC1Ej(ptr {{[^,]*}} [[A11]], i32 noundef 4)
-// CHECK: [[A12:%.*]] = getelementptr inbounds [[A]], ptr [[A11]], i64 1
+// CHECK: [[A12:%.*]] = getelementptr inbounds [[A]], ptr [[A1]], i64 2
// CHECK-NEXT: store ptr [[A12]],
// CHECK-NEXT: invoke void @_ZN5test41AC1Ej(ptr {{[^,]*}} [[A12]], i32 noundef 5)
diff --git a/clang/test/CodeGenCXX/temporaries.cpp b/clang/test/CodeGenCXX/temporaries.cpp
index 9f29d3b..186f493 100644
--- a/clang/test/CodeGenCXX/temporaries.cpp
+++ b/clang/test/CodeGenCXX/temporaries.cpp
@@ -477,9 +477,8 @@ namespace Elision {
// CHECK-NEXT: call void @_ZN7Elision1AC1Ev(ptr {{[^,]*}} [[X]])
A x;
- // CHECK-NEXT: [[XS0:%.*]] = getelementptr inbounds [2 x [[A]]], ptr [[XS]], i64 0, i64 0
- // CHECK-NEXT: call void @_ZN7Elision1AC1Ev(ptr {{[^,]*}} [[XS0]])
- // CHECK-NEXT: [[XS1:%.*]] = getelementptr inbounds [[A]], ptr [[XS0]], i64 1
+ // CHECK-NEXT: call void @_ZN7Elision1AC1Ev(ptr {{[^,]*}} [[XS]])
+ // CHECK-NEXT: [[XS1:%.*]] = getelementptr inbounds [[A]], ptr [[XS]], i64 1
// CHECK-NEXT: call void @_ZN7Elision1AC1ERKS0_(ptr {{[^,]*}} [[XS1]], ptr noundef {{(nonnull )?}}align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[X]])
A xs[] = { A(), x };
diff --git a/clang/test/CodeGenCXX/value-init.cpp b/clang/test/CodeGenCXX/value-init.cpp
index 42181be..9e72769 100644
--- a/clang/test/CodeGenCXX/value-init.cpp
+++ b/clang/test/CodeGenCXX/value-init.cpp
@@ -205,11 +205,9 @@ namespace test6 {
// CHECK-LABEL: define{{.*}} void @_ZN5test64testEv()
// CHECK: [[ARR:%.*]] = alloca [10 x [20 x [[A:%.*]]]],
- // CHECK-NEXT: [[INNER:%.*]] = getelementptr inbounds [10 x [20 x [[A]]]], ptr [[ARR]], i64 0, i64 0
- // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [20 x [[A]]], ptr [[INNER]], i64 0, i64 0
- // CHECK-NEXT: call void @_ZN5test61AC1Ei(ptr {{[^,]*}} [[T0]], i32 noundef 5)
- // CHECK-NEXT: [[BEGIN:%.*]] = getelementptr inbounds [[A]], ptr [[T0]], i64 1
- // CHECK-NEXT: [[END:%.*]] = getelementptr inbounds [[A]], ptr [[T0]], i64 20
+ // CHECK-NEXT: call void @_ZN5test61AC1Ei(ptr {{[^,]*}} [[ARR]], i32 noundef 5)
+ // CHECK-NEXT: [[BEGIN:%.*]] = getelementptr inbounds [[A]], ptr [[ARR]], i64 1
+ // CHECK-NEXT: [[END:%.*]] = getelementptr inbounds [[A]], ptr [[ARR]], i64 20
// CHECK-NEXT: br label
// CHECK: [[CUR:%.*]] = phi ptr [ [[BEGIN]], {{%.*}} ], [ [[NEXT:%.*]], {{%.*}} ]
// CHECK-NEXT: call void @_ZN5test61AC1Ev(ptr {{[^,]*}} [[CUR]])
@@ -217,13 +215,15 @@ namespace test6 {
// CHECK-NEXT: [[T0:%.*]] = icmp eq ptr [[NEXT]], [[END]]
// CHECK-NEXT: br i1
- // CHECK: [[BEGIN:%.*]] = getelementptr inbounds [20 x [[A]]], ptr [[INNER]], i64 1
- // CHECK-NEXT: [[END:%.*]] = getelementptr inbounds [20 x [[A]]], ptr [[INNER]], i64 10
+ // CHECK: [[BEGIN:%.*]] = getelementptr inbounds [20 x [[A]]], ptr [[ARR]], i64 1
+ // CHECK-NEXT: [[END:%.*]] = getelementptr inbounds [20 x [[A]]], ptr [[ARR]], i64 10
// CHECK-NEXT: br label
- // CHECK: [[CUR:%.*]] = phi ptr [ [[BEGIN]], {{%.*}} ], [ [[NEXT:%.*]], {{%.*}} ]
-
// Inner loop.
- // CHECK-NEXT: [[IBEGIN:%.*]] = getelementptr inbounds [20 x [[A]]], ptr [[CUR]], i{{32|64}} 0, i{{32|64}} 0
+ // CHECK-CXX98: [[CUR:%.*]] = phi ptr [ [[BEGIN]], {{%.*}} ], [ [[NEXT:%.*]], {{%.*}} ]
+
+ // CHECK-CXX98: [[IBEGIN:%.*]] = getelementptr inbounds [20 x [[A]]], ptr [[CUR]], i{{32|64}} 0, i{{32|64}} 0
+ // CHECK-CXX17: [[IBEGIN:%.*]] = phi ptr [ [[BEGIN]], {{%.*}} ], [ [[NEXT:%.*]], {{%.*}} ]
+
// CHECK-NEXT: [[IEND:%.*]] = getelementptr inbounds [[A]], ptr [[IBEGIN]], i64 20
// CHECK-NEXT: br label
// CHECK: [[ICUR:%.*]] = phi ptr [ [[IBEGIN]], {{%.*}} ], [ [[INEXT:%.*]], {{%.*}} ]
@@ -232,7 +232,8 @@ namespace test6 {
// CHECK-NEXT: [[T0:%.*]] = icmp eq ptr [[INEXT]], [[IEND]]
// CHECK-NEXT: br i1 [[T0]],
- // CHECK: [[NEXT]] = getelementptr inbounds [20 x [[A]]], ptr [[CUR]], i64 1
+ // CHECK-CXX98: [[NEXT]] = getelementptr inbounds [20 x [[A]]], ptr [[CUR]], i64 1
+ // CHECK-CXX17: [[NEXT]] = getelementptr inbounds [20 x [[A]]], ptr [[IBEGIN]], i64 1
// CHECK-NEXT: [[T0:%.*]] = icmp eq ptr [[NEXT]], [[END]]
// CHECK-NEXT: br i1 [[T0]]
// CHECK: ret void
diff --git a/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp
index 06cc206..d71c2c5 100644
--- a/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp
+++ b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp
@@ -43,14 +43,11 @@ coroutine ArrayInitCoro() {
// CHECK: %cleanup.isactive = alloca i1, align 1
Printy arr[2] = {
Printy("a"),
- // CHECK: %arrayinit.begin = getelementptr inbounds [2 x %struct.Printy], ptr %arr.reload.addr, i64 0, i64 0
- // CHECK-NEXT: %arrayinit.begin.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 10
- // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.begin.spill.addr, align 8
- // CHECK-NEXT: store i1 true, ptr %cleanup.isactive.reload.addr, align 1
- // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit.reload.addr, align 8
- // CHECK-NEXT: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str)
- // CHECK-NEXT: %arrayinit.element = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1
- // CHECK-NEXT: %arrayinit.element.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11
+ // CHECK: store i1 true, ptr %cleanup.isactive.reload.addr, align 1
+ // CHECK-NEXT: store ptr %arr.reload.addr, ptr %arrayinit.endOfInit.reload.addr, align 8
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arr.reload.addr, ptr noundef @.str)
+ // CHECK-NEXT: %arrayinit.element = getelementptr inbounds %struct.Printy, ptr %arr.reload.addr, i64 1
+ // CHECK-NEXT: %arrayinit.element.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 10
// CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.element.spill.addr, align 8
// CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.endOfInit.reload.addr, align 8
co_await Awaiter{}
@@ -64,7 +61,7 @@ coroutine ArrayInitCoro() {
// CHECK: br label %cleanup{{.*}}
// CHECK: await.ready:
- // CHECK-NEXT: %arrayinit.element.reload.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11
+ // CHECK-NEXT: %arrayinit.element.reload.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 10
// CHECK-NEXT: %arrayinit.element.reload = load ptr, ptr %arrayinit.element.reload.addr, align 8
// CHECK-NEXT: call void @_ZN7Awaiter12await_resumeEv
// CHECK-NEXT: store i1 false, ptr %cleanup.isactive.reload.addr, align 1
@@ -75,7 +72,7 @@ coroutine ArrayInitCoro() {
// CHECK-NEXT: br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done
// CHECK: cleanup.action:
- // CHECK: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin.reload{{.*}}, %{{.*}}
+ // CHECK: %arraydestroy.isempty = icmp eq ptr %arr.reload.addr, %{{.*}}
// CHECK-NEXT: br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body.from.cleanup.action
// Ignore rest of the array cleanup.
}
diff --git a/clang/test/CodeGenObjC/arc-ternary-op.m b/clang/test/CodeGenObjC/arc-ternary-op.m
index 87167d9..d633851 100644
--- a/clang/test/CodeGenObjC/arc-ternary-op.m
+++ b/clang/test/CodeGenObjC/arc-ternary-op.m
@@ -153,22 +153,20 @@ void test3(int cond) {
// CHECK: %[[_COMPOUNDLITERAL1:.*]] = alloca [2 x ptr], align 8
// CHECK: %[[CLEANUP_COND4:.*]] = alloca i1, align 1
- // CHECK: %[[ARRAYINIT_BEGIN:.*]] = getelementptr inbounds [2 x ptr], ptr %[[_COMPOUNDLITERAL]], i64 0, i64 0
// CHECK: %[[V2:.*]] = load ptr, ptr @g0, align 8
// CHECK: %[[V3:.*]] = call ptr @llvm.objc.retain(ptr %[[V2]])
- // CHECK: store ptr %[[V3]], ptr %[[ARRAYINIT_BEGIN]], align 8
- // CHECK: %[[ARRAYINIT_ELEMENT:.*]] = getelementptr inbounds ptr, ptr %[[ARRAYINIT_BEGIN]], i64 1
+ // CHECK: store ptr %[[V3]], ptr %[[_COMPOUNDLITERAL]], align 8
+ // CHECK: %[[ARRAYINIT_ELEMENT:.*]] = getelementptr inbounds ptr, ptr %[[_COMPOUNDLITERAL]], i64 1
// CHECK: %[[V4:.*]] = load ptr, ptr @g1, align 8
// CHECK: %[[V5:.*]] = call ptr @llvm.objc.retain(ptr %[[V4]])
// CHECK: store ptr %[[V5]], ptr %[[ARRAYINIT_ELEMENT]], align 8
// CHECK: store i1 true, ptr %[[CLEANUP_COND]], align 1
// CHECK: %[[ARRAYDECAY:.*]] = getelementptr inbounds [2 x ptr], ptr %[[_COMPOUNDLITERAL]], i64 0, i64 0
- // CHECK: %[[ARRAYINIT_BEGIN2:.*]] = getelementptr inbounds [2 x ptr], ptr %[[_COMPOUNDLITERAL1]], i64 0, i64 0
// CHECK: %[[V6:.*]] = load ptr, ptr @g1, align 8
// CHECK: %[[V7:.*]] = call ptr @llvm.objc.retain(ptr %[[V6]])
- // CHECK: store ptr %[[V7]], ptr %[[ARRAYINIT_BEGIN2]], align 8
- // CHECK: %[[ARRAYINIT_ELEMENT3:.*]] = getelementptr inbounds ptr, ptr %[[ARRAYINIT_BEGIN2]], i64 1
+ // CHECK: store ptr %[[V7]], ptr %[[_COMPOUNDLITERAL1]], align 8
+ // CHECK: %[[ARRAYINIT_ELEMENT3:.*]] = getelementptr inbounds ptr, ptr %[[_COMPOUNDLITERAL1]], i64 1
// CHECK: %[[V8:.*]] = load ptr, ptr @g0, align 8
// CHECK: %[[V9:.*]] = call ptr @llvm.objc.retain(ptr %[[V8]])
// CHECK: store ptr %[[V9]], ptr %[[ARRAYINIT_ELEMENT3]], align 8
diff --git a/clang/test/CodeGenObjC/arc.m b/clang/test/CodeGenObjC/arc.m
index aeead58..48ca14a 100644
--- a/clang/test/CodeGenObjC/arc.m
+++ b/clang/test/CodeGenObjC/arc.m
@@ -1377,11 +1377,10 @@ void test71(void) {
// CHECK: %[[T:.*]] = alloca [2 x ptr], align 16
// CHECK: %[[V0:.*]] = call ptr @llvm.objc.retain(ptr %[[A]])
// CHECK: %[[V1:.*]] = call ptr @llvm.objc.retain(ptr %[[B]]) #2
-// CHECK: %[[ARRAYINIT_BEGIN:.*]] = getelementptr inbounds [2 x ptr], ptr %[[T]], i64 0, i64 0
// CHECK: %[[V3:.*]] = load ptr, ptr %[[A_ADDR]], align 8, !tbaa !7
// CHECK: %[[V4:.*]] = call ptr @llvm.objc.retain(ptr %[[V3]]) #2
-// CHECK: store ptr %[[V4]], ptr %[[ARRAYINIT_BEGIN]], align 8, !tbaa !7
-// CHECK: %[[ARRAYINIT_ELEMENT:.*]] = getelementptr inbounds ptr, ptr %[[ARRAYINIT_BEGIN]], i64 1
+// CHECK: store ptr %[[V4]], ptr %[[T]], align 8, !tbaa !7
+// CHECK: %[[ARRAYINIT_ELEMENT:.*]] = getelementptr inbounds ptr, ptr %[[T]], i64 1
// CHECK: %[[V5:.*]] = load ptr, ptr %[[B_ADDR]], align 8, !tbaa !7
// CHECK: %[[V6:.*]] = call ptr @llvm.objc.retain(ptr %[[V5]]) #2
// CHECK: store ptr %[[V6]], ptr %[[ARRAYINIT_ELEMENT]], align 8, !tbaa !7
diff --git a/clang/test/CodeGenObjCXX/arc-exceptions.mm b/clang/test/CodeGenObjCXX/arc-exceptions.mm
index 709afa3..3efe566 100644
--- a/clang/test/CodeGenObjCXX/arc-exceptions.mm
+++ b/clang/test/CodeGenObjCXX/arc-exceptions.mm
@@ -115,23 +115,20 @@ void test5(void) {
}
// CHECK-LABEL: define{{.*}} void @_Z5test5v()
// CHECK: [[ARRAY:%.*]] = alloca [2 x [2 x ptr]], align
-// CHECK: [[A0:%.*]] = getelementptr inbounds [2 x [2 x ptr]], ptr [[ARRAY]], i64 0, i64 0
-// CHECK-NEXT: store ptr [[A0]],
-// CHECK-NEXT: [[A00:%.*]] = getelementptr inbounds [2 x ptr], ptr [[A0]], i64 0, i64 0
-// CHECK-NEXT: store ptr [[A00]],
+// CHECK: store ptr [[ARRAY]],
+// CHECK-NEXT: store ptr [[ARRAY]],
// CHECK-NEXT: [[T0:%.*]] = invoke noundef ptr @_Z12test5_helperj(i32 noundef 0)
-// CHECK: store ptr [[T0]], ptr [[A00]], align
-// CHECK-NEXT: [[A01:%.*]] = getelementptr inbounds ptr, ptr [[A00]], i64 1
+// CHECK: store ptr [[T0]], ptr [[ARRAY]], align
+// CHECK-NEXT: [[A01:%.*]] = getelementptr inbounds ptr, ptr [[ARRAY]], i64 1
// CHECK-NEXT: store ptr [[A01]],
// CHECK-NEXT: [[T0:%.*]] = invoke noundef ptr @_Z12test5_helperj(i32 noundef 1)
// CHECK: store ptr [[T0]], ptr [[A01]], align
-// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds [2 x ptr], ptr [[A0]], i64 1
+// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds [2 x ptr], ptr [[ARRAY]], i64 1
+// CHECK-NEXT: store ptr [[A1]],
// CHECK-NEXT: store ptr [[A1]],
-// CHECK-NEXT: [[A10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[A1]], i64 0, i64 0
-// CHECK-NEXT: store ptr [[A10]],
// CHECK-NEXT: [[T0:%.*]] = invoke noundef ptr @_Z12test5_helperj(i32 noundef 2)
-// CHECK: store ptr [[T0]], ptr [[A10]], align
-// CHECK-NEXT: [[A11:%.*]] = getelementptr inbounds ptr, ptr [[A10]], i64 1
+// CHECK: store ptr [[T0]], ptr [[A1]], align
+// CHECK-NEXT: [[A11:%.*]] = getelementptr inbounds ptr, ptr [[A1]], i64 1
// CHECK-NEXT: store ptr [[A11]],
// CHECK-NEXT: [[T0:%.*]] = invoke noundef ptr @_Z12test5_helperj(i32 noundef 3)
// CHECK: store ptr [[T0]], ptr [[A11]], align
diff --git a/clang/test/Driver/aarch64-mac-cpus.c b/clang/test/Driver/aarch64-mac-cpus.c
index 5179731..488298c 100644
--- a/clang/test/Driver/aarch64-mac-cpus.c
+++ b/clang/test/Driver/aarch64-mac-cpus.c
@@ -16,7 +16,7 @@
// RUN: %clang --target=arm64-apple-macos -mcpu=apple-m1 -### -c %s 2>&1 | FileCheck --check-prefix=EXPLICIT-M1 %s
// CHECK: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "apple-m1"
-// CHECK-SAME: "-target-feature" "+v8.5a"
+// CHECK-SAME: "-target-feature" "+v8.4a"
// EXPLICIT-A11: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "apple-a11"
// EXPLICIT-A7: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "apple-a7"
diff --git a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
index 567d121..6a15cae 100644
--- a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
@@ -493,9 +493,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -798,9 +797,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1158,9 +1156,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1461,9 +1458,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
index 5d1a12d..510c5d4 100644
--- a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
@@ -478,9 +478,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -801,9 +800,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1180,9 +1178,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1501,9 +1498,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
index faae599..11b18e8 100644
--- a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
@@ -776,9 +776,8 @@ int main() {
// CHECK8-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK8-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK8-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK8-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK8-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK8-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK8-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK8-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK8-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK8-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1211,9 +1210,8 @@ int main() {
// CHECK8-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK8-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK8-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK8-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK8-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK8-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK8-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK8-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK8-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK8-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1695,9 +1693,8 @@ int main() {
// CHECK10-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK10-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK10-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK10-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK10-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK10-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK10-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK10-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK10-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK10-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2124,9 +2121,8 @@ int main() {
// CHECK10-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK10-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK10-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK10-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK10-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK10-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK10-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK10-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK10-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK10-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
index 18a86c5..29e1a35 100644
--- a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
@@ -752,9 +752,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1223,9 +1222,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1744,9 +1742,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2209,9 +2206,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
index 0969a0c..03c62ce 100644
--- a/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
@@ -514,9 +514,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -835,9 +834,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1225,9 +1223,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -1540,9 +1537,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
index 545ea9f..cb2cbb0 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -839,9 +839,8 @@ int main() {
// CHECK8-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK8-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK8-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK8-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK8-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK8-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK8-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK8-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK8-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK8-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1288,9 +1287,8 @@ int main() {
// CHECK8-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK8-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK8-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK8-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK8-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK8-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK8-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK8-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK8-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK8-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK8-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1786,9 +1784,8 @@ int main() {
// CHECK10-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK10-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK10-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK10-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK10-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK10-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK10-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK10-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK10-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK10-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2229,9 +2226,8 @@ int main() {
// CHECK10-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK10-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK10-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK10-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK10-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK10-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK10-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK10-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK10-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK10-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2721,9 +2717,8 @@ int main() {
// CHECK12-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK12-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK12-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK12-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK12-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK12-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK12-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK12-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK12-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK12-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2828,9 +2823,8 @@ int main() {
// CHECK12-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK12-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK12-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK12-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK12-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK12-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK12-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK12-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK12-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK12-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -3026,9 +3020,8 @@ int main() {
// CHECK14-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK14-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK14-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK14-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK14-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK14-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK14-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK14-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK14-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK14-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK14-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -3131,9 +3124,8 @@ int main() {
// CHECK14-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK14-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK14-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK14-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK14-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK14-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK14-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK14-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK14-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK14-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK14-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
index 675f131..c8c230b 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -819,9 +819,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1304,9 +1303,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1839,9 +1837,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2318,9 +2315,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2853,9 +2849,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -3021,9 +3016,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -3260,9 +3254,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -3426,9 +3419,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp
index a74af2d..f884bab 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp
@@ -568,9 +568,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -903,9 +902,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1307,9 +1305,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -1636,9 +1633,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -2043,9 +2039,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -2182,9 +2177,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -2393,9 +2387,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -2530,9 +2523,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/distribute_private_codegen.cpp b/clang/test/OpenMP/distribute_private_codegen.cpp
index b6e796f..7544d42 100644
--- a/clang/test/OpenMP/distribute_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_private_codegen.cpp
@@ -344,9 +344,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -661,9 +660,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -950,9 +948,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -1265,9 +1262,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
index 1b5950c..35284a5 100644
--- a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
@@ -546,9 +546,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -858,9 +857,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1225,9 +1223,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1535,9 +1532,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1900,9 +1896,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2007,9 +2002,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2205,9 +2199,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2310,9 +2303,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
index 92b73b2..33c488e 100644
--- a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
@@ -533,9 +533,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -863,9 +862,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1249,9 +1247,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1577,9 +1574,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1967,9 +1963,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2135,9 +2130,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2374,9 +2368,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2540,9 +2533,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/distribute_simd_private_codegen.cpp b/clang/test/OpenMP/distribute_simd_private_codegen.cpp
index 93b2bd8..534903e0 100644
--- a/clang/test/OpenMP/distribute_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_private_codegen.cpp
@@ -389,9 +389,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -735,9 +734,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1035,9 +1033,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -1379,9 +1376,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -1685,9 +1681,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1849,9 +1844,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -2066,9 +2060,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -2228,9 +2221,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/error_unsupport_feature.c b/clang/test/OpenMP/error_unsupport_feature.c
new file mode 100644
index 0000000..eb381b3
--- /dev/null
+++ b/clang/test/OpenMP/error_unsupport_feature.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -emit-llvm-only -verify -fopenmp %s
+
+int main () {
+ int r = 0;
+#pragma omp scope reduction(+:r) // expected-error {{cannot compile this scope with FE outlining yet}}
+ r++;
+ return r;
+}
diff --git a/clang/test/OpenMP/for_firstprivate_codegen.cpp b/clang/test/OpenMP/for_firstprivate_codegen.cpp
index 79f76bd..6f51b82 100644
--- a/clang/test/OpenMP/for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/for_firstprivate_codegen.cpp
@@ -423,9 +423,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
diff --git a/clang/test/OpenMP/for_lastprivate_codegen.cpp b/clang/test/OpenMP/for_lastprivate_codegen.cpp
index c7ef60a..f89969d 100644
--- a/clang/test/OpenMP/for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/for_lastprivate_codegen.cpp
@@ -374,9 +374,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 5, ptr @main.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]], ptr @_ZZ4mainE5sivar)
@@ -848,9 +847,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN3SSTIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[SST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 128
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 128
@@ -2798,9 +2796,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 5, ptr @main.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]], ptr @_ZZ4mainE5sivar)
@@ -3290,9 +3287,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN3SSTIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[SST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 128
// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 128
diff --git a/clang/test/OpenMP/for_private_codegen.cpp b/clang/test/OpenMP/for_private_codegen.cpp
index 463e530..d140f0f 100644
--- a/clang/test/OpenMP/for_private_codegen.cpp
+++ b/clang/test/OpenMP/for_private_codegen.cpp
@@ -121,9 +121,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @main.omp_outlined)
@@ -360,9 +359,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 0, ptr @_Z5tmainIiET_v.omp_outlined)
diff --git a/clang/test/OpenMP/for_reduction_codegen.cpp b/clang/test/OpenMP/for_reduction_codegen.cpp
index d025c6d..498845d 100644
--- a/clang/test/OpenMP/for_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen.cpp
@@ -556,13 +556,12 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store float 0.000000e+00, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_ELEMENT]], i64 1
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 2
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT1]], float noundef 3.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_ELEMENT1]], i64 1
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 3
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT2]], float noundef 4.000000e+00)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR1]])
@@ -3377,9 +3376,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiLi42EET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR1]])
diff --git a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
index 999e062..f0ab531e 100644
--- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
@@ -622,13 +622,12 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(12) [[TEST]])
// CHECK1-NEXT: store float 0.000000e+00, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT1:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_ELEMENT]], i64 1
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT1:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 2
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_ELEMENT1]], float noundef 3.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_ELEMENT1]], i64 1
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 3
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_ELEMENT2]], float noundef 4.000000e+00)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(12) [[VAR1]])
@@ -2421,9 +2420,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(12) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiLi42EET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(12) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(12) [[VAR1]])
@@ -3318,13 +3316,12 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(12) [[TEST]])
// CHECK3-NEXT: store float 0.000000e+00, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT1:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_ELEMENT]], i64 1
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT1:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 2
// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_ELEMENT1]], float noundef 3.000000e+00)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_ELEMENT1]], i64 1
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 3
// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_ELEMENT2]], float noundef 4.000000e+00)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(12) [[VAR1]])
@@ -3584,9 +3581,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(12) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiLi42EET_v.vec, i64 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(12) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(12) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(12) [[VAR1]])
diff --git a/clang/test/OpenMP/parallel_copyin_codegen.cpp b/clang/test/OpenMP/parallel_copyin_codegen.cpp
index 57a563d..e653a77 100644
--- a/clang/test/OpenMP/parallel_copyin_codegen.cpp
+++ b/clang/test/OpenMP/parallel_copyin_codegen.cpp
@@ -297,9 +297,8 @@ void foo() {
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[TMP1]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP1]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK1-NEXT: ret ptr [[TMP2]]
@@ -574,9 +573,8 @@ void foo() {
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[TMP1]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[TMP1]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK1-NEXT: ret ptr [[TMP2]]
diff --git a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp
index e5374c3..1cfba8c 100644
--- a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp
@@ -258,9 +258,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[S_ARR]], float 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[VAR]], float 3.000000e+00)
// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 4
@@ -473,9 +472,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
// CHECK1-NEXT: call void @_ZN3SSTIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[SST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[S_ARR]], i32 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[VAR]], i32 3)
// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128
@@ -1440,9 +1438,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[S_ARR]], float 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float 2.000000e+00)
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[VAR]], float 3.000000e+00)
// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 4
@@ -1655,9 +1652,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
// CHECK9-NEXT: call void @_ZN3SSTIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[SST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[S_ARR]], i32 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 2)
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[VAR]], i32 3)
// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128
diff --git a/clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp
index fa63185..bfb31c3 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_firstprivate_codegen.cpp
@@ -206,9 +206,8 @@ void array_func(int n, float a[n], St s[2]) {
// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(ptr noundef nonnull align 8 dereferenceable(8) [[TEST]], ptr noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00)
// CHECK-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00)
-// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[S_ARR]], double noundef 1.000000e+00)
+// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00)
// CHECK-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00)
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 4
@@ -569,9 +568,8 @@ void array_func(int n, float a[n], St s[2]) {
// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]], ptr noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0)
// CHECK-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128
@@ -1519,9 +1517,8 @@ void array_func(int n, float a[n], St s[2]) {
// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1ERKS0_d(ptr noundef nonnull align 8 dereferenceable(8) [[TEST]], ptr noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00)
// SIMD-ONLY0-NEXT: store i32 0, ptr [[T_VAR]], align 4
// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// SIMD-ONLY0-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00)
-// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[S_ARR]], double noundef 1.000000e+00)
+// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00)
// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00)
// SIMD-ONLY0-NEXT: store i32 0, ptr [[I]], align 4
@@ -1617,9 +1614,8 @@ void array_func(int n, float a[n], St s[2]) {
// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1ERKS0_i(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]], ptr noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0)
// SIMD-ONLY0-NEXT: store i32 0, ptr [[T_VAR]], align 128
// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// SIMD-ONLY0-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// SIMD-ONLY0-NEXT: store i32 0, ptr [[I]], align 4
@@ -1842,9 +1838,8 @@ void array_func(int n, float a[n], St s[2]) {
// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1ERKS0_d(ptr noundef nonnull align 8 dereferenceable(8) [[TEST]], ptr noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00)
// SIMD-ONLY1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// SIMD-ONLY1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00)
-// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[S_ARR]], double noundef 1.000000e+00)
+// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00)
// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00)
// SIMD-ONLY1-NEXT: store i32 0, ptr [[I]], align 4
@@ -1940,9 +1935,8 @@ void array_func(int n, float a[n], St s[2]) {
// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1ERKS0_i(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]], ptr noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0)
// SIMD-ONLY1-NEXT: store i32 0, ptr [[T_VAR]], align 128
// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// SIMD-ONLY1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// SIMD-ONLY1-NEXT: store i32 0, ptr [[I]], align 4
diff --git a/clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp
index 409a983..f86d5377 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp
@@ -212,9 +212,8 @@ void loop() {
// CHECK1-NEXT: call void @_ZN1SIdEC1Ev(ptr noundef nonnull align 8 dereferenceable(8) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[S_ARR]], double noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 4, ptr @main.omp_outlined, ptr [[VEC]], ptr [[T_VAR]], ptr [[S_ARR]], ptr [[VAR]])
@@ -580,9 +579,8 @@ void loop() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 4, ptr @_Z5tmainIiET_v.omp_outlined, ptr [[VEC]], ptr [[T_VAR]], ptr [[S_ARR]], ptr [[VAR]])
diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp
index 46c3583..68a7257 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_simd_firstprivate_codegen.cpp
@@ -206,9 +206,8 @@ void array_func(int n, float a[n], St s[2]) {
// CHECK-NEXT: call void @_ZN1SIdEC1ERKS0_d(ptr noundef nonnull align 8 dereferenceable(8) [[TEST]], ptr noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00)
// CHECK-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00)
-// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[S_ARR]], double noundef 1.000000e+00)
+// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00)
// CHECK-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00)
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 4
@@ -569,9 +568,8 @@ void array_func(int n, float a[n], St s[2]) {
// CHECK-NEXT: call void @_ZN1SIiEC1ERKS0_i(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]], ptr noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0)
// CHECK-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128
@@ -1523,9 +1521,8 @@ void array_func(int n, float a[n], St s[2]) {
// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1ERKS0_d(ptr noundef nonnull align 8 dereferenceable(8) [[TEST]], ptr noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00)
// SIMD-ONLY0-NEXT: store i32 0, ptr [[T_VAR]], align 4
// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// SIMD-ONLY0-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00)
-// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[S_ARR]], double noundef 1.000000e+00)
+// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00)
// SIMD-ONLY0-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00)
// SIMD-ONLY0-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
@@ -1638,9 +1635,8 @@ void array_func(int n, float a[n], St s[2]) {
// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1ERKS0_i(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]], ptr noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0)
// SIMD-ONLY0-NEXT: store i32 0, ptr [[T_VAR]], align 128
// SIMD-ONLY0-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// SIMD-ONLY0-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// SIMD-ONLY0-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// SIMD-ONLY0-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// SIMD-ONLY0-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
@@ -1880,9 +1876,8 @@ void array_func(int n, float a[n], St s[2]) {
// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1ERKS0_d(ptr noundef nonnull align 8 dereferenceable(8) [[TEST]], ptr noundef nonnull align 8 dereferenceable(8) [[TTT]], double noundef 0.000000e+00)
// SIMD-ONLY1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// SIMD-ONLY1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00)
-// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[S_ARR]], double noundef 1.000000e+00)
+// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00)
// SIMD-ONLY1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00)
// SIMD-ONLY1-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
@@ -1995,9 +1990,8 @@ void array_func(int n, float a[n], St s[2]) {
// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1ERKS0_i(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]], ptr noundef nonnull align 4 dereferenceable(4) [[TTT]], i32 noundef 0)
// SIMD-ONLY1-NEXT: store i32 0, ptr [[T_VAR]], align 128
// SIMD-ONLY1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// SIMD-ONLY1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// SIMD-ONLY1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// SIMD-ONLY1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// SIMD-ONLY1-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp
index 5e2c192..8495cfa 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp
@@ -212,9 +212,8 @@ void loop() {
// CHECK1-NEXT: call void @_ZN1SIdEC1Ev(ptr noundef nonnull align 8 dereferenceable(8) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[S_ARR]], double noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 4, ptr @main.omp_outlined, ptr [[VEC]], ptr [[T_VAR]], ptr [[S_ARR]], ptr [[VAR]])
@@ -580,9 +579,8 @@ void loop() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 4, ptr @_Z5tmainIiET_v.omp_outlined, ptr [[VEC]], ptr [[T_VAR]], ptr [[S_ARR]], ptr [[VAR]])
@@ -1755,9 +1753,8 @@ void loop() {
// CHECK7-NEXT: call void @_ZN1SIdEC1Ev(ptr noundef nonnull align 8 dereferenceable(8) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK7-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_BEGIN]], double noundef 1.000000e+00)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK7-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[S_ARR]], double noundef 1.000000e+00)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK7-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT]], double noundef 2.000000e+00)
// CHECK7-NEXT: call void @_ZN1SIdEC1Ed(ptr noundef nonnull align 8 dereferenceable(8) [[VAR]], double noundef 3.000000e+00)
// CHECK7-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
@@ -1911,9 +1908,8 @@ void loop() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK7-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
diff --git a/clang/test/OpenMP/parallel_private_codegen.cpp b/clang/test/OpenMP/parallel_private_codegen.cpp
index 586399b..b86f4e4 100644
--- a/clang/test/OpenMP/parallel_private_codegen.cpp
+++ b/clang/test/OpenMP/parallel_private_codegen.cpp
@@ -182,9 +182,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 0, ptr @main.omp_outlined)
@@ -309,9 +308,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN3SSTIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[SST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 0, ptr @_Z5tmainIiET_v.omp_outlined)
diff --git a/clang/test/OpenMP/parallel_reduction_codegen.cpp b/clang/test/OpenMP/parallel_reduction_codegen.cpp
index 88a10a1..d27ef491 100644
--- a/clang/test/OpenMP/parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/parallel_reduction_codegen.cpp
@@ -511,9 +511,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store float 0.000000e+00, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR1]])
@@ -997,9 +996,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN3SSTIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[SST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR1]])
diff --git a/clang/test/OpenMP/sections_firstprivate_codegen.cpp b/clang/test/OpenMP/sections_firstprivate_codegen.cpp
index af67c42..3cf6cc2 100644
--- a/clang/test/OpenMP/sections_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/sections_firstprivate_codegen.cpp
@@ -402,9 +402,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 4, ptr @_Z5tmainIiET_v.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]])
diff --git a/clang/test/OpenMP/sections_lastprivate_codegen.cpp b/clang/test/OpenMP/sections_lastprivate_codegen.cpp
index fb7f290..126a3ba 100644
--- a/clang/test/OpenMP/sections_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/sections_lastprivate_codegen.cpp
@@ -201,9 +201,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 5, ptr @main.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]], ptr @_ZZ4mainE5sivar)
@@ -465,9 +464,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @_Z5tmainIiET_v.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]])
@@ -879,9 +877,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 5, ptr @main.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]], ptr @_ZZ4mainE5sivar)
@@ -1161,9 +1158,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @_Z5tmainIiET_v.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]])
diff --git a/clang/test/OpenMP/sections_private_codegen.cpp b/clang/test/OpenMP/sections_private_codegen.cpp
index 034a8b3..ee29fd9 100644
--- a/clang/test/OpenMP/sections_private_codegen.cpp
+++ b/clang/test/OpenMP/sections_private_codegen.cpp
@@ -117,9 +117,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @main.omp_outlined)
@@ -274,9 +273,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 0, ptr @_Z5tmainIiET_v.omp_outlined)
diff --git a/clang/test/OpenMP/sections_reduction_codegen.cpp b/clang/test/OpenMP/sections_reduction_codegen.cpp
index 6166b5b..3eb7a1f 100644
--- a/clang/test/OpenMP/sections_reduction_codegen.cpp
+++ b/clang/test/OpenMP/sections_reduction_codegen.cpp
@@ -196,9 +196,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store float 0.000000e+00, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR1]])
@@ -546,9 +545,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR1]])
diff --git a/clang/test/OpenMP/simd_private_taskloop_codegen.cpp b/clang/test/OpenMP/simd_private_taskloop_codegen.cpp
index 50726b2..7e81f6f 100644
--- a/clang/test/OpenMP/simd_private_taskloop_codegen.cpp
+++ b/clang/test/OpenMP/simd_private_taskloop_codegen.cpp
@@ -470,9 +470,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]])
@@ -850,9 +849,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.2], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_2]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_2]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]])
@@ -1209,9 +1207,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]])
@@ -1587,9 +1584,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.2], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_2]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_2]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]])
@@ -1955,9 +1951,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -2123,9 +2118,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -2342,9 +2336,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -2508,9 +2501,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/single_firstprivate_codegen.cpp b/clang/test/OpenMP/single_firstprivate_codegen.cpp
index f7156bf..4b4c966 100644
--- a/clang/test/OpenMP/single_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/single_firstprivate_codegen.cpp
@@ -357,9 +357,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[S_ARR]], i32 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[VAR]], i32 3)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 4, ptr @_Z5tmainIiET_v.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]])
diff --git a/clang/test/OpenMP/single_private_codegen.cpp b/clang/test/OpenMP/single_private_codegen.cpp
index e657d7b..ce79cfd 100644
--- a/clang/test/OpenMP/single_private_codegen.cpp
+++ b/clang/test/OpenMP/single_private_codegen.cpp
@@ -104,9 +104,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 0, ptr @main.omp_outlined)
@@ -226,9 +225,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 0, ptr @_Z5tmainIiET_v.omp_outlined)
diff --git a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
index da69e19..ab0a35f 100644
--- a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
@@ -546,9 +546,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1346,9 +1345,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
index e23435d..adbab5d 100644
--- a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
@@ -475,9 +475,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -799,9 +798,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1176,9 +1174,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1498,9 +1495,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
index bf1fc1d..0806a71 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -737,9 +737,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1793,9 +1792,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
index ae900d2..6f14f51 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -736,9 +736,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1214,9 +1213,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1736,9 +1734,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2208,9 +2205,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
index 49e4681..05ab657 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
@@ -564,9 +564,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1291,9 +1290,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
index ea05aec..fbf0b84 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -749,9 +749,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1833,9 +1832,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2856,9 +2854,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK7-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -3177,9 +3174,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
index b2b9e20..cb8e091 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -764,9 +764,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1256,9 +1255,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1792,9 +1790,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2278,9 +2275,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2848,9 +2844,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -3012,9 +3007,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -3247,9 +3241,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -3409,9 +3402,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
index 0cc07c8..c7b919c 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
@@ -578,9 +578,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1333,9 +1332,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -2178,9 +2176,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK7-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -2554,9 +2551,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp
index 3cb2d2c..b718173 100644
--- a/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp
@@ -402,9 +402,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -928,9 +927,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
index d22aeee..3e1cfb0 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
@@ -553,9 +553,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1367,9 +1366,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1911,9 +1909,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2231,9 +2228,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
index 93c570a..fc17caa 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
@@ -525,9 +525,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -856,9 +855,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1240,9 +1238,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1569,9 +1566,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1955,9 +1951,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2118,9 +2113,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2352,9 +2346,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2513,9 +2506,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp
index 4e9b4e1..c1f3530 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp
@@ -409,9 +409,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -949,9 +948,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -1377,9 +1375,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1753,9 +1750,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
index 9b3d77f..5efad5b 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
@@ -463,9 +463,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -989,9 +988,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
index 8b3e657..2ebedb8 100644
--- a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
@@ -549,9 +549,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1351,9 +1350,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
index 66c952f..96ba444 100644
--- a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
@@ -469,9 +469,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -789,9 +788,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1165,9 +1163,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1483,9 +1480,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
index 0726fb6..8c411de 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -713,9 +713,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1771,9 +1770,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
index 06fc87d..3e4afe9 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -717,9 +717,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1185,9 +1184,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1703,9 +1701,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2165,9 +2162,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp
index 4d10d16..c9535d1 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp
@@ -526,9 +526,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1253,9 +1252,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
index 7d9e2ab..7c13f55 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -730,9 +730,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1816,9 +1815,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2496,9 +2494,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2819,9 +2816,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
index 8643cb9..77a5128 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -790,9 +790,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1272,9 +1271,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1804,9 +1802,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2280,9 +2277,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2811,9 +2807,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2977,9 +2972,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -3214,9 +3208,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -3378,9 +3371,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp
index f40acb4..4d0c7f0 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp
@@ -542,9 +542,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1297,9 +1296,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -1832,9 +1830,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -2208,9 +2205,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_private_codegen.cpp
index 78b42e3..1f49d93 100644
--- a/clang/test/OpenMP/teams_distribute_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_private_codegen.cpp
@@ -405,9 +405,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -931,9 +930,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
index 2d3fccd..9e6825d 100644
--- a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
@@ -556,9 +556,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1372,9 +1371,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1919,9 +1917,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2241,9 +2238,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
index ec95ab5..3f01eab 100644
--- a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
@@ -519,9 +519,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -846,9 +845,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -1229,9 +1227,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1554,9 +1551,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -1940,9 +1936,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2105,9 +2100,8 @@ int main() {
// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK13-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK13-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK13-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK13-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK13-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8
@@ -2341,9 +2335,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
@@ -2504,9 +2497,8 @@ int main() {
// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK15-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK15-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK15-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK15-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK15-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4
diff --git a/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp
index c839268..4c87b2d 100644
--- a/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp
@@ -413,9 +413,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -953,9 +952,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
@@ -1381,9 +1379,8 @@ int main() {
// CHECK5-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK5-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK5-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK5-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK5-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK5-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK5-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK5-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -1757,9 +1754,8 @@ int main() {
// CHECK7-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK7-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK7-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK7-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK7-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK7-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK7-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK7-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/teams_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_firstprivate_codegen.cpp
index 649fae9..304bd5d 100644
--- a/clang/test/OpenMP/teams_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_firstprivate_codegen.cpp
@@ -283,9 +283,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[S_ARR]], float 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float 2.000000e+00)
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[VAR]], float 3.000000e+00)
// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 4
@@ -628,9 +627,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
// CHECK9-NEXT: call void @_ZN1SIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[S_ARR]], i32 signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 signext 2)
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[VAR]], i32 signext 3)
// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128
@@ -1077,9 +1075,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[S_ARR]], float 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float 2.000000e+00)
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr nonnull align 4 dereferenceable(4) [[VAR]], float 3.000000e+00)
// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 4
@@ -1422,9 +1419,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
// CHECK11-NEXT: call void @_ZN1SIiEC1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[S_ARR]], i32 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 2)
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr nonnull align 4 dereferenceable(4) [[VAR]], i32 3)
// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 128
diff --git a/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
index e955db1..f44acf2 100644
--- a/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
@@ -425,9 +425,8 @@ int main() {
// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
@@ -951,9 +950,8 @@ int main() {
// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
diff --git a/clang/test/OpenMP/teams_private_codegen.cpp b/clang/test/OpenMP/teams_private_codegen.cpp
index 1d0b243..81c0ea7 100644
--- a/clang/test/OpenMP/teams_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_private_codegen.cpp
@@ -551,9 +551,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
@@ -718,9 +717,8 @@ int main() {
// CHECK9-NEXT: call void @_ZN3SSTIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[SST]])
// CHECK9-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
-// CHECK9-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
-// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
-// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef signext 1)
+// CHECK9-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
// CHECK9-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef signext 3)
// CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
@@ -1141,9 +1139,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 4
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const.main.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], float noundef 1.000000e+00)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], float noundef 1.000000e+00)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
// CHECK11-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
// CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
@@ -1308,9 +1305,8 @@ int main() {
// CHECK11-NEXT: call void @_ZN3SSTIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[SST]])
// CHECK11-NEXT: store i32 0, ptr [[T_VAR]], align 128
// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC]], ptr align 128 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
-// CHECK11-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
-// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
-// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[S_ARR]], i32 noundef 1)
+// CHECK11-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
// CHECK11-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
// CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
diff --git a/clang/test/OpenMP/threadprivate_codegen.cpp b/clang/test/OpenMP/threadprivate_codegen.cpp
index 5087451..2ee3280 100644
--- a/clang/test/OpenMP/threadprivate_codegen.cpp
+++ b/clang/test/OpenMP/threadprivate_codegen.cpp
@@ -1028,46 +1028,43 @@ int foobar() {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[ARRAYINIT_ENDOFINIT:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: [[ARRAYINIT_ENDOFINIT2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[ARRAYINIT_ENDOFINIT1:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[EXN_SLOT:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[EHSELECTOR_SLOT:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[ARRAYINIT_ENDOFINIT9:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[ARRAYINIT_ENDOFINIT7:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], ptr [[TMP1]], i64 0, i64 0
-// CHECK1-NEXT: store ptr [[ARRAYINIT_BEGIN]], ptr [[ARRAYINIT_ENDOFINIT]], align 8
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN1:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 0, i64 0
-// CHECK1-NEXT: store ptr [[ARRAYINIT_BEGIN1]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8
-// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN1]], i32 noundef 1)
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[ARRAYINIT_ENDOFINIT]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8
+// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]], i32 noundef 1)
// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
// CHECK1: invoke.cont:
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[ARRAYINIT_BEGIN1]], i64 1
-// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP1]], i64 1
+// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8
// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
-// CHECK1-NEXT: to label [[INVOKE_CONT3:%.*]] unwind label [[LPAD]]
-// CHECK1: invoke.cont3:
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT]], i64 1
-// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT4]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8
-// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT4]], i32 noundef 3)
-// CHECK1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]]
-// CHECK1: invoke.cont5:
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT7:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 1
-// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT7]], ptr [[ARRAYINIT_ENDOFINIT]], align 8
-// CHECK1-NEXT: [[ARRAYINIT_BEGIN8:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_ELEMENT7]], i64 0, i64 0
-// CHECK1-NEXT: store ptr [[ARRAYINIT_BEGIN8]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8
-// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN8]], i32 noundef 4)
-// CHECK1-NEXT: to label [[INVOKE_CONT11:%.*]] unwind label [[LPAD10:%.*]]
+// CHECK1-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD]]
+// CHECK1: invoke.cont2:
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT3:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[TMP1]], i64 2
+// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT3]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8
+// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT3]], i32 noundef 3)
+// CHECK1-NEXT: to label [[INVOKE_CONT4:%.*]] unwind label [[LPAD]]
+// CHECK1: invoke.cont4:
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT6:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP1]], i64 1
+// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT6]], ptr [[ARRAYINIT_ENDOFINIT]], align 8
+// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT6]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8
+// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT6]], i32 noundef 4)
+// CHECK1-NEXT: to label [[INVOKE_CONT9:%.*]] unwind label [[LPAD8:%.*]]
+// CHECK1: invoke.cont9:
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT10:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT6]], i64 1
+// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT10]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8
+// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT10]], i32 noundef 5)
+// CHECK1-NEXT: to label [[INVOKE_CONT11:%.*]] unwind label [[LPAD8]]
// CHECK1: invoke.cont11:
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT12:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_BEGIN8]], i64 1
-// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT12]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8
-// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT12]], i32 noundef 5)
-// CHECK1-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[LPAD10]]
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT12:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT6]], i64 2
+// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT12]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8
+// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT12]], i32 noundef 6)
+// CHECK1-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[LPAD8]]
// CHECK1: invoke.cont13:
-// CHECK1-NEXT: [[ARRAYINIT_ELEMENT14:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT12]], i64 1
-// CHECK1-NEXT: store ptr [[ARRAYINIT_ELEMENT14]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8
-// CHECK1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT14]], i32 noundef 6)
-// CHECK1-NEXT: to label [[INVOKE_CONT15:%.*]] unwind label [[LPAD10]]
-// CHECK1: invoke.cont15:
// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK1-NEXT: ret ptr [[TMP2]]
// CHECK1: lpad:
@@ -1077,55 +1074,55 @@ int foobar() {
// CHECK1-NEXT: store ptr [[TMP4]], ptr [[EXN_SLOT]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 1
// CHECK1-NEXT: store i32 [[TMP5]], ptr [[EHSELECTOR_SLOT]], align 4
-// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT2]], align 8
-// CHECK1-NEXT: [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAYINIT_BEGIN1]], [[TMP6]]
-// CHECK1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT1]], align 8
+// CHECK1-NEXT: [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq ptr [[TMP1]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY:%.*]]
// CHECK1: arraydestroy.body:
// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP6]], [[LPAD]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
// CHECK1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]]
-// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAYINIT_BEGIN1]]
-// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6]], label [[ARRAYDESTROY_BODY]]
-// CHECK1: arraydestroy.done6:
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[TMP1]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done5:
// CHECK1-NEXT: br label [[EHCLEANUP:%.*]]
-// CHECK1: lpad10:
+// CHECK1: lpad8:
// CHECK1-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 }
// CHECK1-NEXT: cleanup
// CHECK1-NEXT: [[TMP8:%.*]] = extractvalue { ptr, i32 } [[TMP7]], 0
// CHECK1-NEXT: store ptr [[TMP8]], ptr [[EXN_SLOT]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = extractvalue { ptr, i32 } [[TMP7]], 1
// CHECK1-NEXT: store i32 [[TMP9]], ptr [[EHSELECTOR_SLOT]], align 4
-// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT9]], align 8
-// CHECK1-NEXT: [[ARRAYDESTROY_ISEMPTY16:%.*]] = icmp eq ptr [[ARRAYINIT_BEGIN8]], [[TMP10]]
-// CHECK1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY16]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17:%.*]]
-// CHECK1: arraydestroy.body17:
-// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ [[TMP10]], [[LPAD10]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ]
-// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1
-// CHECK1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR3]]
-// CHECK1-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAYINIT_BEGIN8]]
-// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21]], label [[ARRAYDESTROY_BODY17]]
-// CHECK1: arraydestroy.done21:
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT7]], align 8
+// CHECK1-NEXT: [[ARRAYDESTROY_ISEMPTY14:%.*]] = icmp eq ptr [[ARRAYINIT_ELEMENT6]], [[TMP10]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY14]], label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15:%.*]]
+// CHECK1: arraydestroy.body15:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP10]], [[LPAD8]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT17]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i64 -1
+// CHECK1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR3]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAYINIT_ELEMENT6]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE18]], label [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_BODY15]]
+// CHECK1: arraydestroy.done19:
// CHECK1-NEXT: br label [[EHCLEANUP]]
// CHECK1: ehcleanup:
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT]], align 8
-// CHECK1-NEXT: [[PAD_ARRAYBEGIN:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 0, i64 0
+// CHECK1-NEXT: [[PAD_ARRAYBEGIN:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP1]], i64 0, i64 0
// CHECK1-NEXT: [[PAD_ARRAYEND:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP11]], i64 0, i64 0
-// CHECK1-NEXT: [[ARRAYDESTROY_ISEMPTY22:%.*]] = icmp eq ptr [[PAD_ARRAYBEGIN]], [[PAD_ARRAYEND]]
-// CHECK1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY22]], label [[ARRAYDESTROY_DONE27:%.*]], label [[ARRAYDESTROY_BODY23:%.*]]
-// CHECK1: arraydestroy.body23:
-// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST24:%.*]] = phi ptr [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT25:%.*]], [[ARRAYDESTROY_BODY23]] ]
-// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT25]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST24]], i64 -1
-// CHECK1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT25]]) #[[ATTR3]]
-// CHECK1-NEXT: [[ARRAYDESTROY_DONE26:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT25]], [[PAD_ARRAYBEGIN]]
-// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE26]], label [[ARRAYDESTROY_DONE27]], label [[ARRAYDESTROY_BODY23]]
-// CHECK1: arraydestroy.done27:
+// CHECK1-NEXT: [[ARRAYDESTROY_ISEMPTY20:%.*]] = icmp eq ptr [[PAD_ARRAYBEGIN]], [[PAD_ARRAYEND]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY20]], label [[ARRAYDESTROY_DONE25:%.*]], label [[ARRAYDESTROY_BODY21:%.*]]
+// CHECK1: arraydestroy.body21:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST22:%.*]] = phi ptr [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT23:%.*]], [[ARRAYDESTROY_BODY21]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT23]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST22]], i64 -1
+// CHECK1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT23]]) #[[ATTR3]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE24:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT23]], [[PAD_ARRAYBEGIN]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE24]], label [[ARRAYDESTROY_DONE25]], label [[ARRAYDESTROY_BODY21]]
+// CHECK1: arraydestroy.done25:
// CHECK1-NEXT: br label [[EH_RESUME:%.*]]
// CHECK1: eh.resume:
// CHECK1-NEXT: [[EXN:%.*]] = load ptr, ptr [[EXN_SLOT]], align 8
// CHECK1-NEXT: [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4
// CHECK1-NEXT: [[LPAD_VAL:%.*]] = insertvalue { ptr, i32 } poison, ptr [[EXN]], 0
-// CHECK1-NEXT: [[LPAD_VAL28:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1
-// CHECK1-NEXT: resume { ptr, i32 } [[LPAD_VAL28]]
+// CHECK1-NEXT: [[LPAD_VAL26:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1
+// CHECK1-NEXT: resume { ptr, i32 } [[LPAD_VAL26]]
//
//
// CHECK1-LABEL: define {{[^@]+}}@.__kmpc_global_dtor_..2
@@ -1880,46 +1877,43 @@ int foobar() {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[ARRAYINIT_ENDOFINIT:%.*]] = alloca ptr, align 8
-// CHECK2-NEXT: [[ARRAYINIT_ENDOFINIT2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[ARRAYINIT_ENDOFINIT1:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[EXN_SLOT:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[EHSELECTOR_SLOT:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[ARRAYINIT_ENDOFINIT9:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[ARRAYINIT_ENDOFINIT7:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK2-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], ptr [[TMP1]], i64 0, i64 0
-// CHECK2-NEXT: store ptr [[ARRAYINIT_BEGIN]], ptr [[ARRAYINIT_ENDOFINIT]], align 8
-// CHECK2-NEXT: [[ARRAYINIT_BEGIN1:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 0, i64 0
-// CHECK2-NEXT: store ptr [[ARRAYINIT_BEGIN1]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8
-// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN1]], i32 noundef 1)
+// CHECK2-NEXT: store ptr [[TMP1]], ptr [[ARRAYINIT_ENDOFINIT]], align 8
+// CHECK2-NEXT: store ptr [[TMP1]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8
+// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]], i32 noundef 1)
// CHECK2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
// CHECK2: invoke.cont:
-// CHECK2-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[ARRAYINIT_BEGIN1]], i64 1
-// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8
+// CHECK2-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP1]], i64 1
+// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8
// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
-// CHECK2-NEXT: to label [[INVOKE_CONT3:%.*]] unwind label [[LPAD]]
-// CHECK2: invoke.cont3:
-// CHECK2-NEXT: [[ARRAYINIT_ELEMENT4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT]], i64 1
-// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT4]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8
-// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT4]], i32 noundef 3)
-// CHECK2-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]]
-// CHECK2: invoke.cont5:
-// CHECK2-NEXT: [[ARRAYINIT_ELEMENT7:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 1
-// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT7]], ptr [[ARRAYINIT_ENDOFINIT]], align 8
-// CHECK2-NEXT: [[ARRAYINIT_BEGIN8:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_ELEMENT7]], i64 0, i64 0
-// CHECK2-NEXT: store ptr [[ARRAYINIT_BEGIN8]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8
-// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN8]], i32 noundef 4)
-// CHECK2-NEXT: to label [[INVOKE_CONT11:%.*]] unwind label [[LPAD10:%.*]]
+// CHECK2-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD]]
+// CHECK2: invoke.cont2:
+// CHECK2-NEXT: [[ARRAYINIT_ELEMENT3:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[TMP1]], i64 2
+// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT3]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8
+// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT3]], i32 noundef 3)
+// CHECK2-NEXT: to label [[INVOKE_CONT4:%.*]] unwind label [[LPAD]]
+// CHECK2: invoke.cont4:
+// CHECK2-NEXT: [[ARRAYINIT_ELEMENT6:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP1]], i64 1
+// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT6]], ptr [[ARRAYINIT_ENDOFINIT]], align 8
+// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT6]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8
+// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT6]], i32 noundef 4)
+// CHECK2-NEXT: to label [[INVOKE_CONT9:%.*]] unwind label [[LPAD8:%.*]]
+// CHECK2: invoke.cont9:
+// CHECK2-NEXT: [[ARRAYINIT_ELEMENT10:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT6]], i64 1
+// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT10]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8
+// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT10]], i32 noundef 5)
+// CHECK2-NEXT: to label [[INVOKE_CONT11:%.*]] unwind label [[LPAD8]]
// CHECK2: invoke.cont11:
-// CHECK2-NEXT: [[ARRAYINIT_ELEMENT12:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_BEGIN8]], i64 1
-// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT12]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8
-// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT12]], i32 noundef 5)
-// CHECK2-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[LPAD10]]
+// CHECK2-NEXT: [[ARRAYINIT_ELEMENT12:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT6]], i64 2
+// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT12]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8
+// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT12]], i32 noundef 6)
+// CHECK2-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[LPAD8]]
// CHECK2: invoke.cont13:
-// CHECK2-NEXT: [[ARRAYINIT_ELEMENT14:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT12]], i64 1
-// CHECK2-NEXT: store ptr [[ARRAYINIT_ELEMENT14]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8
-// CHECK2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT14]], i32 noundef 6)
-// CHECK2-NEXT: to label [[INVOKE_CONT15:%.*]] unwind label [[LPAD10]]
-// CHECK2: invoke.cont15:
// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK2-NEXT: ret ptr [[TMP2]]
// CHECK2: lpad:
@@ -1929,55 +1923,55 @@ int foobar() {
// CHECK2-NEXT: store ptr [[TMP4]], ptr [[EXN_SLOT]], align 8
// CHECK2-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 1
// CHECK2-NEXT: store i32 [[TMP5]], ptr [[EHSELECTOR_SLOT]], align 4
-// CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT2]], align 8
-// CHECK2-NEXT: [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAYINIT_BEGIN1]], [[TMP6]]
-// CHECK2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT1]], align 8
+// CHECK2-NEXT: [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq ptr [[TMP1]], [[TMP6]]
+// CHECK2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY:%.*]]
// CHECK2: arraydestroy.body:
// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP6]], [[LPAD]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
// CHECK2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]]
-// CHECK2-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAYINIT_BEGIN1]]
-// CHECK2-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6]], label [[ARRAYDESTROY_BODY]]
-// CHECK2: arraydestroy.done6:
+// CHECK2-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[TMP1]]
+// CHECK2-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5]], label [[ARRAYDESTROY_BODY]]
+// CHECK2: arraydestroy.done5:
// CHECK2-NEXT: br label [[EHCLEANUP:%.*]]
-// CHECK2: lpad10:
+// CHECK2: lpad8:
// CHECK2-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 }
// CHECK2-NEXT: cleanup
// CHECK2-NEXT: [[TMP8:%.*]] = extractvalue { ptr, i32 } [[TMP7]], 0
// CHECK2-NEXT: store ptr [[TMP8]], ptr [[EXN_SLOT]], align 8
// CHECK2-NEXT: [[TMP9:%.*]] = extractvalue { ptr, i32 } [[TMP7]], 1
// CHECK2-NEXT: store i32 [[TMP9]], ptr [[EHSELECTOR_SLOT]], align 4
-// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT9]], align 8
-// CHECK2-NEXT: [[ARRAYDESTROY_ISEMPTY16:%.*]] = icmp eq ptr [[ARRAYINIT_BEGIN8]], [[TMP10]]
-// CHECK2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY16]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17:%.*]]
-// CHECK2: arraydestroy.body17:
-// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ [[TMP10]], [[LPAD10]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ]
-// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1
-// CHECK2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR3]]
-// CHECK2-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAYINIT_BEGIN8]]
-// CHECK2-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21]], label [[ARRAYDESTROY_BODY17]]
-// CHECK2: arraydestroy.done21:
+// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT7]], align 8
+// CHECK2-NEXT: [[ARRAYDESTROY_ISEMPTY14:%.*]] = icmp eq ptr [[ARRAYINIT_ELEMENT6]], [[TMP10]]
+// CHECK2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY14]], label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15:%.*]]
+// CHECK2: arraydestroy.body15:
+// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP10]], [[LPAD8]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ]
+// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENT17]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i64 -1
+// CHECK2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR3]]
+// CHECK2-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAYINIT_ELEMENT6]]
+// CHECK2-NEXT: br i1 [[ARRAYDESTROY_DONE18]], label [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_BODY15]]
+// CHECK2: arraydestroy.done19:
// CHECK2-NEXT: br label [[EHCLEANUP]]
// CHECK2: ehcleanup:
// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT]], align 8
-// CHECK2-NEXT: [[PAD_ARRAYBEGIN:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 0, i64 0
+// CHECK2-NEXT: [[PAD_ARRAYBEGIN:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP1]], i64 0, i64 0
// CHECK2-NEXT: [[PAD_ARRAYEND:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP11]], i64 0, i64 0
-// CHECK2-NEXT: [[ARRAYDESTROY_ISEMPTY22:%.*]] = icmp eq ptr [[PAD_ARRAYBEGIN]], [[PAD_ARRAYEND]]
-// CHECK2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY22]], label [[ARRAYDESTROY_DONE27:%.*]], label [[ARRAYDESTROY_BODY23:%.*]]
-// CHECK2: arraydestroy.body23:
-// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENTPAST24:%.*]] = phi ptr [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT25:%.*]], [[ARRAYDESTROY_BODY23]] ]
-// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENT25]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST24]], i64 -1
-// CHECK2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT25]]) #[[ATTR3]]
-// CHECK2-NEXT: [[ARRAYDESTROY_DONE26:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT25]], [[PAD_ARRAYBEGIN]]
-// CHECK2-NEXT: br i1 [[ARRAYDESTROY_DONE26]], label [[ARRAYDESTROY_DONE27]], label [[ARRAYDESTROY_BODY23]]
-// CHECK2: arraydestroy.done27:
+// CHECK2-NEXT: [[ARRAYDESTROY_ISEMPTY20:%.*]] = icmp eq ptr [[PAD_ARRAYBEGIN]], [[PAD_ARRAYEND]]
+// CHECK2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY20]], label [[ARRAYDESTROY_DONE25:%.*]], label [[ARRAYDESTROY_BODY21:%.*]]
+// CHECK2: arraydestroy.body21:
+// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENTPAST22:%.*]] = phi ptr [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT23:%.*]], [[ARRAYDESTROY_BODY21]] ]
+// CHECK2-NEXT: [[ARRAYDESTROY_ELEMENT23]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST22]], i64 -1
+// CHECK2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT23]]) #[[ATTR3]]
+// CHECK2-NEXT: [[ARRAYDESTROY_DONE24:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT23]], [[PAD_ARRAYBEGIN]]
+// CHECK2-NEXT: br i1 [[ARRAYDESTROY_DONE24]], label [[ARRAYDESTROY_DONE25]], label [[ARRAYDESTROY_BODY21]]
+// CHECK2: arraydestroy.done25:
// CHECK2-NEXT: br label [[EH_RESUME:%.*]]
// CHECK2: eh.resume:
// CHECK2-NEXT: [[EXN:%.*]] = load ptr, ptr [[EXN_SLOT]], align 8
// CHECK2-NEXT: [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4
// CHECK2-NEXT: [[LPAD_VAL:%.*]] = insertvalue { ptr, i32 } poison, ptr [[EXN]], 0
-// CHECK2-NEXT: [[LPAD_VAL28:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1
-// CHECK2-NEXT: resume { ptr, i32 } [[LPAD_VAL28]]
+// CHECK2-NEXT: [[LPAD_VAL26:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1
+// CHECK2-NEXT: resume { ptr, i32 } [[LPAD_VAL26]]
//
//
// CHECK2-LABEL: define {{[^@]+}}@.__kmpc_global_dtor_..4
@@ -6505,47 +6499,44 @@ int foobar() {
// DEBUG1-NEXT: entry:
// DEBUG1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// DEBUG1-NEXT: [[ARRAYINIT_ENDOFINIT:%.*]] = alloca ptr, align 8
-// DEBUG1-NEXT: [[ARRAYINIT_ENDOFINIT2:%.*]] = alloca ptr, align 8
+// DEBUG1-NEXT: [[ARRAYINIT_ENDOFINIT1:%.*]] = alloca ptr, align 8
// DEBUG1-NEXT: [[EXN_SLOT:%.*]] = alloca ptr, align 8
// DEBUG1-NEXT: [[EHSELECTOR_SLOT:%.*]] = alloca i32, align 4
-// DEBUG1-NEXT: [[ARRAYINIT_ENDOFINIT9:%.*]] = alloca ptr, align 8
+// DEBUG1-NEXT: [[ARRAYINIT_ENDOFINIT7:%.*]] = alloca ptr, align 8
// DEBUG1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// DEBUG1-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTADDR]], metadata [[META143:![0-9]+]], metadata !DIExpression()), !dbg [[DBG144:![0-9]+]]
// DEBUG1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !dbg [[DBG145:![0-9]+]]
-// DEBUG1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], ptr [[TMP1]], i64 0, i64 0, !dbg [[DBG146:![0-9]+]]
-// DEBUG1-NEXT: store ptr [[ARRAYINIT_BEGIN]], ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG146]]
-// DEBUG1-NEXT: [[ARRAYINIT_BEGIN1:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 0, i64 0, !dbg [[DBG147:![0-9]+]]
-// DEBUG1-NEXT: store ptr [[ARRAYINIT_BEGIN1]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8, !dbg [[DBG147]]
-// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN1]], i32 noundef 1)
+// DEBUG1-NEXT: store ptr [[TMP1]], ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG146:![0-9]+]]
+// DEBUG1-NEXT: store ptr [[TMP1]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG147:![0-9]+]]
+// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]], i32 noundef 1)
// DEBUG1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]], !dbg [[DBG148:![0-9]+]]
// DEBUG1: invoke.cont:
-// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[ARRAYINIT_BEGIN1]], i64 1, !dbg [[DBG147]]
-// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8, !dbg [[DBG147]]
+// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP1]], i64 1, !dbg [[DBG147]]
+// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG147]]
// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
-// DEBUG1-NEXT: to label [[INVOKE_CONT3:%.*]] unwind label [[LPAD]], !dbg [[DBG149:![0-9]+]]
-// DEBUG1: invoke.cont3:
-// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT]], i64 1, !dbg [[DBG147]]
-// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT4]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8, !dbg [[DBG147]]
-// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT4]], i32 noundef 3)
-// DEBUG1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]], !dbg [[DBG150:![0-9]+]]
-// DEBUG1: invoke.cont5:
-// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT7:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 1, !dbg [[DBG146]]
-// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT7]], ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG146]]
-// DEBUG1-NEXT: [[ARRAYINIT_BEGIN8:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_ELEMENT7]], i64 0, i64 0, !dbg [[DBG151:![0-9]+]]
-// DEBUG1-NEXT: store ptr [[ARRAYINIT_BEGIN8]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8, !dbg [[DBG151]]
-// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN8]], i32 noundef 4)
-// DEBUG1-NEXT: to label [[INVOKE_CONT11:%.*]] unwind label [[LPAD10:%.*]], !dbg [[DBG152:![0-9]+]]
+// DEBUG1-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD]], !dbg [[DBG149:![0-9]+]]
+// DEBUG1: invoke.cont2:
+// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT3:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[TMP1]], i64 2, !dbg [[DBG147]]
+// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT3]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG147]]
+// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT3]], i32 noundef 3)
+// DEBUG1-NEXT: to label [[INVOKE_CONT4:%.*]] unwind label [[LPAD]], !dbg [[DBG150:![0-9]+]]
+// DEBUG1: invoke.cont4:
+// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT6:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP1]], i64 1, !dbg [[DBG146]]
+// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT6]], ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG146]]
+// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT6]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8, !dbg [[DBG151:![0-9]+]]
+// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT6]], i32 noundef 4)
+// DEBUG1-NEXT: to label [[INVOKE_CONT9:%.*]] unwind label [[LPAD8:%.*]], !dbg [[DBG152:![0-9]+]]
+// DEBUG1: invoke.cont9:
+// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT10:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT6]], i64 1, !dbg [[DBG151]]
+// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT10]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8, !dbg [[DBG151]]
+// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT10]], i32 noundef 5)
+// DEBUG1-NEXT: to label [[INVOKE_CONT11:%.*]] unwind label [[LPAD8]], !dbg [[DBG153:![0-9]+]]
// DEBUG1: invoke.cont11:
-// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT12:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_BEGIN8]], i64 1, !dbg [[DBG151]]
-// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT12]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8, !dbg [[DBG151]]
-// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT12]], i32 noundef 5)
-// DEBUG1-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[LPAD10]], !dbg [[DBG153:![0-9]+]]
+// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT12:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT6]], i64 2, !dbg [[DBG151]]
+// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT12]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8, !dbg [[DBG151]]
+// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT12]], i32 noundef 6)
+// DEBUG1-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[LPAD8]], !dbg [[DBG154:![0-9]+]]
// DEBUG1: invoke.cont13:
-// DEBUG1-NEXT: [[ARRAYINIT_ELEMENT14:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT12]], i64 1, !dbg [[DBG151]]
-// DEBUG1-NEXT: store ptr [[ARRAYINIT_ELEMENT14]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8, !dbg [[DBG151]]
-// DEBUG1-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT14]], i32 noundef 6)
-// DEBUG1-NEXT: to label [[INVOKE_CONT15:%.*]] unwind label [[LPAD10]], !dbg [[DBG154:![0-9]+]]
-// DEBUG1: invoke.cont15:
// DEBUG1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !dbg [[DBG145]]
// DEBUG1-NEXT: ret ptr [[TMP2]], !dbg [[DBG145]]
// DEBUG1: lpad:
@@ -6555,55 +6546,55 @@ int foobar() {
// DEBUG1-NEXT: store ptr [[TMP4]], ptr [[EXN_SLOT]], align 8, !dbg [[DBG144]]
// DEBUG1-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 1, !dbg [[DBG144]]
// DEBUG1-NEXT: store i32 [[TMP5]], ptr [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG144]]
-// DEBUG1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT2]], align 8, !dbg [[DBG147]]
-// DEBUG1-NEXT: [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAYINIT_BEGIN1]], [[TMP6]], !dbg [[DBG147]]
-// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY:%.*]], !dbg [[DBG147]]
+// DEBUG1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG147]]
+// DEBUG1-NEXT: [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq ptr [[TMP1]], [[TMP6]], !dbg [[DBG147]]
+// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY:%.*]], !dbg [[DBG147]]
// DEBUG1: arraydestroy.body:
// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP6]], [[LPAD]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ], !dbg [[DBG147]]
// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1, !dbg [[DBG147]]
// DEBUG1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]], !dbg [[DBG147]]
-// DEBUG1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAYINIT_BEGIN1]], !dbg [[DBG147]]
-// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6]], label [[ARRAYDESTROY_BODY]], !dbg [[DBG147]]
-// DEBUG1: arraydestroy.done6:
+// DEBUG1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[TMP1]], !dbg [[DBG147]]
+// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5]], label [[ARRAYDESTROY_BODY]], !dbg [[DBG147]]
+// DEBUG1: arraydestroy.done5:
// DEBUG1-NEXT: br label [[EHCLEANUP:%.*]], !dbg [[DBG147]]
-// DEBUG1: lpad10:
+// DEBUG1: lpad8:
// DEBUG1-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 }
// DEBUG1-NEXT: cleanup, !dbg [[DBG144]]
// DEBUG1-NEXT: [[TMP8:%.*]] = extractvalue { ptr, i32 } [[TMP7]], 0, !dbg [[DBG144]]
// DEBUG1-NEXT: store ptr [[TMP8]], ptr [[EXN_SLOT]], align 8, !dbg [[DBG144]]
// DEBUG1-NEXT: [[TMP9:%.*]] = extractvalue { ptr, i32 } [[TMP7]], 1, !dbg [[DBG144]]
// DEBUG1-NEXT: store i32 [[TMP9]], ptr [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG144]]
-// DEBUG1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT9]], align 8, !dbg [[DBG151]]
-// DEBUG1-NEXT: [[ARRAYDESTROY_ISEMPTY16:%.*]] = icmp eq ptr [[ARRAYINIT_BEGIN8]], [[TMP10]], !dbg [[DBG151]]
-// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY16]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17:%.*]], !dbg [[DBG151]]
-// DEBUG1: arraydestroy.body17:
-// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ [[TMP10]], [[LPAD10]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ], !dbg [[DBG151]]
-// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1, !dbg [[DBG151]]
-// DEBUG1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR4]], !dbg [[DBG151]]
-// DEBUG1-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAYINIT_BEGIN8]], !dbg [[DBG151]]
-// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21]], label [[ARRAYDESTROY_BODY17]], !dbg [[DBG151]]
-// DEBUG1: arraydestroy.done21:
+// DEBUG1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT7]], align 8, !dbg [[DBG151]]
+// DEBUG1-NEXT: [[ARRAYDESTROY_ISEMPTY14:%.*]] = icmp eq ptr [[ARRAYINIT_ELEMENT6]], [[TMP10]], !dbg [[DBG151]]
+// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY14]], label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15:%.*]], !dbg [[DBG151]]
+// DEBUG1: arraydestroy.body15:
+// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP10]], [[LPAD8]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ], !dbg [[DBG151]]
+// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENT17]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i64 -1, !dbg [[DBG151]]
+// DEBUG1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR4]], !dbg [[DBG151]]
+// DEBUG1-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAYINIT_ELEMENT6]], !dbg [[DBG151]]
+// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_DONE18]], label [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_BODY15]], !dbg [[DBG151]]
+// DEBUG1: arraydestroy.done19:
// DEBUG1-NEXT: br label [[EHCLEANUP]], !dbg [[DBG151]]
// DEBUG1: ehcleanup:
// DEBUG1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG146]]
-// DEBUG1-NEXT: [[PAD_ARRAYBEGIN:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 0, i64 0, !dbg [[DBG146]]
+// DEBUG1-NEXT: [[PAD_ARRAYBEGIN:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP1]], i64 0, i64 0, !dbg [[DBG146]]
// DEBUG1-NEXT: [[PAD_ARRAYEND:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP11]], i64 0, i64 0, !dbg [[DBG146]]
-// DEBUG1-NEXT: [[ARRAYDESTROY_ISEMPTY22:%.*]] = icmp eq ptr [[PAD_ARRAYBEGIN]], [[PAD_ARRAYEND]], !dbg [[DBG146]]
-// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY22]], label [[ARRAYDESTROY_DONE27:%.*]], label [[ARRAYDESTROY_BODY23:%.*]], !dbg [[DBG146]]
-// DEBUG1: arraydestroy.body23:
-// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENTPAST24:%.*]] = phi ptr [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT25:%.*]], [[ARRAYDESTROY_BODY23]] ], !dbg [[DBG146]]
-// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENT25]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST24]], i64 -1, !dbg [[DBG146]]
-// DEBUG1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT25]]) #[[ATTR4]], !dbg [[DBG146]]
-// DEBUG1-NEXT: [[ARRAYDESTROY_DONE26:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT25]], [[PAD_ARRAYBEGIN]], !dbg [[DBG146]]
-// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_DONE26]], label [[ARRAYDESTROY_DONE27]], label [[ARRAYDESTROY_BODY23]], !dbg [[DBG146]]
-// DEBUG1: arraydestroy.done27:
+// DEBUG1-NEXT: [[ARRAYDESTROY_ISEMPTY20:%.*]] = icmp eq ptr [[PAD_ARRAYBEGIN]], [[PAD_ARRAYEND]], !dbg [[DBG146]]
+// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY20]], label [[ARRAYDESTROY_DONE25:%.*]], label [[ARRAYDESTROY_BODY21:%.*]], !dbg [[DBG146]]
+// DEBUG1: arraydestroy.body21:
+// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENTPAST22:%.*]] = phi ptr [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT23:%.*]], [[ARRAYDESTROY_BODY21]] ], !dbg [[DBG146]]
+// DEBUG1-NEXT: [[ARRAYDESTROY_ELEMENT23]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST22]], i64 -1, !dbg [[DBG146]]
+// DEBUG1-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT23]]) #[[ATTR4]], !dbg [[DBG146]]
+// DEBUG1-NEXT: [[ARRAYDESTROY_DONE24:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT23]], [[PAD_ARRAYBEGIN]], !dbg [[DBG146]]
+// DEBUG1-NEXT: br i1 [[ARRAYDESTROY_DONE24]], label [[ARRAYDESTROY_DONE25]], label [[ARRAYDESTROY_BODY21]], !dbg [[DBG146]]
+// DEBUG1: arraydestroy.done25:
// DEBUG1-NEXT: br label [[EH_RESUME:%.*]], !dbg [[DBG146]]
// DEBUG1: eh.resume:
// DEBUG1-NEXT: [[EXN:%.*]] = load ptr, ptr [[EXN_SLOT]], align 8, !dbg [[DBG146]]
// DEBUG1-NEXT: [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG146]]
// DEBUG1-NEXT: [[LPAD_VAL:%.*]] = insertvalue { ptr, i32 } poison, ptr [[EXN]], 0, !dbg [[DBG146]]
-// DEBUG1-NEXT: [[LPAD_VAL28:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1, !dbg [[DBG146]]
-// DEBUG1-NEXT: resume { ptr, i32 } [[LPAD_VAL28]], !dbg [[DBG146]]
+// DEBUG1-NEXT: [[LPAD_VAL26:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1, !dbg [[DBG146]]
+// DEBUG1-NEXT: resume { ptr, i32 } [[LPAD_VAL26]], !dbg [[DBG146]]
//
//
// DEBUG1-LABEL: define {{[^@]+}}@.__kmpc_global_dtor_..2
@@ -7375,47 +7366,44 @@ int foobar() {
// DEBUG2-NEXT: entry:
// DEBUG2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// DEBUG2-NEXT: [[ARRAYINIT_ENDOFINIT:%.*]] = alloca ptr, align 8
-// DEBUG2-NEXT: [[ARRAYINIT_ENDOFINIT2:%.*]] = alloca ptr, align 8
+// DEBUG2-NEXT: [[ARRAYINIT_ENDOFINIT1:%.*]] = alloca ptr, align 8
// DEBUG2-NEXT: [[EXN_SLOT:%.*]] = alloca ptr, align 8
// DEBUG2-NEXT: [[EHSELECTOR_SLOT:%.*]] = alloca i32, align 4
-// DEBUG2-NEXT: [[ARRAYINIT_ENDOFINIT9:%.*]] = alloca ptr, align 8
+// DEBUG2-NEXT: [[ARRAYINIT_ENDOFINIT7:%.*]] = alloca ptr, align 8
// DEBUG2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// DEBUG2-NEXT: tail call void @llvm.dbg.declare(metadata ptr [[DOTADDR]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG180:![0-9]+]]
// DEBUG2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !dbg [[DBG181:![0-9]+]]
-// DEBUG2-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x [3 x %struct.S1]], ptr [[TMP1]], i64 0, i64 0, !dbg [[DBG182:![0-9]+]]
-// DEBUG2-NEXT: store ptr [[ARRAYINIT_BEGIN]], ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG182]]
-// DEBUG2-NEXT: [[ARRAYINIT_BEGIN1:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 0, i64 0, !dbg [[DBG183:![0-9]+]]
-// DEBUG2-NEXT: store ptr [[ARRAYINIT_BEGIN1]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8, !dbg [[DBG183]]
-// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN1]], i32 noundef 1)
+// DEBUG2-NEXT: store ptr [[TMP1]], ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG182:![0-9]+]]
+// DEBUG2-NEXT: store ptr [[TMP1]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG183:![0-9]+]]
+// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP1]], i32 noundef 1)
// DEBUG2-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]], !dbg [[DBG184:![0-9]+]]
// DEBUG2: invoke.cont:
-// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[ARRAYINIT_BEGIN1]], i64 1, !dbg [[DBG183]]
-// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8, !dbg [[DBG183]]
+// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP1]], i64 1, !dbg [[DBG183]]
+// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG183]]
// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
-// DEBUG2-NEXT: to label [[INVOKE_CONT3:%.*]] unwind label [[LPAD]], !dbg [[DBG185:![0-9]+]]
-// DEBUG2: invoke.cont3:
-// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT]], i64 1, !dbg [[DBG183]]
-// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT4]], ptr [[ARRAYINIT_ENDOFINIT2]], align 8, !dbg [[DBG183]]
-// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT4]], i32 noundef 3)
-// DEBUG2-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]], !dbg [[DBG186:![0-9]+]]
-// DEBUG2: invoke.cont5:
-// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT7:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 1, !dbg [[DBG182]]
-// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT7]], ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG182]]
-// DEBUG2-NEXT: [[ARRAYINIT_BEGIN8:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_ELEMENT7]], i64 0, i64 0, !dbg [[DBG187:![0-9]+]]
-// DEBUG2-NEXT: store ptr [[ARRAYINIT_BEGIN8]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8, !dbg [[DBG187]]
-// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN8]], i32 noundef 4)
-// DEBUG2-NEXT: to label [[INVOKE_CONT11:%.*]] unwind label [[LPAD10:%.*]], !dbg [[DBG188:![0-9]+]]
+// DEBUG2-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD]], !dbg [[DBG185:![0-9]+]]
+// DEBUG2: invoke.cont2:
+// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT3:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[TMP1]], i64 2, !dbg [[DBG183]]
+// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT3]], ptr [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG183]]
+// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT3]], i32 noundef 3)
+// DEBUG2-NEXT: to label [[INVOKE_CONT4:%.*]] unwind label [[LPAD]], !dbg [[DBG186:![0-9]+]]
+// DEBUG2: invoke.cont4:
+// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT6:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP1]], i64 1, !dbg [[DBG182]]
+// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT6]], ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG182]]
+// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT6]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8, !dbg [[DBG187:![0-9]+]]
+// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT6]], i32 noundef 4)
+// DEBUG2-NEXT: to label [[INVOKE_CONT9:%.*]] unwind label [[LPAD8:%.*]], !dbg [[DBG188:![0-9]+]]
+// DEBUG2: invoke.cont9:
+// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT10:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT6]], i64 1, !dbg [[DBG187]]
+// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT10]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8, !dbg [[DBG187]]
+// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT10]], i32 noundef 5)
+// DEBUG2-NEXT: to label [[INVOKE_CONT11:%.*]] unwind label [[LPAD8]], !dbg [[DBG189:![0-9]+]]
// DEBUG2: invoke.cont11:
-// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT12:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_BEGIN8]], i64 1, !dbg [[DBG187]]
-// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT12]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8, !dbg [[DBG187]]
-// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT12]], i32 noundef 5)
-// DEBUG2-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[LPAD10]], !dbg [[DBG189:![0-9]+]]
+// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT12:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT6]], i64 2, !dbg [[DBG187]]
+// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT12]], ptr [[ARRAYINIT_ENDOFINIT7]], align 8, !dbg [[DBG187]]
+// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT12]], i32 noundef 6)
+// DEBUG2-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[LPAD8]], !dbg [[DBG190:![0-9]+]]
// DEBUG2: invoke.cont13:
-// DEBUG2-NEXT: [[ARRAYINIT_ELEMENT14:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYINIT_ELEMENT12]], i64 1, !dbg [[DBG187]]
-// DEBUG2-NEXT: store ptr [[ARRAYINIT_ELEMENT14]], ptr [[ARRAYINIT_ENDOFINIT9]], align 8, !dbg [[DBG187]]
-// DEBUG2-NEXT: invoke void @_ZN2S1C1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT14]], i32 noundef 6)
-// DEBUG2-NEXT: to label [[INVOKE_CONT15:%.*]] unwind label [[LPAD10]], !dbg [[DBG190:![0-9]+]]
-// DEBUG2: invoke.cont15:
// DEBUG2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !dbg [[DBG181]]
// DEBUG2-NEXT: ret ptr [[TMP2]], !dbg [[DBG181]]
// DEBUG2: lpad:
@@ -7425,55 +7413,55 @@ int foobar() {
// DEBUG2-NEXT: store ptr [[TMP4]], ptr [[EXN_SLOT]], align 8, !dbg [[DBG180]]
// DEBUG2-NEXT: [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 1, !dbg [[DBG180]]
// DEBUG2-NEXT: store i32 [[TMP5]], ptr [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG180]]
-// DEBUG2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT2]], align 8, !dbg [[DBG183]]
-// DEBUG2-NEXT: [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq ptr [[ARRAYINIT_BEGIN1]], [[TMP6]], !dbg [[DBG183]]
-// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY:%.*]], !dbg [[DBG183]]
+// DEBUG2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT1]], align 8, !dbg [[DBG183]]
+// DEBUG2-NEXT: [[ARRAYDESTROY_ISEMPTY:%.*]] = icmp eq ptr [[TMP1]], [[TMP6]], !dbg [[DBG183]]
+// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY:%.*]], !dbg [[DBG183]]
// DEBUG2: arraydestroy.body:
// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP6]], [[LPAD]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ], !dbg [[DBG183]]
// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1, !dbg [[DBG183]]
// DEBUG2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]], !dbg [[DBG183]]
-// DEBUG2-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAYINIT_BEGIN1]], !dbg [[DBG183]]
-// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6]], label [[ARRAYDESTROY_BODY]], !dbg [[DBG183]]
-// DEBUG2: arraydestroy.done6:
+// DEBUG2-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[TMP1]], !dbg [[DBG183]]
+// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5]], label [[ARRAYDESTROY_BODY]], !dbg [[DBG183]]
+// DEBUG2: arraydestroy.done5:
// DEBUG2-NEXT: br label [[EHCLEANUP:%.*]], !dbg [[DBG183]]
-// DEBUG2: lpad10:
+// DEBUG2: lpad8:
// DEBUG2-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 }
// DEBUG2-NEXT: cleanup, !dbg [[DBG180]]
// DEBUG2-NEXT: [[TMP8:%.*]] = extractvalue { ptr, i32 } [[TMP7]], 0, !dbg [[DBG180]]
// DEBUG2-NEXT: store ptr [[TMP8]], ptr [[EXN_SLOT]], align 8, !dbg [[DBG180]]
// DEBUG2-NEXT: [[TMP9:%.*]] = extractvalue { ptr, i32 } [[TMP7]], 1, !dbg [[DBG180]]
// DEBUG2-NEXT: store i32 [[TMP9]], ptr [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG180]]
-// DEBUG2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT9]], align 8, !dbg [[DBG187]]
-// DEBUG2-NEXT: [[ARRAYDESTROY_ISEMPTY16:%.*]] = icmp eq ptr [[ARRAYINIT_BEGIN8]], [[TMP10]], !dbg [[DBG187]]
-// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY16]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17:%.*]], !dbg [[DBG187]]
-// DEBUG2: arraydestroy.body17:
-// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ [[TMP10]], [[LPAD10]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ], !dbg [[DBG187]]
-// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1, !dbg [[DBG187]]
-// DEBUG2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR4]], !dbg [[DBG187]]
-// DEBUG2-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAYINIT_BEGIN8]], !dbg [[DBG187]]
-// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21]], label [[ARRAYDESTROY_BODY17]], !dbg [[DBG187]]
-// DEBUG2: arraydestroy.done21:
+// DEBUG2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT7]], align 8, !dbg [[DBG187]]
+// DEBUG2-NEXT: [[ARRAYDESTROY_ISEMPTY14:%.*]] = icmp eq ptr [[ARRAYINIT_ELEMENT6]], [[TMP10]], !dbg [[DBG187]]
+// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY14]], label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15:%.*]], !dbg [[DBG187]]
+// DEBUG2: arraydestroy.body15:
+// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP10]], [[LPAD8]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ], !dbg [[DBG187]]
+// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENT17]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i64 -1, !dbg [[DBG187]]
+// DEBUG2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR4]], !dbg [[DBG187]]
+// DEBUG2-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAYINIT_ELEMENT6]], !dbg [[DBG187]]
+// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_DONE18]], label [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_BODY15]], !dbg [[DBG187]]
+// DEBUG2: arraydestroy.done19:
// DEBUG2-NEXT: br label [[EHCLEANUP]], !dbg [[DBG187]]
// DEBUG2: ehcleanup:
// DEBUG2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[ARRAYINIT_ENDOFINIT]], align 8, !dbg [[DBG182]]
-// DEBUG2-NEXT: [[PAD_ARRAYBEGIN:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[ARRAYINIT_BEGIN]], i64 0, i64 0, !dbg [[DBG182]]
+// DEBUG2-NEXT: [[PAD_ARRAYBEGIN:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP1]], i64 0, i64 0, !dbg [[DBG182]]
// DEBUG2-NEXT: [[PAD_ARRAYEND:%.*]] = getelementptr inbounds [3 x %struct.S1], ptr [[TMP11]], i64 0, i64 0, !dbg [[DBG182]]
-// DEBUG2-NEXT: [[ARRAYDESTROY_ISEMPTY22:%.*]] = icmp eq ptr [[PAD_ARRAYBEGIN]], [[PAD_ARRAYEND]], !dbg [[DBG182]]
-// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY22]], label [[ARRAYDESTROY_DONE27:%.*]], label [[ARRAYDESTROY_BODY23:%.*]], !dbg [[DBG182]]
-// DEBUG2: arraydestroy.body23:
-// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENTPAST24:%.*]] = phi ptr [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT25:%.*]], [[ARRAYDESTROY_BODY23]] ], !dbg [[DBG182]]
-// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENT25]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST24]], i64 -1, !dbg [[DBG182]]
-// DEBUG2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT25]]) #[[ATTR4]], !dbg [[DBG182]]
-// DEBUG2-NEXT: [[ARRAYDESTROY_DONE26:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT25]], [[PAD_ARRAYBEGIN]], !dbg [[DBG182]]
-// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_DONE26]], label [[ARRAYDESTROY_DONE27]], label [[ARRAYDESTROY_BODY23]], !dbg [[DBG182]]
-// DEBUG2: arraydestroy.done27:
+// DEBUG2-NEXT: [[ARRAYDESTROY_ISEMPTY20:%.*]] = icmp eq ptr [[PAD_ARRAYBEGIN]], [[PAD_ARRAYEND]], !dbg [[DBG182]]
+// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_ISEMPTY20]], label [[ARRAYDESTROY_DONE25:%.*]], label [[ARRAYDESTROY_BODY21:%.*]], !dbg [[DBG182]]
+// DEBUG2: arraydestroy.body21:
+// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENTPAST22:%.*]] = phi ptr [ [[PAD_ARRAYEND]], [[EHCLEANUP]] ], [ [[ARRAYDESTROY_ELEMENT23:%.*]], [[ARRAYDESTROY_BODY21]] ], !dbg [[DBG182]]
+// DEBUG2-NEXT: [[ARRAYDESTROY_ELEMENT23]] = getelementptr inbounds [[STRUCT_S1]], ptr [[ARRAYDESTROY_ELEMENTPAST22]], i64 -1, !dbg [[DBG182]]
+// DEBUG2-NEXT: call void @_ZN2S1D1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT23]]) #[[ATTR4]], !dbg [[DBG182]]
+// DEBUG2-NEXT: [[ARRAYDESTROY_DONE24:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT23]], [[PAD_ARRAYBEGIN]], !dbg [[DBG182]]
+// DEBUG2-NEXT: br i1 [[ARRAYDESTROY_DONE24]], label [[ARRAYDESTROY_DONE25]], label [[ARRAYDESTROY_BODY21]], !dbg [[DBG182]]
+// DEBUG2: arraydestroy.done25:
// DEBUG2-NEXT: br label [[EH_RESUME:%.*]], !dbg [[DBG182]]
// DEBUG2: eh.resume:
// DEBUG2-NEXT: [[EXN:%.*]] = load ptr, ptr [[EXN_SLOT]], align 8, !dbg [[DBG182]]
// DEBUG2-NEXT: [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4, !dbg [[DBG182]]
// DEBUG2-NEXT: [[LPAD_VAL:%.*]] = insertvalue { ptr, i32 } poison, ptr [[EXN]], 0, !dbg [[DBG182]]
-// DEBUG2-NEXT: [[LPAD_VAL28:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1, !dbg [[DBG182]]
-// DEBUG2-NEXT: resume { ptr, i32 } [[LPAD_VAL28]], !dbg [[DBG182]]
+// DEBUG2-NEXT: [[LPAD_VAL26:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1, !dbg [[DBG182]]
+// DEBUG2-NEXT: resume { ptr, i32 } [[LPAD_VAL26]], !dbg [[DBG182]]
//
//
// DEBUG2-LABEL: define {{[^@]+}}@.__kmpc_global_dtor_..4
diff --git a/clang/test/PCH/cxx_paren_init.cpp b/clang/test/PCH/cxx_paren_init.cpp
index 9731ea7..298e316 100644
--- a/clang/test/PCH/cxx_paren_init.cpp
+++ b/clang/test/PCH/cxx_paren_init.cpp
@@ -16,14 +16,13 @@ U u = baz(3);
// CHECK-NEXT: [[ARR:%.*]] = alloca [4 x i32], align 16
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[I_ADDR]], align 4
// CHECK-NEXT: store i32 [[B:%.*]], ptr [[J_ADDR]], align 4
-// CHECK-NEXT: [[ARRINIT_BEGIN:%.*]] = getelementptr inbounds [4 x i32], ptr [[ARR]], i64 0, i64 0
// CHECK-NEXT: [[TMP_0:%.*]] = load i32, ptr [[I_ADDR]], align 4
-// CHECK-NEXT: store i32 [[TMP_0]], ptr [[ARRINIT_BEGIN]], align 4
-// CHECK-NEXT: [[ARRINIT_ELEM:%.*]] = getelementptr inbounds i32, ptr [[ARRINIT_BEGIN]], i64 1
+// CHECK-NEXT: store i32 [[TMP_0]], ptr [[ARR]], align 4
+// CHECK-NEXT: [[ARRINIT_ELEM:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 1
// CHECK-NEXT: [[TMP_1:%.*]] = load i32, ptr [[J_ADDR]], align 4
// CHECK-NEXT: store i32 [[TMP_1]], ptr [[ARRINIT_ELEM]], align 4
-// CHECK-NEXT: [[ARRINIT_START:%.*]] = getelementptr inbounds i32, ptr [[ARRINIT_ELEM]], i64 1
-// CHECK-NEXT: [[ARRINIT_END:%.*]] = getelementptr inbounds i32, ptr [[ARRINIT_BEGIN]], i64 4
+// CHECK-NEXT: [[ARRINIT_START:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 2
+// CHECK-NEXT: [[ARRINIT_END:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 4
// CHECK-NEXT: br label [[ARRINIT_BODY:%.*]]
// CHECK: [[ARRINIT_CUR:%.*]] = phi ptr [ [[ARRINIT_START]], %entry ], [ [[ARRINIT_NEXT:%.*]], [[ARRINIT_BODY]] ]
// CHECK-NEXT: store i32 0, ptr [[ARRINIT_CUR]], align 4
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 82304a1..c707972 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -335,7 +335,7 @@
// CHECK-MCPU-CARMEL: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon"
// RUN: %clang -target x86_64-apple-macosx -arch arm64 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64 %s
-// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.5a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon"
+// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+neon"
// RUN: %clang -target x86_64-apple-macosx -arch arm64_32 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64_32 %s
// CHECK-ARCH-ARM64_32: "-target-cpu" "apple-s4" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon"
@@ -671,10 +671,15 @@
// CHECK-V83-OR-LATER: __ARM_FEATURE_JCVT 1
// CHECK-V83-OR-LATER: __ARM_FEATURE_PAUTH 1
// CHECK-V81-OR-LATER: __ARM_FEATURE_QRDMX 1
+// CHECK-BEFORE-V85-NOT: __ARM_FEATURE_BTI 1
// CHECK-BEFORE-V83-NOT: __ARM_FEATURE_COMPLEX 1
// CHECK-BEFORE-V83-NOT: __ARM_FEATURE_JCVT 1
// CHECK-BEFORE-V85-NOT: __ARM_FEATURE_FRINT 1
+// RUN: %clang -target aarch64 -mcpu=apple-a14 -x c -E -dM %s -o - | FileCheck --check-prefix=APPLE-A14-M1 %s
+// RUN: %clang -target aarch64 -mcpu=apple-m1 -x c -E -dM %s -o - | FileCheck --check-prefix=APPLE-A14-M1 %s
+// APPLE-A14-M1-NOT: __ARM_FEATURE_BTI 1
+
// RUN: %clang --target=aarch64 -march=armv8.2-a+rcpc -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-RCPC %s
// CHECK-RCPC: __ARM_FEATURE_RCPC 1
diff --git a/clang/test/Sema/aarch64-neon-target.c b/clang/test/Sema/aarch64-neon-target.c
index fa45fff..642afdd 100644
--- a/clang/test/Sema/aarch64-neon-target.c
+++ b/clang/test/Sema/aarch64-neon-target.c
@@ -69,8 +69,8 @@ void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t
vrnd_f16(v4f16); // expected-error {{always_inline function 'vrnd_f16' requires target feature 'fullfp16'}}
vmaxnm_f16(v4f16, v4f16); // expected-error {{always_inline function 'vmaxnm_f16' requires target feature 'fullfp16'}}
vrndi_f16(v4f16); // expected-error {{always_inline function 'vrndi_f16' requires target feature 'fullfp16'}}
- // fp16fml
- vfmlal_low_f16(v2f32, v4f16, v4f16); // expected-error {{always_inline function 'vfmlal_low_f16' requires target feature 'fp16fml'}}
+ // fp16fml depends on fp-armv8
+ vfmlal_low_f16(v2f32, v4f16, v4f16); // expected-error {{always_inline function 'vfmlal_low_f16' requires target feature 'fp-armv8'}}
// i8mm
vmmlaq_s32(v4i32, v8i16, v8i16); // expected-error {{always_inline function 'vmmlaq_s32' requires target feature 'i8mm'}}
vusdot_laneq_s32(v2i32, v8i8, v8i16, 0); // expected-error {{always_inline function 'vusdot_s32' requires target feature 'i8mm'}}
diff --git a/clang/test/Sema/constexpr-void-cast.c b/clang/test/Sema/constexpr-void-cast.c
index 91e4027..2ffc59f 100644
--- a/clang/test/Sema/constexpr-void-cast.c
+++ b/clang/test/Sema/constexpr-void-cast.c
@@ -1,8 +1,12 @@
// RUN: %clang_cc1 -x c -fsyntax-only %s -verify=c -std=c11
// RUN: %clang_cc1 -x c -fsyntax-only %s -pedantic -verify=c-pedantic -std=c11
+// RUN: %clang_cc1 -x c -fsyntax-only %s -verify=c -std=c11 -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -x c -fsyntax-only %s -pedantic -verify=c-pedantic -std=c11 -fexperimental-new-constant-interpreter
//
// RUN: %clang_cc1 -x c++ -fsyntax-only %s -verify=cxx
// RUN: %clang_cc1 -x c++ -fsyntax-only %s -pedantic -verify=cxx-pedantic
+// RUN: %clang_cc1 -x c++ -fsyntax-only %s -verify=cxx -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -x c++ -fsyntax-only %s -pedantic -verify=cxx-pedantic -fexperimental-new-constant-interpreter
// c-no-diagnostics
// cxx-no-diagnostics
diff --git a/clang/test/SemaCUDA/function-redclare.cu b/clang/test/SemaCUDA/function-redclare.cu
new file mode 100644
index 0000000..7cd9bad
--- /dev/null
+++ b/clang/test/SemaCUDA/function-redclare.cu
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only \
+// RUN: -isystem %S/Inputs -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only \
+// RUN: -isystem %S/Inputs -fcuda-is-device -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only \
+// RUN: -isystem %S/Inputs -verify=redecl -Wnvcc-compat %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only \
+// RUN: -isystem %S/Inputs -fcuda-is-device -Wnvcc-compat -verify=redecl %s
+
+// expected-no-diagnostics
+#include "cuda.h"
+
+__device__ void f(); // redecl-note {{previous declaration is here}}
+
+void f() {} // redecl-warning {{target-attribute based function overloads are not supported by NVCC and will be treated as a function redeclaration:new declaration is __host__ function, old declaration is __device__ function}}
+
+void g(); // redecl-note {{previous declaration is here}}
+
+__device__ void g() {} // redecl-warning {{target-attribute based function overloads are not supported by NVCC and will be treated as a function redeclaration:new declaration is __device__ function, old declaration is __host__ function}}
diff --git a/clang/test/SemaCXX/complex-folding.cpp b/clang/test/SemaCXX/complex-folding.cpp
index 054f159..7bfd36f 100644
--- a/clang/test/SemaCXX/complex-folding.cpp
+++ b/clang/test/SemaCXX/complex-folding.cpp
@@ -59,41 +59,48 @@ static_assert((1.25 / (0.25 - 0.75j)) == (0.5 + 1.5j));
// Test that infinities are preserved, don't turn into NaNs, and do form zeros
// when the divisor.
+constexpr _Complex float InfC = {1.0, __builtin_inf()};
+constexpr _Complex float InfInf = __builtin_inf() + InfC;
+static_assert(__real__(InfInf) == __builtin_inf());
+static_assert(__imag__(InfInf) == __builtin_inf());
+static_assert(__builtin_isnan(__real__(InfInf * InfInf)));
+static_assert(__builtin_isinf_sign(__imag__(InfInf * InfInf)) == 1);
+
static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) * 1.0)) == 1);
-static_assert(__builtin_isinf_sign(__imag__((1.0 + __builtin_inf() * 1.0j) * 1.0)) == 1);
+static_assert(__builtin_isinf_sign(__imag__((1.0 + InfC) * 1.0)) == 1);
static_assert(__builtin_isinf_sign(__real__(1.0 * (__builtin_inf() + 1.0j))) == 1);
-static_assert(__builtin_isinf_sign(__imag__(1.0 * (1.0 + __builtin_inf() * 1.0j))) == 1);
-
+static_assert(__builtin_isinf_sign(__imag__(1.0 * (1.0 + InfC))) == 1);
static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) * (1.0 + 1.0j))) == 1);
static_assert(__builtin_isinf_sign(__real__((1.0 + 1.0j) * (__builtin_inf() + 1.0j))) == 1);
static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) * (__builtin_inf() + 1.0j))) == 1);
-
-static_assert(__builtin_isinf_sign(__real__((1.0 + __builtin_inf() * 1.0j) * (1.0 + 1.0j))) == -1);
-static_assert(__builtin_isinf_sign(__imag__((1.0 + __builtin_inf() * 1.0j) * (1.0 + 1.0j))) == 1);
-static_assert(__builtin_isinf_sign(__real__((1.0 + 1.0j) * (1.0 + __builtin_inf() * 1.0j))) == -1);
-static_assert(__builtin_isinf_sign(__imag__((1.0 + 1.0j) * (1.0 + __builtin_inf() * 1.0j))) == 1);
-
-static_assert(__builtin_isinf_sign(__real__((1.0 + __builtin_inf() * 1.0j) * (1.0 + __builtin_inf() * 1.0j))) == -1);
-static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + __builtin_inf() * 1.0j) * (__builtin_inf() + __builtin_inf() * 1.0j))) == -1);
-
+static_assert(__builtin_isinf_sign(__real__((1.0 + InfC) * (1.0 + 1.0j))) == -1);
+static_assert(__builtin_isinf_sign(__imag__((1.0 + InfC) * (1.0 + 1.0j))) == 1);
+static_assert(__builtin_isinf_sign(__real__((1.0 + 1.0j) * (1.0 + InfC))) == -1);
+static_assert(__builtin_isinf_sign(__imag__((1.0 + 1.0j) * (1.0 + InfC))) == 1);
+static_assert(__builtin_isinf_sign(__real__((1.0 + InfC) * (1.0 + InfC))) == -1);
+static_assert(__builtin_isinf_sign(__real__(InfInf * InfInf)) == 0);
static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) / (1.0 + 1.0j))) == 1);
-static_assert(__builtin_isinf_sign(__imag__(1.0 + (__builtin_inf() * 1.0j) / (1.0 + 1.0j))) == 1);
-static_assert(__builtin_isinf_sign(__imag__((__builtin_inf() + __builtin_inf() * 1.0j) / (1.0 + 1.0j))) == 1);
+static_assert(__builtin_isinf_sign(__imag__(1.0 + (InfC) / (1.0 + 1.0j))) == 1);
+static_assert(__builtin_isinf_sign(__imag__((InfInf) / (1.0 + 1.0j))) == 0);
static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) / 1.0)) == 1);
-static_assert(__builtin_isinf_sign(__imag__(1.0 + (__builtin_inf() * 1.0j) / 1.0)) == 1);
-static_assert(__builtin_isinf_sign(__imag__((__builtin_inf() + __builtin_inf() * 1.0j) / 1.0)) == 1);
-
+static_assert(__builtin_isinf_sign(__imag__(1.0 + (InfC) / 1.0)) == 1);
+static_assert(__builtin_isinf_sign(__imag__((InfInf) / 1.0)) == 1);
static_assert(((1.0 + 1.0j) / (__builtin_inf() + 1.0j)) == (0.0 + 0.0j));
-static_assert(((1.0 + 1.0j) / (1.0 + __builtin_inf() * 1.0j)) == (0.0 + 0.0j));
-static_assert(((1.0 + 1.0j) / (__builtin_inf() + __builtin_inf() * 1.0j)) == (0.0 + 0.0j));
+static_assert(((1.0 + 1.0j) / (1.0 + InfC)) == (0.0 + 0.0j));
+static_assert(((1.0 + 1.0j) / (InfInf)) == (0.0 + 0.0j));
static_assert(((1.0 + 1.0j) / __builtin_inf()) == (0.0 + 0.0j));
-
+static_assert(1.0j / 0.0 == 1); // expected-error {{static assertion}} \
+ // expected-note {{division by zero}}
static_assert(__builtin_isinf_sign(__real__((1.0 + 1.0j) / (0.0 + 0.0j))) == 1);
-static_assert(__builtin_isinf_sign(__real__((1.0 + 1.0j) / 0.0)) == 1);
-
+static_assert(__builtin_isinf_sign(__real__((1.0 + 1.0j) / 0.0)) == 1); // expected-error {{static assertion}} \
+ // expected-note {{division by zero}}
static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) / (0.0 + 0.0j))) == 1);
-static_assert(__builtin_isinf_sign(__imag__((1.0 + __builtin_inf() * 1.0j) / (0.0 + 0.0j))) == 1);
-static_assert(__builtin_isinf_sign(__imag__((__builtin_inf() + __builtin_inf() * 1.0j) / (0.0 + 0.0j))) == 1);
-static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) / 0.0)) == 1);
-static_assert(__builtin_isinf_sign(__imag__((1.0 + __builtin_inf() * 1.0j) / 0.0)) == 1);
-static_assert(__builtin_isinf_sign(__imag__((__builtin_inf() + __builtin_inf() * 1.0j) / 0.0)) == 1);
+static_assert(__builtin_isinf_sign(__imag__((1.0 + InfC) / (0.0 + 0.0j))) == 1);
+static_assert(__builtin_isinf_sign(__imag__((InfInf) / (0.0 + 0.0j))) == 1);
+static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) / 0.0)) == 1); // expected-error {{static assertion}} \
+ // expected-note {{division by zero}}
+static_assert(__builtin_isinf_sign(__imag__((1.0 + InfC) / 0.0)) == 1); // expected-error {{static assertion}} \
+ // expected-note {{division by zero}}
+static_assert(__builtin_isinf_sign(__imag__((InfInf) / 0.0)) == 1); // expected-error {{static assertion}} \
+ // expected-note {{division by zero}}
+
diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
index 80a7a2d..70ab5dc 100644
--- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
@@ -82,7 +82,7 @@ constexpr void k() {
// If the return type is not 'void', no return statements => never a constant
// expression, so still diagnose that case.
-[[noreturn]] constexpr int fn() { // expected-error {{no return statement in constexpr function}}
+[[noreturn]] constexpr int fn() { // cxx14_20-error {{no return statement in constexpr function}}
fn();
}
diff --git a/clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp b/clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp
new file mode 100644
index 0000000..25d1f8d
--- /dev/null
+++ b/clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s
+
+constexpr int f() { } // expected-warning {{non-void function does not return a value}}
+static_assert(__is_same(decltype([] constexpr -> int { }( )), int)); // expected-warning {{non-void lambda does not return a value}}
+
+consteval int g() { } // expected-warning {{non-void function does not return a value}}
+static_assert(__is_same(decltype([] consteval -> int { }( )), int)); // expected-warning {{non-void lambda does not return a value}}
diff --git a/clang/test/SemaCXX/for-range-examples.cpp b/clang/test/SemaCXX/for-range-examples.cpp
index d129d50..c06bf01 100644
--- a/clang/test/SemaCXX/for-range-examples.cpp
+++ b/clang/test/SemaCXX/for-range-examples.cpp
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -fexperimental-new-constant-interpreter
namespace value_range_detail {
template<typename T>
diff --git a/clang/test/SemaCXX/integer-overflow.cpp b/clang/test/SemaCXX/integer-overflow.cpp
index 6049458..d1cc8be 100644
--- a/clang/test/SemaCXX/integer-overflow.cpp
+++ b/clang/test/SemaCXX/integer-overflow.cpp
@@ -1,6 +1,10 @@
// RUN: %clang_cc1 %s -verify -fsyntax-only -std=gnu++98 -triple x86_64-pc-linux-gnu
// RUN: %clang_cc1 %s -verify -fsyntax-only -std=gnu++2a -triple x86_64-pc-linux-gnu
+// RUN: %clang_cc1 %s -verify -fsyntax-only -std=gnu++98 -triple x86_64-pc-linux-gnu -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 %s -verify -fsyntax-only -std=gnu++2a -triple x86_64-pc-linux-gnu -fexperimental-new-constant-interpreter
+
+
typedef unsigned long long uint64_t;
typedef unsigned int uint32_t;
diff --git a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp
index f7f69e9..7675d42 100644
--- a/clang/test/SemaTemplate/cwg2398.cpp
+++ b/clang/test/SemaTemplate/cwg2398.cpp
@@ -200,8 +200,120 @@ namespace consistency {
template struct A<B<int>, B<int>, B<int>>;
// new-error@-1 {{ambiguous partial specializations}}
} // namespace t2
+ namespace t3 {
+ template<class T1, class T2, class T3> struct A;
+
+ template<template<class, class> class TT1,
+ class T1, class T2, class T3, class T4>
+ struct A<TT1<T1, T2>, TT1<T3, T4>, typename nondeduced<TT1<T1, T2>>::type> {};
+ // new-note@-1 {{partial specialization matches}}
+
+ template<template<class> class UU1,
+ class U1, class U2>
+ struct A<UU1<U1>, UU1<U2>, typename nondeduced<UU1<U1>>::type>;
+ // new-note@-1 {{partial specialization matches}}
+
+ template struct A<B<int>, B<int>, B<int>>;
+ // new-error@-1 {{ambiguous partial specializations}}
+ } // namespace t3
+ namespace t4 {
+ template<class T1, class T2, class T3> struct A;
+
+ template<template<class, class> class TT1,
+ class T1, class T2, class T3, class T4>
+ struct A<TT1<T1, T2>, TT1<T3, T4>, typename nondeduced<TT1<T1, T4>>::type> {};
+ // new-note@-1 {{partial specialization matches}}
+
+ template<template<class> class UU1,
+ class U1, class U2>
+ struct A<UU1<U1>, UU1<U2>, typename nondeduced<UU1<U1>>::type>;
+ // new-note@-1 {{partial specialization matches}}
+
+ template struct A<B<int>, B<int>, B<int>>;
+ // new-error@-1 {{ambiguous partial specializations}}
+ } // namespace t4
+ namespace t5 {
+ template<class T1, class T2> struct A;
+
+ template<template<class, class> class TT1,
+ class T1, class T2, class T3, class T4>
+ struct A<TT1<T1, T2>, TT1<T3, T4>> {};
+ // new-note@-1 {{partial specialization matches}}
+
+ template<template<class> class UU1,
+ class U1, class U2>
+ struct A<UU1<U1>, UU1<U2>>;
+ // new-note@-1 {{partial specialization matches}}
+
+ template struct A<B<int>, B<int>>;
+ // new-error@-1 {{ambiguous partial specializations}}
+ } // namespace t5
+ namespace t6 {
+ template<class T1, class T2> struct A;
+
+ template<template<class, class> class TT1,
+ class T1, class T2, class T3>
+ struct A<TT1<T1, T2>, TT1<T1, T3>> {};
+ // new-note@-1 {{partial specialization matches}}
+
+ template<template<class> class UU1,
+ class U1, class U2>
+ struct A<UU1<U1>, UU1<U2>>;
+ // new-note@-1 {{partial specialization matches}}
+
+ template struct A<B<int>, B<int>>;
+ // new-error@-1 {{ambiguous partial specializations}}
+ } // namespace t6
} // namespace consistency
+namespace classes {
+ namespace canon {
+ template<class T, class U> struct A {};
+
+ template<template<class> class TT> auto f(TT<int> a) { return a; }
+ // old-note@-1 2{{template template argument has different template parameters}}
+ // new-note@-2 2{{substitution failure: too few template arguments}}
+
+ A<int, float> v1;
+ A<int, double> v2;
+
+ using X = decltype(f(v1));
+ // expected-error@-1 {{no matching function for call}}
+
+ using X = decltype(f(v2));
+ // expected-error@-1 {{no matching function for call}}
+ } // namespace canon
+ namespace expr {
+ template <class T1, int E1> struct A {
+ static constexpr auto val = E1;
+ };
+ template <template <class T3> class TT> void f(TT<int> v) {
+ // old-note@-1 {{template template argument has different template parameters}}
+ // new-note@-2 {{substitution failure: too few template arguments}}
+ static_assert(v.val == 3);
+ };
+ void test() {
+ f(A<int, 3>());
+ // expected-error@-1 {{no matching function for call}}
+ }
+ } // namespace expr
+ namespace packs {
+ template <class T1, class ...T2s> struct A {
+ static constexpr auto val = sizeof...(T2s);
+ };
+
+ template <template <class T3> class TT> void f(TT<int> v) {
+ // old-note@-1 {{template template argument has different template parameters}}
+ // new-note@-2 {{deduced type 'A<[...], (no argument), (no argument), (no argument)>' of 1st parameter does not match adjusted type 'A<[...], void, void, void>' of argument [with TT = A]}}
+ static_assert(v.val == 3);
+ };
+ void test() {
+ f(A<int, void, void, void>());
+ // expected-error@-1 {{no matching function for call}}
+ }
+ } // namespace packs
+} // namespace classes
+
namespace regression1 {
template <typename T, typename Y> struct map {};
template <typename T> class foo {};
diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt
index 4017b14..42618e4 100644
--- a/clang/tools/clang-repl/CMakeLists.txt
+++ b/clang/tools/clang-repl/CMakeLists.txt
@@ -61,10 +61,7 @@ clang_target_link_libraries(clang-repl PRIVATE
clangInterpreter
)
-# Support plugins.
-if(CLANG_PLUGIN_SUPPORT)
- export_executable_symbols_for_plugins(clang-repl)
-endif()
+export_executable_symbols_for_plugins(clang-repl)
# The clang-repl binary can get huge with static linking in debug mode.
# Some 32-bit targets use PLT slots with limited branch range by default and we
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index dbc1916..fb57333 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -22866,6 +22866,22 @@ TEST_F(FormatTest, FormatsLambdas) {
" //\n"
" });");
+ FormatStyle LLVMStyle = getLLVMStyleWithColumns(60);
+ verifyFormat("very_long_function_name_yes_it_is_really_long(\n"
+ " [](auto n) noexcept [[back_attr]]\n"
+ " -> std::unordered_map<very_long_type_name_A,\n"
+ " very_long_type_name_B> {\n"
+ " really_do_something();\n"
+ " });",
+ LLVMStyle);
+ verifyFormat("very_long_function_name_yes_it_is_really_long(\n"
+ " [](auto n) constexpr\n"
+ " -> std::unordered_map<very_long_type_name_A,\n"
+ " very_long_type_name_B> {\n"
+ " really_do_something();\n"
+ " });",
+ LLVMStyle);
+
FormatStyle DoNotMerge = getLLVMStyle();
DoNotMerge.AllowShortLambdasOnASingleLine = FormatStyle::SLS_None;
verifyFormat("auto c = []() {\n"
diff --git a/clang/unittests/Interpreter/InterpreterTest.cpp b/clang/unittests/Interpreter/InterpreterTest.cpp
index 72b34da..683295a 100644
--- a/clang/unittests/Interpreter/InterpreterTest.cpp
+++ b/clang/unittests/Interpreter/InterpreterTest.cpp
@@ -283,7 +283,7 @@ TEST_F(InterpreterTest, InstantiateTemplate) {
}
// This test exposes an ARM specific problem in the interpreter, see
-// https://github.com/llvm/llvm-project/issues/94741.
+// https://github.com/llvm/llvm-project/issues/94994.
#ifndef __arm__
TEST_F(InterpreterTest, Value) {
std::unique_ptr<Interpreter> Interp = createInterpreter();
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 4385744..5e2ab06 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -17186,6 +17186,18 @@ objects</td>
<td>open</td>
<td>Template argument deduction involving exception specifications</td>
<td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2897">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2897.html">2897</a></td>
+ <td>open</td>
+ <td>Copying potentially-overlapping union subobjects</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2898">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2898.html">2898</a></td>
+ <td>open</td>
+ <td>Clarify implicit conversion sequence from <I>cv</I> <TT>T</TT> to <TT>T</TT></td>
+ <td align="center">Not resolved</td>
</tr></table>
</div>
diff --git a/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cpp b/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cpp
index c90d663..b058e3c 100644
--- a/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cpp
+++ b/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cpp
@@ -310,16 +310,14 @@ TEST(FunctionCallTrieTest, MergeInto) {
TEST(FunctionCallTrieTest, PlacementNewOnAlignedStorage) {
profilingFlags()->setDefaults();
- typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators),
- alignof(FunctionCallTrie::Allocators)>::type
- AllocatorsStorage;
+ alignas(FunctionCallTrie::Allocators)
+ std::byte AllocatorsStorage[sizeof(FunctionCallTrie::Allocators)];
new (&AllocatorsStorage)
FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators());
auto *A =
reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage);
- typename std::aligned_storage<sizeof(FunctionCallTrie),
- alignof(FunctionCallTrie)>::type FCTStorage;
+ alignas(FunctionCallTrie) std::byte FCTStorage[sizeof(FunctionCallTrie)];
new (&FCTStorage) FunctionCallTrie(*A);
auto *T = reinterpret_cast<FunctionCallTrie *>(&FCTStorage);
diff --git a/compiler-rt/lib/xray/tests/unit/profile_collector_test.cpp b/compiler-rt/lib/xray/tests/unit/profile_collector_test.cpp
index eab5579..da50642 100644
--- a/compiler-rt/lib/xray/tests/unit/profile_collector_test.cpp
+++ b/compiler-rt/lib/xray/tests/unit/profile_collector_test.cpp
@@ -38,8 +38,8 @@ struct ExpectedProfilingFileHeader {
void ValidateFileHeaderBlock(XRayBuffer B) {
ASSERT_NE(static_cast<const void *>(B.Data), nullptr);
ASSERT_EQ(B.Size, sizeof(ExpectedProfilingFileHeader));
- typename std::aligned_storage<sizeof(ExpectedProfilingFileHeader)>::type
- FileHeaderStorage;
+ alignas(ExpectedProfilingFileHeader)
+ std::byte FileHeaderStorage[sizeof(ExpectedProfilingFileHeader)];
ExpectedProfilingFileHeader ExpectedHeader;
std::memcpy(&FileHeaderStorage, B.Data, B.Size);
auto &FileHeader =
diff --git a/compiler-rt/lib/xray/tests/unit/segmented_array_test.cpp b/compiler-rt/lib/xray/tests/unit/segmented_array_test.cpp
index 46aeb88..26c80de 100644
--- a/compiler-rt/lib/xray/tests/unit/segmented_array_test.cpp
+++ b/compiler-rt/lib/xray/tests/unit/segmented_array_test.cpp
@@ -226,13 +226,11 @@ TEST(SegmentedArrayTest, SimulateStackBehaviour) {
TEST(SegmentedArrayTest, PlacementNewOnAlignedStorage) {
using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
- typename std::aligned_storage<sizeof(AllocatorType),
- alignof(AllocatorType)>::type AllocatorStorage;
+ alignas(AllocatorType) std::byte AllocatorStorage[sizeof(AllocatorType)];
new (&AllocatorStorage) AllocatorType(1 << 10);
auto *A = reinterpret_cast<AllocatorType *>(&AllocatorStorage);
- typename std::aligned_storage<sizeof(Array<ShadowStackEntry>),
- alignof(Array<ShadowStackEntry>)>::type
- ArrayStorage;
+ alignas(Array<ShadowStackEntry>)
+ std::byte ArrayStorage[sizeof(Array<ShadowStackEntry>)];
new (&ArrayStorage) Array<ShadowStackEntry>(*A);
auto *Data = reinterpret_cast<Array<ShadowStackEntry> *>(&ArrayStorage);
diff --git a/compiler-rt/lib/xray/tests/unit/test_helpers.cpp b/compiler-rt/lib/xray/tests/unit/test_helpers.cpp
index 6075f36..81a93d8 100644
--- a/compiler-rt/lib/xray/tests/unit/test_helpers.cpp
+++ b/compiler-rt/lib/xray/tests/unit/test_helpers.cpp
@@ -69,8 +69,7 @@ namespace __xray {
std::string serialize(BufferQueue &Buffers, int32_t Version) {
std::string Serialized;
- std::aligned_storage<sizeof(XRayFileHeader), alignof(XRayFileHeader)>::type
- HeaderStorage;
+ alignas(XRayFileHeader) std::byte HeaderStorage[sizeof(XRayFileHeader)];
auto *Header = reinterpret_cast<XRayFileHeader *>(&HeaderStorage);
new (Header) XRayFileHeader();
Header->Version = Version;
diff --git a/compiler-rt/lib/xray/xray_fdr_logging.cpp b/compiler-rt/lib/xray/xray_fdr_logging.cpp
index 378a8c0f..7def356 100644
--- a/compiler-rt/lib/xray/xray_fdr_logging.cpp
+++ b/compiler-rt/lib/xray/xray_fdr_logging.cpp
@@ -55,17 +55,12 @@ struct XRAY_TLS_ALIGNAS(64) ThreadLocalData {
BufferQueue::Buffer Buffer{};
BufferQueue *BQ = nullptr;
- using LogWriterStorage =
- typename std::aligned_storage<sizeof(FDRLogWriter),
- alignof(FDRLogWriter)>::type;
-
- LogWriterStorage LWStorage;
+ using LogWriterStorage = std::byte[sizeof(FDRLogWriter)];
+ alignas(FDRLogWriter) LogWriterStorage LWStorage;
FDRLogWriter *Writer = nullptr;
- using ControllerStorage =
- typename std::aligned_storage<sizeof(FDRController<>),
- alignof(FDRController<>)>::type;
- ControllerStorage CStorage;
+ using ControllerStorage = std::byte[sizeof(FDRController<>)];
+ alignas(FDRController<>) ControllerStorage CStorage;
FDRController<> *Controller = nullptr;
};
@@ -78,7 +73,7 @@ static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
static pthread_key_t Key;
// Global BufferQueue.
-static std::aligned_storage<sizeof(BufferQueue)>::type BufferQueueStorage;
+static std::byte BufferQueueStorage[sizeof(BufferQueue)];
static BufferQueue *BQ = nullptr;
// Global thresholds for function durations.
@@ -129,8 +124,8 @@ static_assert(alignof(ThreadLocalData) >= 64,
"ThreadLocalData must be cache line aligned.");
#endif
static ThreadLocalData &getThreadLocalData() {
- thread_local typename std::aligned_storage<
- sizeof(ThreadLocalData), alignof(ThreadLocalData)>::type TLDStorage{};
+ alignas(ThreadLocalData) thread_local std::byte
+ TLDStorage[sizeof(ThreadLocalData)];
if (pthread_getspecific(Key) == NULL) {
new (reinterpret_cast<ThreadLocalData *>(&TLDStorage)) ThreadLocalData{};
diff --git a/compiler-rt/lib/xray/xray_function_call_trie.h b/compiler-rt/lib/xray/xray_function_call_trie.h
index b8c6058..7536f39 100644
--- a/compiler-rt/lib/xray/xray_function_call_trie.h
+++ b/compiler-rt/lib/xray/xray_function_call_trie.h
@@ -139,18 +139,14 @@ public:
// Use hosted aligned storage members to allow for trivial move and init.
// This also allows us to sidestep the potential-failing allocation issue.
- typename std::aligned_storage<sizeof(NodeAllocatorType),
- alignof(NodeAllocatorType)>::type
- NodeAllocatorStorage;
- typename std::aligned_storage<sizeof(RootAllocatorType),
- alignof(RootAllocatorType)>::type
- RootAllocatorStorage;
- typename std::aligned_storage<sizeof(ShadowStackAllocatorType),
- alignof(ShadowStackAllocatorType)>::type
- ShadowStackAllocatorStorage;
- typename std::aligned_storage<sizeof(NodeIdPairAllocatorType),
- alignof(NodeIdPairAllocatorType)>::type
- NodeIdPairAllocatorStorage;
+ alignas(NodeAllocatorType) std::byte
+ NodeAllocatorStorage[sizeof(NodeAllocatorType)];
+ alignas(RootAllocatorType) std::byte
+ RootAllocatorStorage[sizeof(RootAllocatorType)];
+ alignas(ShadowStackAllocatorType) std::byte
+ ShadowStackAllocatorStorage[sizeof(ShadowStackAllocatorType)];
+ alignas(NodeIdPairAllocatorType) std::byte
+ NodeIdPairAllocatorStorage[sizeof(NodeIdPairAllocatorType)];
NodeAllocatorType *NodeAllocator = nullptr;
RootAllocatorType *RootAllocator = nullptr;
diff --git a/compiler-rt/lib/xray/xray_profile_collector.cpp b/compiler-rt/lib/xray/xray_profile_collector.cpp
index bef2504..3a28240 100644
--- a/compiler-rt/lib/xray/xray_profile_collector.cpp
+++ b/compiler-rt/lib/xray/xray_profile_collector.cpp
@@ -29,7 +29,7 @@ namespace {
SpinMutex GlobalMutex;
struct ThreadTrie {
tid_t TId;
- typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
+ alignas(FunctionCallTrie) std::byte TrieStorage[sizeof(FunctionCallTrie)];
};
struct ProfileBuffer {
@@ -71,16 +71,13 @@ using ThreadDataAllocator = ThreadDataArray::AllocatorType;
// by the ThreadData array. This lets us host the buffers, allocators, and tries
// associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
-static typename std::aligned_storage<
- sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
+alignas(BufferQueue) static std::byte BufferQueueStorage[sizeof(BufferQueue)];
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
-static typename std::aligned_storage<sizeof(ThreadDataAllocator),
- alignof(ThreadDataAllocator)>::type
- ThreadDataAllocatorStorage;
-static typename std::aligned_storage<sizeof(ThreadDataArray),
- alignof(ThreadDataArray)>::type
- ThreadDataArrayStorage;
+alignas(ThreadDataAllocator) static std::byte
+ ThreadDataAllocatorStorage[sizeof(ThreadDataAllocator)];
+alignas(ThreadDataArray) static std::byte
+ ThreadDataArrayStorage[sizeof(ThreadDataArray)];
static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;
@@ -91,10 +88,10 @@ using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
-static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
- ProfileBuffersStorage;
-static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
- ProfileBufferArrayAllocatorStorage;
+alignas(ProfileBufferArray) static std::byte
+ ProfileBuffersStorage[sizeof(ProfileBufferArray)];
+alignas(ProfileBufferArrayAllocator) static std::byte
+ ProfileBufferArrayAllocatorStorage[sizeof(ProfileBufferArrayAllocator)];
static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;
@@ -382,8 +379,8 @@ XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
return {nullptr, 0};
static pthread_once_t Once = PTHREAD_ONCE_INIT;
- static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
- FileHeaderStorage;
+ alignas(XRayProfilingFileHeader) static std::byte
+ FileHeaderStorage[sizeof(XRayProfilingFileHeader)];
pthread_once(
&Once, +[]() XRAY_NEVER_INSTRUMENT {
new (&FileHeaderStorage) XRayProfilingFileHeader{};
diff --git a/compiler-rt/lib/xray/xray_profiling.cpp b/compiler-rt/lib/xray/xray_profiling.cpp
index 259ec65..e9ac2fd 100644
--- a/compiler-rt/lib/xray/xray_profiling.cpp
+++ b/compiler-rt/lib/xray/xray_profiling.cpp
@@ -48,17 +48,14 @@ static pthread_key_t ProfilingKey;
// We use a global buffer queue, which gets initialized once at initialisation
// time, and gets reset when profiling is "done".
-static std::aligned_storage<sizeof(BufferQueue), alignof(BufferQueue)>::type
- BufferQueueStorage;
+alignas(BufferQueue) static std::byte BufferQueueStorage[sizeof(BufferQueue)];
static BufferQueue *BQ = nullptr;
thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers;
-thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators),
- alignof(FunctionCallTrie::Allocators)>::type
- AllocatorsStorage;
-thread_local std::aligned_storage<sizeof(FunctionCallTrie),
- alignof(FunctionCallTrie)>::type
- FunctionCallTrieStorage;
+alignas(FunctionCallTrie::Allocators) thread_local std::byte
+ AllocatorsStorage[sizeof(FunctionCallTrie::Allocators)];
+alignas(FunctionCallTrie) thread_local std::byte
+ FunctionCallTrieStorage[sizeof(FunctionCallTrie)];
thread_local ProfilingData TLD{{0}, {0}};
thread_local atomic_uint8_t ReentranceGuard{0};
diff --git a/compiler-rt/lib/xray/xray_segmented_array.h b/compiler-rt/lib/xray/xray_segmented_array.h
index 6eb673e..3ab174b 100644
--- a/compiler-rt/lib/xray/xray_segmented_array.h
+++ b/compiler-rt/lib/xray/xray_segmented_array.h
@@ -56,8 +56,7 @@ public:
// kCacheLineSize-multiple segments, minus the size of two pointers.
//
// - Request cacheline-multiple sized elements from the allocator.
- static constexpr uint64_t AlignedElementStorageSize =
- sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type);
+ static constexpr uint64_t AlignedElementStorageSize = sizeof(T);
static constexpr uint64_t SegmentControlBlockSize = sizeof(Segment *) * 2;
diff --git a/compiler-rt/test/dfsan/release_shadow_space.c b/compiler-rt/test/dfsan/release_shadow_space.c
index 60dec98..0f0e1a9 100644
--- a/compiler-rt/test/dfsan/release_shadow_space.c
+++ b/compiler-rt/test/dfsan/release_shadow_space.c
@@ -3,9 +3,6 @@
// DFSAN_OPTIONS=no_huge_pages_for_shadow=false RUN: %clang_dfsan %s -DORIGIN_TRACKING -mllvm -dfsan-track-origins=1 -o %t && %run %t
// DFSAN_OPTIONS=no_huge_pages_for_shadow=true RUN: %clang_dfsan %s -DORIGIN_TRACKING -mllvm -dfsan-track-origins=1 -o %t && %run %t
-// This test is flaky right now: https://github.com/llvm/llvm-project/issues/91287
-// UNSUPPORTED: target={{.*}}
-
#include <assert.h>
#include <sanitizer/dfsan_interface.h>
#include <stdbool.h>
@@ -26,7 +23,11 @@ size_t get_rss_kb() {
char buf[256];
while (fgets(buf, sizeof(buf), f) != NULL) {
int64_t rss;
- if (sscanf(buf, "Rss: %ld kB", &rss) == 1)
+ // DFSan's sscanf is broken and doesn't check for ordinary characters in
+ // the format string, hence we use strstr as a secondary check
+ // (https://github.com/llvm/llvm-project/issues/94769).
+ if ((sscanf(buf, "Rss: %ld kB", &rss) == 1) &&
+ (strstr(buf, "Rss: ") != NULL))
ret += rss;
}
assert(feof(f));
@@ -73,6 +74,11 @@ int main(int argc, char **argv) {
before, after_mmap, after_mmap_and_set_label, after_fixed_mmap,
after_mmap_and_set_label2, after_munmap);
+ // This is orders of magnitude larger than we expect (typically < 10,000KB).
+ // It is a quick check to ensure that the RSS calculation function isn't
+ // egregriously wrong.
+ assert(before < 1000000);
+
const size_t mmap_cost_kb = map_size >> 10;
// Shadow space (1:1 with application memory)
const size_t mmap_shadow_cost_kb = sizeof(dfsan_label) * mmap_cost_kb;
diff --git a/compiler-rt/test/tsan/custom_mutex4.cpp b/compiler-rt/test/tsan/custom_mutex4.cpp
index 539a8be..f7dfab0 100644
--- a/compiler-rt/test/tsan/custom_mutex4.cpp
+++ b/compiler-rt/test/tsan/custom_mutex4.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx_tsan -O1 --std=c++11 %s -o %t && %run %t 2>&1 | FileCheck %s
+// RUN: %clangxx_tsan -O1 --std=c++17 %s -o %t && %run %t 2>&1 | FileCheck %s
#include "custom_mutex.h"
-#include <type_traits>
+#include <cstddef>
// Test that the destruction events of a mutex are ignored when the
// annotations request this.
@@ -12,14 +12,14 @@
// has run.
int main() {
- std::aligned_storage<sizeof(Mutex), alignof(Mutex)>::type mu1_store;
+ alignas(Mutex) std::byte mu1_store[sizeof(Mutex)];
Mutex* mu1 = reinterpret_cast<Mutex*>(&mu1_store);
new(&mu1_store) Mutex(false, __tsan_mutex_linker_init);
mu1->Lock();
mu1->~Mutex();
mu1->Unlock();
- std::aligned_storage<sizeof(Mutex), alignof(Mutex)>::type mu2_store;
+ alignas(Mutex) std::byte mu2_store[sizeof(Mutex)];
Mutex* mu2 = reinterpret_cast<Mutex*>(&mu2_store);
new(&mu2_store) Mutex(false, 0, __tsan_mutex_not_static);
mu2->Lock();
diff --git a/compiler-rt/test/tsan/custom_mutex5.cpp b/compiler-rt/test/tsan/custom_mutex5.cpp
index cb18b23..6d65829 100644
--- a/compiler-rt/test/tsan/custom_mutex5.cpp
+++ b/compiler-rt/test/tsan/custom_mutex5.cpp
@@ -1,20 +1,20 @@
-// RUN: %clangxx_tsan -O1 --std=c++11 %s -o %t && %deflake %run %t 2>&1 | FileCheck %s
+// RUN: %clangxx_tsan -O1 --std=c++17 %s -o %t && %deflake %run %t 2>&1 | FileCheck %s
#include "custom_mutex.h"
-#include <type_traits>
+#include <cstddef>
// Test that we detect the destruction of an in-use mutex when the
// thread annotations don't otherwise disable the check.
int main() {
- std::aligned_storage<sizeof(Mutex), alignof(Mutex)>::type mu1_store;
+ alignas(Mutex) std::byte mu1_store[sizeof(Mutex)];
Mutex* mu1 = reinterpret_cast<Mutex*>(&mu1_store);
new(&mu1_store) Mutex(false, 0);
mu1->Lock();
mu1->~Mutex();
mu1->Unlock();
- std::aligned_storage<sizeof(Mutex), alignof(Mutex)>::type mu2_store;
+ alignas(Mutex) std::byte mu2_store[sizeof(Mutex)];
Mutex* mu2 = reinterpret_cast<Mutex*>(&mu2_store);
new(&mu2_store)
Mutex(false, __tsan_mutex_not_static, __tsan_mutex_not_static);
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py
index 0e20cfb..17587b3f 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py
@@ -10,6 +10,7 @@ import abc
import imp
import os
import sys
+from enum import IntEnum
from pathlib import PurePath, Path
from collections import defaultdict, namedtuple
@@ -37,6 +38,26 @@ def _load_com_module():
VSBreakpoint = namedtuple("VSBreakpoint", "path, line, col, cond")
+# Visual Studio events.
+# https://learn.microsoft.com/en-us/dotnet/api/envdte.dbgeventreason?view=visualstudiosdk-2022
+class DbgEvent(IntEnum):
+ dbgEventReasonNone = 1
+ dbgEventReasonGo = 2
+ dbgEventReasonAttachProgram = 3
+ dbgEventReasonDetachProgram = 4
+ dbgEventReasonLaunchProgram = 5
+ dbgEventReasonEndProgram = 6
+ dbgEventReasonStopDebugging = 7
+ dbgEventReasonStep = 8
+ dbgEventReasonBreakpoint = 9
+ dbgEventReasonExceptionThrown = 10
+ dbgEventReasonExceptionNotHandled = 11
+ dbgEventReasonUserBreak = 12
+ dbgEventReasonContextSwitch = 13
+
+ first = dbgEventReasonNone
+ last = dbgEventReasonContextSwitch
+
class VisualStudio(
DebuggerBase, metaclass=abc.ABCMeta
): # pylint: disable=abstract-method
@@ -307,6 +328,20 @@ class VisualStudio(
)
)
+ def _translate_stop_reason(self, reason):
+ if reason == DbgEvent.dbgEventReasonNone:
+ return None
+ if reason == DbgEvent.dbgEventReasonBreakpoint:
+ return StopReason.BREAKPOINT
+ if reason == DbgEvent.dbgEventReasonStep:
+ return StopReason.STEP
+ if reason == DbgEvent.dbgEventReasonEndProgram:
+ return StopReason.PROGRAM_EXIT
+ if reason == DbgEvent.dbgEventReasonExceptionNotHandled:
+ return StopReason.ERROR
+ assert reason <= DbgEvent.last and reason >= DbgEvent.first
+ return StopReason.OTHER
+
def _get_step_info(self, watches, step_index):
thread = self._debugger.CurrentThread
stackframes = thread.StackFrames
@@ -347,16 +382,13 @@ class VisualStudio(
frames[0].loc = loc
state_frames[0].location = SourceLocation(**self._location)
- reason = StopReason.BREAKPOINT
- if loc.path is None: # pylint: disable=no-member
- reason = StopReason.STEP
-
+ stop_reason = self._translate_stop_reason(self._debugger.LastBreakReason)
program_state = ProgramState(frames=state_frames)
return StepIR(
step_index=step_index,
frames=frames,
- stop_reason=reason,
+ stop_reason=stop_reason,
program_state=program_state,
)
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h b/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h
index 132592a..5f14d77 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h
@@ -32,6 +32,12 @@ mlir::Value genLboundDim(fir::FirOpBuilder &builder, mlir::Location loc,
void genUbound(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value resultBox, mlir::Value array, mlir::Value kind);
+/// Generate call to `Shape` runtime routine.
+/// First argument is a raw pointer to the result array storage that
+/// must be allocated by the caller.
+void genShape(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value resultAddr, mlir::Value arrayt, mlir::Value kind);
+
/// Generate call to `Size` runtime routine. This routine is a specialized
/// version when the DIM argument is not specified by the user.
mlir::Value genSize(fir::FirOpBuilder &builder, mlir::Location loc,
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 4e978e6..9fa819e 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -55,6 +55,7 @@ namespace fir {
#define GEN_PASS_DECL_OMPMARKDECLARETARGETPASS
#define GEN_PASS_DECL_OMPFUNCTIONFILTERING
#define GEN_PASS_DECL_VSCALEATTR
+#define GEN_PASS_DECL_FUNCTIONATTR
#include "flang/Optimizer/Transforms/Passes.h.inc"
std::unique_ptr<mlir::Pass> createAffineDemotionPass();
@@ -75,17 +76,6 @@ std::unique_ptr<mlir::Pass> createVScaleAttrPass();
std::unique_ptr<mlir::Pass>
createVScaleAttrPass(std::pair<unsigned, unsigned> vscaleAttr);
-struct FunctionAttrTypes {
- mlir::LLVM::framePointerKind::FramePointerKind framePointerKind =
- mlir::LLVM::framePointerKind::FramePointerKind::None;
-};
-
-std::unique_ptr<mlir::Pass> createFunctionAttrPass();
-std::unique_ptr<mlir::Pass>
-createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath,
- bool noNaNsFPMath, bool approxFuncFPMath,
- bool noSignedZerosFPMath, bool unsafeFPMath);
-
void populateCfgConversionRewrites(mlir::RewritePatternSet &patterns,
bool forceLoopToExecuteOnce = false,
bool setNSW = false);
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 4e281e28..7a3baca 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -394,7 +394,6 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> {
"bool", /*default=*/"false",
"Set the unsafe-fp-math attribute on functions in the module.">,
];
- let constructor = "::fir::createFunctionAttrPass()";
}
def AssumedRankOpConversion : Pass<"fir-assumed-rank-op", "mlir::ModuleOp"> {
diff --git a/flang/include/flang/Runtime/inquiry.h b/flang/include/flang/Runtime/inquiry.h
index 7161d1e..dde6e72 100644
--- a/flang/include/flang/Runtime/inquiry.h
+++ b/flang/include/flang/Runtime/inquiry.h
@@ -24,8 +24,11 @@ extern "C" {
std::int64_t RTDECL(LboundDim)(const Descriptor &array, int dim,
const char *sourceFile = nullptr, int line = 0);
-void RTDECL(Shape)(void *result, const Descriptor &array, int kind);
+void RTDECL(Lbound)(void *result, const Descriptor &array, int kind,
+ const char *sourceFile = nullptr, int line = 0);
+void RTDECL(Shape)(void *result, const Descriptor &array, int kind,
+ const char *sourceFile = nullptr, int line = 0);
std::int64_t RTDECL(Size)(
const Descriptor &array, const char *sourceFile = nullptr, int line = 0);
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 528c51d..2a0cfc0 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -372,24 +372,22 @@ inline void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
pm.addPass(fir::createVScaleAttr({{config.VScaleMin, config.VScaleMax}}));
// Add function attributes
- fir::FunctionAttrTypes functionAttrs;
+ mlir::LLVM::framePointerKind::FramePointerKind framePointerKind;
if (config.FramePointerKind != llvm::FramePointerKind::None ||
config.NoInfsFPMath || config.NoNaNsFPMath || config.ApproxFuncFPMath ||
config.NoSignedZerosFPMath || config.UnsafeFPMath) {
if (config.FramePointerKind == llvm::FramePointerKind::NonLeaf)
- functionAttrs.framePointerKind =
+ framePointerKind =
mlir::LLVM::framePointerKind::FramePointerKind::NonLeaf;
else if (config.FramePointerKind == llvm::FramePointerKind::All)
- functionAttrs.framePointerKind =
- mlir::LLVM::framePointerKind::FramePointerKind::All;
+ framePointerKind = mlir::LLVM::framePointerKind::FramePointerKind::All;
else
- functionAttrs.framePointerKind =
- mlir::LLVM::framePointerKind::FramePointerKind::None;
+ framePointerKind = mlir::LLVM::framePointerKind::FramePointerKind::None;
- pm.addPass(fir::createFunctionAttrPass(functionAttrs, config.NoInfsFPMath,
+ pm.addPass(fir::createFunctionAttr({framePointerKind, config.NoInfsFPMath,
config.NoNaNsFPMath, config.ApproxFuncFPMath,
- config.NoSignedZerosFPMath, config.UnsafeFPMath));
+ config.NoSignedZerosFPMath, config.UnsafeFPMath}));
}
fir::addFIRToLLVMPass(pm, config);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 512c7a3..202efa5 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -104,7 +104,7 @@ struct IncrementLoopInfo {
bool hasLocalitySpecs() const {
return !localSymList.empty() || !localInitSymList.empty() ||
- !sharedSymList.empty();
+ !reduceSymList.empty() || !sharedSymList.empty();
}
// Data members common to both structured and unstructured loops.
@@ -116,6 +116,9 @@ struct IncrementLoopInfo {
bool isUnordered; // do concurrent, forall
llvm::SmallVector<const Fortran::semantics::Symbol *> localSymList;
llvm::SmallVector<const Fortran::semantics::Symbol *> localInitSymList;
+ llvm::SmallVector<
+ std::pair<fir::ReduceOperationEnum, const Fortran::semantics::Symbol *>>
+ reduceSymList;
llvm::SmallVector<const Fortran::semantics::Symbol *> sharedSymList;
mlir::Value loopVariable = nullptr;
@@ -1741,6 +1744,35 @@ private:
builder->create<fir::UnreachableOp>(loc);
}
+ fir::ReduceOperationEnum
+ getReduceOperationEnum(const Fortran::parser::ReductionOperator &rOpr) {
+ switch (rOpr.v) {
+ case Fortran::parser::ReductionOperator::Operator::Plus:
+ return fir::ReduceOperationEnum::Add;
+ case Fortran::parser::ReductionOperator::Operator::Multiply:
+ return fir::ReduceOperationEnum::Multiply;
+ case Fortran::parser::ReductionOperator::Operator::And:
+ return fir::ReduceOperationEnum::AND;
+ case Fortran::parser::ReductionOperator::Operator::Or:
+ return fir::ReduceOperationEnum::OR;
+ case Fortran::parser::ReductionOperator::Operator::Eqv:
+ return fir::ReduceOperationEnum::EQV;
+ case Fortran::parser::ReductionOperator::Operator::Neqv:
+ return fir::ReduceOperationEnum::NEQV;
+ case Fortran::parser::ReductionOperator::Operator::Max:
+ return fir::ReduceOperationEnum::MAX;
+ case Fortran::parser::ReductionOperator::Operator::Min:
+ return fir::ReduceOperationEnum::MIN;
+ case Fortran::parser::ReductionOperator::Operator::Iand:
+ return fir::ReduceOperationEnum::IAND;
+ case Fortran::parser::ReductionOperator::Operator::Ior:
+ return fir::ReduceOperationEnum::IOR;
+ case Fortran::parser::ReductionOperator::Operator::Ieor:
+ return fir::ReduceOperationEnum::EIOR;
+ }
+ llvm_unreachable("illegal reduction operator");
+ }
+
/// Collect DO CONCURRENT or FORALL loop control information.
IncrementLoopNestInfo getConcurrentControl(
const Fortran::parser::ConcurrentHeader &header,
@@ -1763,6 +1795,16 @@ private:
std::get_if<Fortran::parser::LocalitySpec::LocalInit>(&x.u))
for (const Fortran::parser::Name &x : localInitList->v)
info.localInitSymList.push_back(x.symbol);
+ if (const auto *reduceList =
+ std::get_if<Fortran::parser::LocalitySpec::Reduce>(&x.u)) {
+ fir::ReduceOperationEnum reduce_operation = getReduceOperationEnum(
+ std::get<Fortran::parser::ReductionOperator>(reduceList->t));
+ for (const Fortran::parser::Name &x :
+ std::get<std::list<Fortran::parser::Name>>(reduceList->t)) {
+ info.reduceSymList.push_back(
+ std::make_pair(reduce_operation, x.symbol));
+ }
+ }
if (const auto *sharedList =
std::get_if<Fortran::parser::LocalitySpec::Shared>(&x.u))
for (const Fortran::parser::Name &x : sharedList->v)
@@ -1955,9 +1997,23 @@ private:
mlir::Type loopVarType = info.getLoopVariableType();
mlir::Value loopValue;
if (info.isUnordered) {
+ llvm::SmallVector<mlir::Value> reduceOperands;
+ llvm::SmallVector<mlir::Attribute> reduceAttrs;
+ // Create DO CONCURRENT reduce operands and attributes
+ for (const auto &reduceSym : info.reduceSymList) {
+ const fir::ReduceOperationEnum reduce_operation = reduceSym.first;
+ const Fortran::semantics::Symbol *sym = reduceSym.second;
+ fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr);
+ reduceOperands.push_back(fir::getBase(exv));
+ auto reduce_attr =
+ fir::ReduceAttr::get(builder->getContext(), reduce_operation);
+ reduceAttrs.push_back(reduce_attr);
+ }
// The loop variable value is explicitly updated.
info.doLoop = builder->create<fir::DoLoopOp>(
- loc, lowerValue, upperValue, stepValue, /*unordered=*/true);
+ loc, lowerValue, upperValue, stepValue, /*unordered=*/true,
+ /*finalCountValue=*/false, /*iterArgs=*/std::nullopt,
+ llvm::ArrayRef<mlir::Value>(reduceOperands), reduceAttrs);
builder->setInsertionPointToStart(info.doLoop.getBody());
loopValue = builder->createConvert(loc, loopVarType,
info.doLoop.getInductionVar());
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 31e2c40..8d0ae2f 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -882,10 +882,15 @@ static fir::ExtendedValue translateVariableToExtendedValue(
mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity variable,
bool forceHlfirBase = false, bool contiguousHint = false) {
assert(variable.isVariable() && "must be a variable");
- /// When going towards FIR, use the original base value to avoid
- /// introducing descriptors at runtime when they are not required.
- mlir::Value base =
- forceHlfirBase ? variable.getBase() : variable.getFirBase();
+ // When going towards FIR, use the original base value to avoid
+ // introducing descriptors at runtime when they are not required.
+ // This is not done for assumed-rank since the fir::ExtendedValue cannot
+ // held the related lower bounds in an vector. The lower bounds of the
+ // descriptor must always be used instead.
+
+ mlir::Value base = (forceHlfirBase || variable.isAssumedRank())
+ ? variable.getBase()
+ : variable.getFirBase();
if (variable.isMutableBox())
return fir::MutableBoxValue(base, getExplicitTypeParams(variable),
fir::MutableProperties{});
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 861b26d..b3e1ee3 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -5992,15 +5992,45 @@ mlir::Value IntrinsicLibrary::genSetExponent(mlir::Type resultType,
fir::getBase(args[1])));
}
+/// Generate runtime call to inquire about all the bounds/extents of an
+/// assumed-rank array.
+template <typename Func>
+static fir::ExtendedValue genAssumedRankBoundInquiry(
+ fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args, int kindPos, Func genRtCall) {
+ const fir::ExtendedValue &array = args[0];
+ // Allocate an array with the maximum rank, that is big enough to hold the
+ // result but still "small" (15 elements). Static size alloca make stack
+ // analysis/manipulation easier.
+ mlir::Type resultElementType = fir::unwrapSequenceType(resultType);
+ mlir::Type allocSeqType =
+ fir::SequenceType::get({Fortran::common::maxRank}, resultElementType);
+ mlir::Value resultStorage = builder.createTemporary(loc, allocSeqType);
+ mlir::Value arrayBox = builder.createBox(loc, array);
+ mlir::Value kind = isStaticallyAbsent(args, kindPos)
+ ? builder.createIntegerConstant(
+ loc, builder.getI32Type(),
+ builder.getKindMap().defaultIntegerKind())
+ : fir::getBase(args[kindPos]);
+ genRtCall(builder, loc, resultStorage, arrayBox, kind);
+ mlir::Type baseType =
+ fir::ReferenceType::get(builder.getVarLenSeqTy(resultElementType));
+ mlir::Value resultBase = builder.createConvert(loc, baseType, resultStorage);
+ mlir::Value rank =
+ builder.create<fir::BoxRankOp>(loc, builder.getIndexType(), arrayBox);
+ return fir::ArrayBoxValue{resultBase, {rank}};
+}
+
// SHAPE
fir::ExtendedValue
IntrinsicLibrary::genShape(mlir::Type resultType,
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() >= 1);
const fir::ExtendedValue &array = args[0];
+ if (array.hasAssumedRank())
+ return genAssumedRankBoundInquiry(builder, loc, resultType, args,
+ /*kindPos=*/1, fir::runtime::genShape);
int rank = array.rank();
- if (rank == 0)
- TODO(loc, "shape intrinsic lowering with assumed-rank source");
mlir::Type indexType = builder.getIndexType();
mlir::Type extentType = fir::unwrapSequenceType(resultType);
mlir::Type seqType = fir::SequenceType::get(
diff --git a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp
index 16f63be..34c4020 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp
@@ -87,3 +87,17 @@ mlir::Value fir::runtime::genIsContiguous(fir::FirOpBuilder &builder,
auto args = fir::runtime::createArguments(builder, loc, fTy, array);
return builder.create<fir::CallOp>(loc, isContiguousFunc, args).getResult(0);
}
+
+void fir::runtime::genShape(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value resultAddr, mlir::Value array,
+ mlir::Value kind) {
+ mlir::func::FuncOp func =
+ fir::runtime::getRuntimeFunc<mkRTKey(Shape)>(loc, builder);
+ auto fTy = func.getFunctionType();
+ auto sourceFile = fir::factory::locationToFilename(builder, loc);
+ auto sourceLine =
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
+ auto args = fir::runtime::createArguments(
+ builder, loc, fTy, resultAddr, array, kind, sourceFile, sourceLine);
+ builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
diff --git a/flang/lib/Optimizer/Transforms/FunctionAttr.cpp b/flang/lib/Optimizer/Transforms/FunctionAttr.cpp
index f54080f..69c4159 100644
--- a/flang/lib/Optimizer/Transforms/FunctionAttr.cpp
+++ b/flang/lib/Optimizer/Transforms/FunctionAttr.cpp
@@ -15,7 +15,6 @@
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
namespace fir {
-#define GEN_PASS_DECL_FUNCTIONATTR
#define GEN_PASS_DEF_FUNCTIONATTR
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir
@@ -76,22 +75,3 @@ void FunctionAttrPass::runOnOperation() {
LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
}
-
-std::unique_ptr<mlir::Pass> fir::createFunctionAttrPass(
- fir::FunctionAttrTypes &functionAttr, bool noInfsFPMath, bool noNaNsFPMath,
- bool approxFuncFPMath, bool noSignedZerosFPMath, bool unsafeFPMath) {
- FunctionAttrOptions opts;
- // Frame pointer
- opts.framePointerKind = functionAttr.framePointerKind;
- opts.noInfsFPMath = noInfsFPMath;
- opts.noNaNsFPMath = noNaNsFPMath;
- opts.approxFuncFPMath = approxFuncFPMath;
- opts.noSignedZerosFPMath = noSignedZerosFPMath;
- opts.unsafeFPMath = unsafeFPMath;
-
- return std::make_unique<FunctionAttrPass>(opts);
-}
-
-std::unique_ptr<mlir::Pass> fir::createFunctionAttrPass() {
- return std::make_unique<FunctionAttrPass>();
-}
diff --git a/flang/runtime/inquiry.cpp b/flang/runtime/inquiry.cpp
index ea11417..5ffd975 100644
--- a/flang/runtime/inquiry.cpp
+++ b/flang/runtime/inquiry.cpp
@@ -85,8 +85,9 @@ std::int64_t RTDEF(SizeDim)(
return static_cast<std::int64_t>(dimension.Extent());
}
-void RTDEF(Shape)(void *result, const Descriptor &array, int kind) {
- Terminator terminator{__FILE__, __LINE__};
+void RTDEF(Shape)(void *result, const Descriptor &array, int kind,
+ const char *sourceFile, int line) {
+ Terminator terminator{sourceFile, line};
INTERNAL_CHECK(array.rank() <= common::maxRank);
for (SubscriptValue i{0}; i < array.rank(); ++i) {
const Dimension &dimension{array.GetDimension(i)};
@@ -95,5 +96,16 @@ void RTDEF(Shape)(void *result, const Descriptor &array, int kind) {
}
}
+void RTDEF(Lbound)(void *result, const Descriptor &array, int kind,
+ const char *sourceFile, int line) {
+ Terminator terminator{sourceFile, line};
+ INTERNAL_CHECK(array.rank() <= common::maxRank);
+ for (SubscriptValue i{0}; i < array.rank(); ++i) {
+ const Dimension &dimension{array.GetDimension(i)};
+ Fortran::runtime::ApplyIntegerKind<RawStoreIntegerAt, void>(
+ kind, terminator, result, i, dimension.LowerBound());
+ }
+}
+
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90
index 353e944..6c8f5ba 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90
@@ -29,7 +29,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_size_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
-! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_2]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
+! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
! CHECK: %[[VAL_7:.*]] = fir.call @_FortranASize(%[[VAL_5]]
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i64) -> i32
! CHECK: %[[VAL_9:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
@@ -49,13 +49,13 @@ end subroutine
! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64
! CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (i32) {
-! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
+! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
! CHECK: %[[VAL_13:.*]] = fir.call @_FortranASize(%[[VAL_11]]
! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i64) -> i32
! CHECK: fir.result %[[VAL_14]] : i32
! CHECK: } else {
! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
+! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
! CHECK: %[[VAL_20:.*]] = fir.call @_FortranASizeDim(%[[VAL_18]]
! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i64) -> i32
! CHECK: fir.result %[[VAL_21]] : i32
@@ -76,13 +76,13 @@ end subroutine
! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64
! CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (i32) {
-! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
+! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
! CHECK: %[[VAL_13:.*]] = fir.call @_FortranASize(%[[VAL_11]],
! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i64) -> i32
! CHECK: fir.result %[[VAL_14]] : i32
! CHECK: } else {
! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<i32>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
+! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
! CHECK: %[[VAL_20:.*]] = fir.call @_FortranASizeDim(%[[VAL_18]]
! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i64) -> i32
! CHECK: fir.result %[[VAL_21]] : i32
@@ -97,7 +97,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_size_4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.box<none>
! CHECK: %[[VAL_8:.*]] = fir.call @_FortranASize(%[[VAL_6]]
! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i64) -> i32
diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90
new file mode 100644
index 0000000..bbeff5f
--- /dev/null
+++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90
@@ -0,0 +1,56 @@
+! Test shape lowering for assumed-rank
+! RUN: bbc -emit-hlfir -o - %s -allow-assumed-rank | FileCheck %s
+
+subroutine test_shape(x)
+ real :: x(..)
+ call takes_integer_array(shape(x))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_shape(
+! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<15xi32>
+! CHECK: %[[VAL_4:.*]] = arith.constant 4 : i32
+! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi32>>) -> !fir.llvm_ptr<i8>
+! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
+! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAShape(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}})
+! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi32>>) -> !fir.ref<!fir.array<?xi32>>
+! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box<!fir.array<*:f32>>) -> index
+! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_13]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
+! CHECK: %[[VAL_15:.*]] = arith.constant false
+! CHECK: %[[VAL_16:.*]] = hlfir.as_expr %[[VAL_14]]#0 move %[[VAL_15]] : (!fir.box<!fir.array<?xi32>>, i1) -> !hlfir.expr<?xi32>
+! CHECK: %[[VAL_17:.*]]:3 = hlfir.associate %[[VAL_16]](%[[VAL_13]]) {adapt.valuebyref} : (!hlfir.expr<?xi32>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>, i1)
+! CHECK: fir.call @_QPtakes_integer_array(%[[VAL_17]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> ()
+! CHECK: hlfir.end_associate %[[VAL_17]]#1, %[[VAL_17]]#2 : !fir.ref<!fir.array<?xi32>>, i1
+! CHECK: hlfir.destroy %[[VAL_16]] : !hlfir.expr<?xi32>
+! CHECK: return
+! CHECK: }
+
+subroutine test_shape_kind(x)
+ real :: x(..)
+ call takes_integer8_array(shape(x, kind=8))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_shape_kind(
+! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<15xi64>
+! CHECK: %[[VAL_4:.*]] = arith.constant 8 : i32
+! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi64>>) -> !fir.llvm_ptr<i8>
+! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
+! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAShape(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}})
+! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi64>>) -> !fir.ref<!fir.array<?xi64>>
+! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box<!fir.array<*:f32>>) -> index
+! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_13]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.array<?xi64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi64>>, !fir.ref<!fir.array<?xi64>>)
+
+subroutine test_shape_2(x)
+ real, pointer :: x(..)
+ call takes_integer_array(shape(x))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_shape_2(
+! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<15xi32>
+! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3:.*]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
+! CHECK: %[[VAL_5:.*]] = arith.constant 4 : i32
+! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi32>>) -> !fir.llvm_ptr<i8>
+! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none>
+! CHECK: %[[VAL_11:.*]] = fir.call @_FortranAShape(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]], %{{.*}}, %{{.*}})
+! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi32>>) -> !fir.ref<!fir.array<?xi32>>
+! CHECK: %[[VAL_13:.*]] = fir.box_rank %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> index
+! CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_14]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90
index e8610aa..a1d150a 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90
@@ -99,7 +99,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatedEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.heap<!fir.array<*:f32>>
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap<!fir.array<*:f32>>) -> i64
! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
@@ -115,7 +115,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_1Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.ptr<!fir.array<*:f32>>
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr<!fir.array<*:f32>>) -> i64
! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
@@ -133,7 +133,7 @@ end subroutine
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_associated_2Ey"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
-! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
+! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAPointerIsAssociatedWith(%[[VAL_6]], %[[VAL_7]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
@@ -150,8 +150,8 @@ end subroutine
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ey"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
-! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
+! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
+! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none>
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none>
! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAPointerIsAssociatedWith(%[[VAL_7]], %[[VAL_8]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
@@ -166,7 +166,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_len_1Ex"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
-! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#1 : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> index
+! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#0 : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> index
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (index) -> i32
! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
! CHECK: fir.call @_QPtakes_integer(%[[VAL_5]]#1) fastmath<contract> : (!fir.ref<i32>) -> ()
@@ -191,7 +191,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_storage_size_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
-! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#1 : (!fir.class<!fir.array<*:none>>) -> i32
+! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> i32
! CHECK: %[[VAL_4:.*]] = arith.constant 8 : i32
! CHECK: %[[VAL_5:.*]] = arith.muli %[[VAL_3]], %[[VAL_4]] : i32
! CHECK: %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_5]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
@@ -204,7 +204,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_storage_size_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.ptr<!fir.array<*:none>>
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr<!fir.array<*:none>>) -> i64
! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
@@ -212,7 +212,7 @@ end subroutine
! CHECK: fir.if %[[VAL_7]] {
! CHECK: %[[VAL_13:.*]] = fir.call @_FortranAReportFatalUserError
! CHECK: }
-! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
+! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
! CHECK: %[[VAL_15:.*]] = fir.box_elesize %[[VAL_14]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> i32
! CHECK: %[[VAL_16:.*]] = arith.constant 8 : i32
! CHECK: %[[VAL_17:.*]] = arith.muli %[[VAL_15]], %[[VAL_16]] : i32
@@ -226,7 +226,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x", fir.optional}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_present_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
-! CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.class<!fir.array<*:none>>) -> i1
+! CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> i1
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1)
! CHECK: fir.call @_QPtakes_logical(%[[VAL_5]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> ()
@@ -238,7 +238,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x", fir.optional}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QFtest_present_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
-! CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) -> i1
+! CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) -> i1
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1)
! CHECK: fir.call @_QPtakes_logical(%[[VAL_5]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> ()
@@ -250,7 +250,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_is_contiguous_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
-! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
+! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
! CHECK: %[[VAL_4:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_3]]) fastmath<contract> : (!fir.box<none>) -> i1
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4>
! CHECK: %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_5]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1)
@@ -263,7 +263,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_is_contiguous_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
! CHECK: %[[VAL_5:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_4]]) fastmath<contract> : (!fir.box<none>) -> i1
! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
@@ -279,8 +279,8 @@ end subroutine
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_same_type_as_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_same_type_as_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
-! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
-! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
+! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
+! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
! CHECK: %[[VAL_7:.*]] = fir.call @_FortranASameTypeAs(%[[VAL_5]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
! CHECK: %[[VAL_9:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1)
@@ -295,8 +295,8 @@ end subroutine
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
-! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
-! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
+! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
+! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
! CHECK: %[[VAL_9:.*]] = fir.call @_FortranASameTypeAs(%[[VAL_7]], %[[VAL_8]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
@@ -313,8 +313,8 @@ end subroutine
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_extends_type_of_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_extends_type_of_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
-! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
-! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
+! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
+! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
! CHECK: %[[VAL_7:.*]] = fir.call @_FortranAExtendsTypeOf(%[[VAL_5]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
! CHECK: %[[VAL_9:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1)
@@ -329,8 +329,8 @@ end subroutine
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
-! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
-! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
+! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
+! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAExtendsTypeOf(%[[VAL_7]], %[[VAL_8]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
@@ -348,7 +348,7 @@ end subroutine
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
-! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
+! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.ref<!fir.array<*:f32>>) -> i64
! CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<i64>
! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
@@ -365,7 +365,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFc_loc_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
! CHECK: %[[VAL_5:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
! CHECK: %[[VAL_6:.*]] = fir.coordinate_of %[[VAL_4]], %[[VAL_5]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
diff --git a/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90
index cd65696..3b60b0d 100644
--- a/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90
+++ b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90
@@ -103,7 +103,7 @@ end subroutine
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable, intent_out>, uniq_name = "_QMassumed_rank_testsFtest_intentoutEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.heap<!fir.array<*:f32>>
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap<!fir.array<*:f32>>) -> i64
! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
@@ -111,7 +111,7 @@ end subroutine
! CHECK: fir.if %[[VAL_7]] {
! CHECK: %[[VAL_8:.*]] = arith.constant false
! CHECK: %[[VAL_9:.*]] = fir.absent !fir.box<none>
-! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_2]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> !fir.ref<!fir.box<none>>
+! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> !fir.ref<!fir.box<none>>
! CHECK: %[[VAL_14:.*]] = fir.call @_FortranAAllocatableDeallocate(%[[VAL_12]], %[[VAL_8]], %[[VAL_9]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
! CHECK: }
! CHECK: return
diff --git a/flang/test/Lower/HLFIR/select-rank.f90 b/flang/test/Lower/HLFIR/select-rank.f90
index 8498753..211b756 100644
--- a/flang/test/Lower/HLFIR/select-rank.f90
+++ b/flang/test/Lower/HLFIR/select-rank.f90
@@ -319,7 +319,7 @@ end subroutine
! CHECK: %[[VAL_6:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1
! CHECK: cf.cond_br %[[VAL_6]], ^bb1, ^bb2
! CHECK: ^bb1:
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]]#1 {uniq_name = "_QFtest_simple_caseEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]]#0 {uniq_name = "_QFtest_simple_caseEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
! CHECK: fir.call @_QPrdefault(%[[VAL_7]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
! CHECK: cf.br ^bb6
! CHECK: ^bb2:
@@ -362,7 +362,7 @@ end subroutine
! CHECK: fir.call @_QPr2(%[[VAL_8]]#0) fastmath<contract> : (!fir.box<!fir.array<?x?xf32>>) -> ()
! CHECK: cf.br ^bb6
! CHECK: ^bb3:
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]]#1 {uniq_name = "_QFtest_rank_starEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]]#0 {uniq_name = "_QFtest_rank_starEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
! CHECK: fir.call @_QPrdefault(%[[VAL_9]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
! CHECK: cf.br ^bb6
! CHECK: ^bb4:
@@ -431,7 +431,7 @@ end subroutine
! CHECK: fir.call @_QPr2(%[[VAL_7]]#0) fastmath<contract> : (!fir.box<!fir.array<?x?xf32>>) -> ()
! CHECK: cf.br ^bb5
! CHECK: ^bb3:
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_2]]#1 {fortran_attrs = #fir.var_attrs<asynchronous, target>, uniq_name = "_QFtest_rank_star_attributesEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_2]]#0 {fortran_attrs = #fir.var_attrs<asynchronous, target>, uniq_name = "_QFtest_rank_star_attributesEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
! CHECK: fir.call @_QPrdefault(%[[VAL_8]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
! CHECK: cf.br ^bb5
! CHECK: ^bb4:
@@ -469,7 +469,7 @@ end subroutine
! CHECK: fir.call @_QPr2_implicit(%[[VAL_14]]#1) fastmath<contract> : (!fir.ref<!fir.array<?x?xf32>>) -> ()
! CHECK: cf.br ^bb6
! CHECK: ^bb3:
-! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_2]]#1 {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_rank_star_contiguousEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_2]]#0 {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_rank_star_contiguousEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
! CHECK: fir.call @_QPrdefault(%[[VAL_15]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
! CHECK: cf.br ^bb6
! CHECK: ^bb4:
@@ -518,7 +518,7 @@ end subroutine
! CHECK: fir.call @_QPrc0_implicit(%[[VAL_16]]#0) fastmath<contract> : (!fir.boxchar<1>) -> ()
! CHECK: cf.br ^bb6
! CHECK: ^bb3:
-! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_8]]#1 typeparams %[[VAL_7]] {uniq_name = "_QFtest_rank_star_contiguous_characterEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
+! CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_8]]#0 typeparams %[[VAL_7]] {uniq_name = "_QFtest_rank_star_contiguous_characterEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
! CHECK: fir.call @_QPrcdefault(%[[VAL_17]]#0) fastmath<contract> : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> ()
! CHECK: cf.br ^bb6
! CHECK: ^bb4:
@@ -568,7 +568,7 @@ end subroutine
! CHECK: fir.call @_QPra0(%[[VAL_10]]#0) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> ()
! CHECK: cf.br ^bb5
! CHECK: ^bb3:
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]]#1 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_simple_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]]#0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_simple_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
! CHECK: fir.call @_QPradefault(%[[VAL_11]]#0) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> ()
! CHECK: cf.br ^bb5
! CHECK: ^bb4:
@@ -588,7 +588,7 @@ end subroutine
! CHECK: %[[VAL_4:.*]] = fir.box_rank %[[VAL_2]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> i8
! CHECK: fir.select_case %[[VAL_4]] : i8 [#fir.point, %[[VAL_3]], ^bb2, unit, ^bb1]
! CHECK: ^bb1:
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]]#1 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_character_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]]#0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_character_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>)
! CHECK: cf.br ^bb3
! CHECK: ^bb2:
! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
@@ -614,7 +614,7 @@ end subroutine
! CHECK: %[[VAL_10:.*]] = fir.box_rank %[[VAL_8]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> i8
! CHECK: fir.select_case %[[VAL_10]] : i8 [#fir.point, %[[VAL_9]], ^bb2, unit, ^bb1]
! CHECK: ^bb1:
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_8]]#1 typeparams %[[VAL_7]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_explicit_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, i64) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_8]]#0 typeparams %[[VAL_7]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_explicit_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, i64) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>)
! CHECK: cf.br ^bb3
! CHECK: ^bb2:
! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
@@ -634,7 +634,7 @@ end subroutine
! CHECK: %[[VAL_6:.*]] = fir.box_rank %[[VAL_4]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> i8
! CHECK: fir.select_case %[[VAL_6]] : i8 [#fir.point, %[[VAL_5]], ^bb2, unit, ^bb1]
! CHECK: ^bb1:
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_4]]#1 typeparams %[[VAL_3]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_assumed_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_3]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_assumed_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>)
! CHECK: cf.br ^bb3
! CHECK: ^bb2:
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
@@ -661,7 +661,7 @@ end subroutine
! CHECK: fir.call @_QPrup1(%[[VAL_8]]#0) fastmath<contract> : (!fir.class<!fir.array<?xnone>>) -> ()
! CHECK: cf.br ^bb5
! CHECK: ^bb3:
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]]#1 {uniq_name = "_QFtest_polymorphicEx"} : (!fir.class<!fir.array<*:none>>) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]]#0 {uniq_name = "_QFtest_polymorphicEx"} : (!fir.class<!fir.array<*:none>>) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
! CHECK: fir.call @_QPrupdefault(%[[VAL_9]]#0) fastmath<contract> : (!fir.class<!fir.array<*:none>>) -> ()
! CHECK: cf.br ^bb5
! CHECK: ^bb4:
@@ -711,7 +711,7 @@ end subroutine
! CHECK: fir.call @_QPr1(%[[VAL_20]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> ()
! CHECK: cf.br ^bb7
! CHECK: ^bb6:
-! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_4]]#1 {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_4]]#0 {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
! CHECK: fir.call @_QPr0(%[[VAL_11]]#1) fastmath<contract> : (!fir.ref<f32>) -> ()
! CHECK: fir.call @_QPrdefault(%[[VAL_21]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
! CHECK: cf.br ^bb7
@@ -741,14 +741,14 @@ end subroutine
! CHECK: fir.call @_QPr1(%[[VAL_32]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> ()
! CHECK: cf.br ^bb13
! CHECK: ^bb12:
-! CHECK: %[[VAL_33:.*]]:2 = hlfir.declare %[[VAL_4]]#1 {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_33:.*]]:2 = hlfir.declare %[[VAL_4]]#0 {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
! CHECK: fir.call @_QPr1(%[[VAL_23]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> ()
! CHECK: fir.call @_QPrdefault(%[[VAL_33]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
! CHECK: cf.br ^bb13
! CHECK: ^bb13:
! CHECK: cf.br ^bb20
! CHECK: ^bb14:
-! CHECK: %[[VAL_34:.*]]:2 = hlfir.declare %[[VAL_3]]#1 {uniq_name = "_QFtest_nested_select_rankEx1"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_34:.*]]:2 = hlfir.declare %[[VAL_3]]#0 {uniq_name = "_QFtest_nested_select_rankEx1"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
! CHECK: %[[VAL_35:.*]] = arith.constant 0 : i8
! CHECK: %[[VAL_36:.*]] = arith.constant 1 : i8
! CHECK: %[[VAL_37:.*]] = fir.is_assumed_size %[[VAL_4]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1
@@ -770,7 +770,7 @@ end subroutine
! CHECK: fir.call @_QPr1(%[[VAL_43]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> ()
! CHECK: cf.br ^bb19
! CHECK: ^bb18:
-! CHECK: %[[VAL_44:.*]]:2 = hlfir.declare %[[VAL_4]]#1 {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_44:.*]]:2 = hlfir.declare %[[VAL_4]]#0 {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
! CHECK: fir.call @_QPrdefault(%[[VAL_34]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
! CHECK: fir.call @_QPrdefault(%[[VAL_44]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
! CHECK: cf.br ^bb19
@@ -789,7 +789,7 @@ end subroutine
! CHECK: %[[VAL_5:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1
! CHECK: cf.cond_br %[[VAL_5]], ^bb1, ^bb2
! CHECK: ^bb1:
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]]#1 {uniq_name = "_QFtest_branchingEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]]#0 {uniq_name = "_QFtest_branchingEx"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
! CHECK: %[[VAL_7:.*]] = fir.call @_QPjump() fastmath<contract> : () -> !fir.logical<4>
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.logical<4>) -> i1
! CHECK: %[[VAL_9:.*]] = arith.constant true
diff --git a/flang/test/Lower/OpenMP/function-filtering-2.f90 b/flang/test/Lower/OpenMP/function-filtering-2.f90
index e1d0f72..f367069 100644
--- a/flang/test/Lower/OpenMP/function-filtering-2.f90
+++ b/flang/test/Lower/OpenMP/function-filtering-2.f90
@@ -1,6 +1,6 @@
-! RUN: %flang_fc1 -fopenmp -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefix=LLVM %s
+! RUN: %flang_fc1 -fopenmp -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM,LLVM-HOST %s
! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck --check-prefix=MLIR %s
-! RUN: %flang_fc1 -fopenmp -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefix=LLVM %s
+! RUN: %flang_fc1 -fopenmp -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM,LLVM-DEVICE %s
! RUN: %flang_fc1 -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefix=MLIR %s
! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
@@ -38,7 +38,7 @@ end subroutine no_declaretarget
! MLIR-ALL: return
! LLVM-HOST: define {{.*}} @{{.*}}main{{.*}}(
-! LLVM-HOST-NOT: {{.*}} @{{.*}}__omp_offloading{{.*}}main_{{.*}}(
+! LLVM-HOST: {{.*}} @{{.*}}__omp_offloading{{.*}}main_{{.*}}(
! LLVM-DEVICE-NOT: {{.*}} @{{.*}}main{{.*}}(
! LLVM-DEVICE: define {{.*}} @{{.*}}__omp_offloading{{.*}}main_{{.*}}(
program main
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
new file mode 100644
index 0000000..2e62ee4
--- /dev/null
+++ b/flang/test/Lower/loops3.f90
@@ -0,0 +1,23 @@
+! Test do concurrent reduction
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+
+! CHECK-LABEL: loop_test
+subroutine loop_test
+ integer(4) :: i, j, k, tmp, sum = 0
+ real :: m
+
+ i = 100
+ j = 200
+ k = 300
+
+ ! CHECK: %[[VAL_0:.*]] = fir.alloca f32 {bindc_name = "m", uniq_name = "_QFloop_testEm"}
+ ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFloop_testEsum) : !fir.ref<i32>
+ ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered {
+ ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered {
+ ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr<add> -> %[[VAL_1:.*]] : !fir.ref<i32>, #fir.reduce_attr<max> -> %[[VAL_0:.*]] : !fir.ref<f32>) {
+ do concurrent (i=1:5, j=1:5, k=1:5) local(tmp) reduce(+:sum) reduce(max:m)
+ tmp = i + j + k
+ sum = tmp + sum
+ m = max(m, sum)
+ enddo
+end subroutine loop_test
diff --git a/flang/test/Transforms/debug-local-var-2.f90 b/flang/test/Transforms/debug-local-var-2.f90
index 0fe1b81..ee60a07 100644
--- a/flang/test/Transforms/debug-local-var-2.f90
+++ b/flang/test/Transforms/debug-local-var-2.f90
@@ -20,20 +20,20 @@
! CHECK-LABEL: define {{.*}}i64 @_QFPfn1
! CHECK-SAME: (ptr %[[ARG1:.*]], ptr %[[ARG2:.*]], ptr %[[ARG3:.*]])
-! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG1]], metadata ![[A1:.*]], metadata !DIExpression())
-! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG2]], metadata ![[B1:.*]], metadata !DIExpression())
-! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG3]], metadata ![[C1:.*]], metadata !DIExpression())
+! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[ARG1]], metadata ![[A1:.*]], metadata !DIExpression())
+! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[ARG2]], metadata ![[B1:.*]], metadata !DIExpression())
+! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[ARG3]], metadata ![[C1:.*]], metadata !DIExpression())
! CHECK-DAG: %[[AL2:.*]] = alloca i64
-! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL2]], metadata ![[RES1:.*]], metadata !DIExpression())
+! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL2]], metadata ![[RES1:.*]], metadata !DIExpression())
! CHECK-LABEL: }
! CHECK-LABEL: define {{.*}}i32 @_QFPfn2
! CHECK-SAME: (ptr %[[FN2ARG1:.*]], ptr %[[FN2ARG2:.*]], ptr %[[FN2ARG3:.*]])
-! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG1]], metadata ![[A2:.*]], metadata !DIExpression())
-! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG2]], metadata ![[B2:.*]], metadata !DIExpression())
-! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG3]], metadata ![[C2:.*]], metadata !DIExpression())
+! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[FN2ARG1]], metadata ![[A2:.*]], metadata !DIExpression())
+! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[FN2ARG2]], metadata ![[B2:.*]], metadata !DIExpression())
+! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[FN2ARG3]], metadata ![[C2:.*]], metadata !DIExpression())
! CHECK-DAG: %[[AL3:.*]] = alloca i32
-! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL3]], metadata ![[RES2:.*]], metadata !DIExpression())
+! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL3]], metadata ![[RES2:.*]], metadata !DIExpression())
! CHECK-LABEL: }
program mn
diff --git a/flang/unittests/Runtime/Inquiry.cpp b/flang/unittests/Runtime/Inquiry.cpp
index 665a930..98a350d 100644
--- a/flang/unittests/Runtime/Inquiry.cpp
+++ b/flang/unittests/Runtime/Inquiry.cpp
@@ -14,7 +14,7 @@
using namespace Fortran::runtime;
using Fortran::common::TypeCategory;
-TEST(Inquiry, Lbound) {
+TEST(Inquiry, LboundDim) {
// ARRAY 1 3 5
// 2 4 6
auto array{MakeArray<TypeCategory::Integer, 4>(
@@ -26,6 +26,42 @@ TEST(Inquiry, Lbound) {
EXPECT_EQ(RTNAME(LboundDim)(*array, 2, __FILE__, __LINE__), std::int64_t{-1});
}
+TEST(Inquiry, Lbound) {
+ // ARRAY 1 3 5
+ // 2 4 6
+ auto array{MakeArray<TypeCategory::Integer, 4>(
+ std::vector<int>{2, 3}, std::vector<std::int32_t>{1, 2, 3, 4, 5, 6})};
+ array->GetDimension(0).SetLowerBound(0);
+ array->GetDimension(1).SetLowerBound(-1);
+
+ // LBOUND(ARRAY, KIND=1)
+ auto int8Result{
+ MakeArray<TypeCategory::Integer, 1>(std::vector<int>{array->rank()},
+ std::vector<std::int8_t>(array->rank(), 0))};
+ RTNAME(Lbound)
+ (int8Result->raw().base_addr, *array, /*KIND=*/1, __FILE__, __LINE__);
+ EXPECT_EQ(*int8Result->ZeroBasedIndexedElement<std::int8_t>(0), 0);
+ EXPECT_EQ(*int8Result->ZeroBasedIndexedElement<std::int8_t>(1), -1);
+
+ // LBOUND(ARRAY, KIND=4)
+ auto int32Result{
+ MakeArray<TypeCategory::Integer, 4>(std::vector<int>{array->rank()},
+ std::vector<std::int32_t>(array->rank(), 0))};
+ RTNAME(Lbound)
+ (int32Result->raw().base_addr, *array, /*KIND=*/4, __FILE__, __LINE__);
+ EXPECT_EQ(*int32Result->ZeroBasedIndexedElement<std::int32_t>(0), 0);
+ EXPECT_EQ(*int32Result->ZeroBasedIndexedElement<std::int32_t>(1), -1);
+
+ // LBOUND(ARRAY, KIND=8)
+ auto int64Result{
+ MakeArray<TypeCategory::Integer, 8>(std::vector<int>{array->rank()},
+ std::vector<std::int64_t>(array->rank(), 0))};
+ RTNAME(Lbound)
+ (int64Result->raw().base_addr, *array, /*KIND=*/8, __FILE__, __LINE__);
+ EXPECT_EQ(*int64Result->ZeroBasedIndexedElement<std::int64_t>(0), 0);
+ EXPECT_EQ(*int64Result->ZeroBasedIndexedElement<std::int64_t>(1), -1);
+}
+
TEST(Inquiry, Ubound) {
// ARRAY 1 3 5
// 2 4 6
@@ -87,7 +123,8 @@ TEST(Inquiry, Shape) {
auto int8Result{
MakeArray<TypeCategory::Integer, 1>(std::vector<int>{array->rank()},
std::vector<std::int8_t>(array->rank(), 0))};
- RTNAME(Shape)(int8Result->raw().base_addr, *array, /*KIND=*/1);
+ RTNAME(Shape)
+ (int8Result->raw().base_addr, *array, /*KIND=*/1, __FILE__, __LINE__);
EXPECT_EQ(*int8Result->ZeroBasedIndexedElement<std::int8_t>(0), 2);
EXPECT_EQ(*int8Result->ZeroBasedIndexedElement<std::int8_t>(1), 3);
@@ -95,7 +132,8 @@ TEST(Inquiry, Shape) {
auto int32Result{
MakeArray<TypeCategory::Integer, 4>(std::vector<int>{array->rank()},
std::vector<std::int32_t>(array->rank(), 0))};
- RTNAME(Shape)(int32Result->raw().base_addr, *array, /*KIND=*/4);
+ RTNAME(Shape)
+ (int32Result->raw().base_addr, *array, /*KIND=*/4, __FILE__, __LINE__);
EXPECT_EQ(*int32Result->ZeroBasedIndexedElement<std::int32_t>(0), 2);
EXPECT_EQ(*int32Result->ZeroBasedIndexedElement<std::int32_t>(1), 3);
@@ -103,7 +141,8 @@ TEST(Inquiry, Shape) {
auto int64Result{
MakeArray<TypeCategory::Integer, 8>(std::vector<int>{array->rank()},
std::vector<std::int64_t>(array->rank(), 0))};
- RTNAME(Shape)(int64Result->raw().base_addr, *array, /*KIND=*/8);
+ RTNAME(Shape)
+ (int64Result->raw().base_addr, *array, /*KIND=*/8, __FILE__, __LINE__);
EXPECT_EQ(*int64Result->ZeroBasedIndexedElement<std::int64_t>(0), 2);
EXPECT_EQ(*int64Result->ZeroBasedIndexedElement<std::int64_t>(1), 3);
}
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 33ecff8..381061ce3 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -394,6 +394,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fminimum_mag_num
libc.src.math.fminimum_mag_numf
libc.src.math.fminimum_mag_numl
+ libc.src.math.fmul
libc.src.math.fmod
libc.src.math.fmodf
libc.src.math.fmodl
@@ -515,12 +516,18 @@ if(LIBC_TYPES_HAS_FLOAT16)
libc.src.math.fminimum_magf16
libc.src.math.fminimum_mag_numf16
libc.src.math.fminimum_numf16
+ libc.src.math.frexpf16
libc.src.math.fromfpf16
libc.src.math.fromfpxf16
+ libc.src.math.ilogbf16
+ libc.src.math.llogbf16
libc.src.math.llrintf16
libc.src.math.llroundf16
+ libc.src.math.logbf16
libc.src.math.lrintf16
libc.src.math.lroundf16
+ # libc.src.math.modff16
+ libc.src.math.nanf16
libc.src.math.nearbyintf16
libc.src.math.nextafterf16
libc.src.math.nextdownf16
@@ -529,6 +536,8 @@ if(LIBC_TYPES_HAS_FLOAT16)
# clang-12 and after: https://godbolt.org/z/8ceT9454c
# libc.src.math.nexttowardf16
libc.src.math.nextupf16
+ libc.src.math.remainderf16
+ libc.src.math.remquof16
libc.src.math.rintf16
libc.src.math.roundf16
libc.src.math.roundevenf16
@@ -574,6 +583,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
libc.src.math.nextafterf128
libc.src.math.nextdownf128
libc.src.math.nextupf128
+ libc.src.math.remquof128
libc.src.math.rintf128
libc.src.math.roundf128
libc.src.math.scalbnf128
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 335981f..d4f9324 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -261,6 +261,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fminimum_mag_num
libc.src.math.fminimum_mag_numf
libc.src.math.fminimum_mag_numl
+ libc.src.math.fmul
libc.src.math.fmod
libc.src.math.fmodf
libc.src.math.frexp
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 479af40..67abf851 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -402,6 +402,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fminimum_mag_num
libc.src.math.fminimum_mag_numf
libc.src.math.fminimum_mag_numl
+ libc.src.math.fmul
libc.src.math.fmod
libc.src.math.fmodf
libc.src.math.fmodl
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 1beca7e..e99960b 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -421,6 +421,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fminimum_mag_num
libc.src.math.fminimum_mag_numf
libc.src.math.fminimum_mag_numl
+ libc.src.math.fmul
libc.src.math.fmod
libc.src.math.fmodf
libc.src.math.fmodl
@@ -548,17 +549,25 @@ if(LIBC_TYPES_HAS_FLOAT16)
libc.src.math.fminimum_mag_numf16
libc.src.math.fminimum_numf16
libc.src.math.fmodf16
+ libc.src.math.frexpf16
libc.src.math.fromfpf16
libc.src.math.fromfpxf16
+ libc.src.math.ilogbf16
+ libc.src.math.llogbf16
libc.src.math.llrintf16
libc.src.math.llroundf16
+ libc.src.math.logbf16
libc.src.math.lrintf16
libc.src.math.lroundf16
+ libc.src.math.modff16
+ libc.src.math.nanf16
libc.src.math.nearbyintf16
libc.src.math.nextafterf16
libc.src.math.nextdownf16
libc.src.math.nexttowardf16
libc.src.math.nextupf16
+ libc.src.math.remainderf16
+ libc.src.math.remquof16
libc.src.math.rintf16
libc.src.math.roundf16
libc.src.math.roundevenf16
@@ -605,6 +614,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
libc.src.math.nextafterf128
libc.src.math.nextdownf128
libc.src.math.nextupf128
+ libc.src.math.remquof128
libc.src.math.rintf128
libc.src.math.roundevenf128
libc.src.math.roundf128
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index 7121653..a489872 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -180,6 +180,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fminimum_mag_num
libc.src.math.fminimum_mag_numf
libc.src.math.fminimum_mag_numl
+ libc.src.math.fmul
libc.src.math.fmod
libc.src.math.fmodf
libc.src.math.fmodl
diff --git a/libc/docs/c23.rst b/libc/docs/c23.rst
index 71232cc..fec9b24 100644
--- a/libc/docs/c23.rst
+++ b/libc/docs/c23.rst
@@ -50,7 +50,7 @@ Additions:
* issignaling
* issubnormal
* iszero
- * llogb*
+ * llogb* |check|
* pown*
* powr*
* rootn*
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 24b88a52..f83a646 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -158,9 +158,9 @@ Basic Operations
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| fmod | |check| | |check| | |check| | |check| | |check| | 7.12.10.1 | F.10.7.1 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| fmul | N/A | | | N/A | | 7.12.14.3 | F.10.11 |
+| fmul | N/A | |check| | | N/A | | 7.12.14.3 | F.10.11 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| frexp | |check| | |check| | |check| | | |check| | 7.12.6.7 | F.10.3.7 |
+| frexp | |check| | |check| | |check| | |check| | |check| | 7.12.6.7 | F.10.3.7 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| fromfp | |check| | |check| | |check| | |check| | |check| | 7.12.9.10 | F.10.6.10 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -168,25 +168,25 @@ Basic Operations
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| fsub | N/A | | | N/A | | 7.12.14.2 | F.10.11 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| ilogb | |check| | |check| | |check| | | |check| | 7.12.6.8 | F.10.3.8 |
+| ilogb | |check| | |check| | |check| | |check| | |check| | 7.12.6.8 | F.10.3.8 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| ldexp | |check| | |check| | |check| | | |check| | 7.12.6.9 | F.10.3.9 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| llogb | |check| | |check| | |check| | | |check| | 7.12.6.10 | F.10.3.10 |
+| llogb | |check| | |check| | |check| | |check| | |check| | 7.12.6.10 | F.10.3.10 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| llrint | |check| | |check| | |check| | |check| | |check| | 7.12.9.5 | F.10.6.5 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| llround | |check| | |check| | |check| | |check| | |check| | 7.12.9.7 | F.10.6.7 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| logb | |check| | |check| | |check| | | |check| | 7.12.6.17 | F.10.3.17 |
+| logb | |check| | |check| | |check| | |check| | |check| | 7.12.6.17 | F.10.3.17 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| lrint | |check| | |check| | |check| | |check| | |check| | 7.12.9.5 | F.10.6.5 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| lround | |check| | |check| | |check| | |check| | |check| | 7.12.9.7 | F.10.6.7 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| modf | |check| | |check| | |check| | | |check| | 7.12.6.18 | F.10.3.18 |
+| modf | |check| | |check| | |check| | |check| | |check| | 7.12.6.18 | F.10.3.18 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| nan | |check| | |check| | |check| | | |check| | 7.12.11.2 | F.10.8.2 |
+| nan | |check| | |check| | |check| | |check| | |check| | 7.12.11.2 | F.10.8.2 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| nearbyint | |check| | |check| | |check| | |check| | |check| | 7.12.9.3 | F.10.6.3 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -198,9 +198,9 @@ Basic Operations
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| nextup | |check| | |check| | |check| | |check| | |check| | 7.12.11.5 | F.10.8.5 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| remainder | |check| | |check| | |check| | | | 7.12.10.2 | F.10.7.2 |
+| remainder | |check| | |check| | |check| | |check| | | 7.12.10.2 | F.10.7.2 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| remquo | |check| | |check| | |check| | | | 7.12.10.3 | F.10.7.3 |
+| remquo | |check| | |check| | |check| | |check| | |check| | 7.12.10.3 | F.10.7.3 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| rint | |check| | |check| | |check| | |check| | |check| | 7.12.9.4 | F.10.6.4 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/include/llvm-libc-macros/float16-macros.h b/libc/include/llvm-libc-macros/float16-macros.h
index 9f17503..3f819ad 100644
--- a/libc/include/llvm-libc-macros/float16-macros.h
+++ b/libc/include/llvm-libc-macros/float16-macros.h
@@ -10,7 +10,8 @@
#define LLVM_LIBC_MACROS_FLOAT16_MACROS_H
#if defined(__FLT16_MANT_DIG__) && \
- (!defined(__GNUC__) || __GNUC__ >= 13 || defined(__clang__))
+ (!defined(__GNUC__) || __GNUC__ >= 13 || defined(__clang__)) && \
+ !defined(__riscv)
#define LIBC_TYPES_HAS_FLOAT16
#endif
diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt
index ee2c910..356ac03 100644
--- a/libc/include/llvm-libc-types/CMakeLists.txt
+++ b/libc/include/llvm-libc-types/CMakeLists.txt
@@ -58,7 +58,7 @@ add_header(pthread_rwlockattr_t HDR pthread_rwlockattr_t.h)
add_header(pthread_t HDR pthread_t.h DEPENDS .__thread_type)
add_header(rlim_t HDR rlim_t.h)
add_header(time_t HDR time_t.h)
-add_header(stack_t HDR stack_t.h)
+add_header(stack_t HDR stack_t.h DEPENDS .size_t)
add_header(suseconds_t HDR suseconds_t.h)
add_header(struct_flock HDR struct_flock.h DEPENDS .off_t .pid_t)
add_header(struct_flock64 HDR struct_flock64.h DEPENDS .off64_t .pid_t)
diff --git a/libc/spec/spec.td b/libc/spec/spec.td
index 056a314..7e1283e 100644
--- a/libc/spec/spec.td
+++ b/libc/spec/spec.td
@@ -115,6 +115,7 @@ def IntPtr : PtrType<IntType>;
def RestrictedIntPtr : RestrictedPtrType<IntType>;
def FloatPtr : PtrType<FloatType>;
def DoublePtr : PtrType<DoubleType>;
+def Float16Ptr : PtrType<Float16Type>;
def Float128Ptr : PtrType<Float128Type>;
def UnsignedCharPtr : PtrType<UnsignedCharType>;
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 6443914..3416994 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -472,6 +472,9 @@ def StdC : StandardSpec<"stdc"> {
GuardedFunctionSpec<"fminimum_mag_numf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"fminimum_mag_numf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+ FunctionSpec<"fmul", RetValSpec<FloatType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
+
+
FunctionSpec<"fma", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"fmaf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>]>,
@@ -484,6 +487,7 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"frexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntPtr>]>,
FunctionSpec<"frexpf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntPtr>]>,
FunctionSpec<"frexpl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<IntPtr>]>,
+ GuardedFunctionSpec<"frexpf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<IntPtr>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"frexpf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<IntPtr>], "LIBC_TYPES_HAS_FLOAT128">,
FunctionSpec<"fromfp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntType>, ArgSpec<UnsignedIntType>]>,
@@ -516,11 +520,13 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"ilogb", RetValSpec<IntType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"ilogbf", RetValSpec<IntType>, [ArgSpec<FloatType>]>,
FunctionSpec<"ilogbl", RetValSpec<IntType>, [ArgSpec<LongDoubleType>]>,
+ GuardedFunctionSpec<"ilogbf16", RetValSpec<IntType>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"ilogbf128", RetValSpec<IntType>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
FunctionSpec<"llogb", RetValSpec<LongType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"llogbf", RetValSpec<LongType>, [ArgSpec<FloatType>]>,
FunctionSpec<"llogbl", RetValSpec<LongType>, [ArgSpec<LongDoubleType>]>,
+ GuardedFunctionSpec<"llogbf16", RetValSpec<LongType>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"llogbf128", RetValSpec<LongType>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
FunctionSpec<"ldexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntType>]>,
@@ -543,11 +549,13 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"logb", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"logbf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
FunctionSpec<"logbl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
+ GuardedFunctionSpec<"logbf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"logbf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
FunctionSpec<"modf", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoublePtr>]>,
FunctionSpec<"modff", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatPtr>]>,
FunctionSpec<"modfl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoublePtr>]>,
+ GuardedFunctionSpec<"modff16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"modff128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Ptr>], "LIBC_TYPES_HAS_FLOAT128">,
FunctionSpec<"cos", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
@@ -573,13 +581,16 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"exp10", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"exp10f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
- FunctionSpec<"remainderf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
FunctionSpec<"remainder", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
+ FunctionSpec<"remainderf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
FunctionSpec<"remainderl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+ GuardedFunctionSpec<"remainderf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
- FunctionSpec<"remquof", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<IntPtr>]>,
FunctionSpec<"remquo", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>, ArgSpec<IntPtr>]>,
+ FunctionSpec<"remquof", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<IntPtr>]>,
FunctionSpec<"remquol", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>, ArgSpec<IntPtr>]>,
+ GuardedFunctionSpec<"remquof16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>, ArgSpec<IntPtr>], "LIBC_TYPES_HAS_FLOAT16">,
+ GuardedFunctionSpec<"remquof128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>, ArgSpec<IntPtr>], "LIBC_TYPES_HAS_FLOAT128">,
FunctionSpec<"round", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"roundf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
@@ -691,6 +702,7 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"nanf", RetValSpec<FloatType>, [ArgSpec<ConstCharPtr>]>,
FunctionSpec<"nan", RetValSpec<DoubleType>, [ArgSpec<ConstCharPtr>]>,
FunctionSpec<"nanl", RetValSpec<LongDoubleType>, [ArgSpec<ConstCharPtr>]>,
+ GuardedFunctionSpec<"nanf16", RetValSpec<Float16Type>, [ArgSpec<ConstCharPtr>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"nanf128", RetValSpec<Float128Type>, [ArgSpec<ConstCharPtr>], "LIBC_TYPES_HAS_FLOAT128">,
FunctionSpec<"canonicalize", RetValSpec<IntType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
diff --git a/libc/src/__support/FPUtil/NormalFloat.h b/libc/src/__support/FPUtil/NormalFloat.h
index 8bc1fec..413d2043 100644
--- a/libc/src/__support/FPUtil/NormalFloat.h
+++ b/libc/src/__support/FPUtil/NormalFloat.h
@@ -52,7 +52,7 @@ template <typename T> struct NormalFloat {
return;
unsigned normalization_shift = evaluate_normalization_shift(mantissa);
- mantissa = mantissa << normalization_shift;
+ mantissa <<= normalization_shift;
exponent -= normalization_shift;
}
@@ -110,9 +110,11 @@ template <typename T> struct NormalFloat {
if (shift <= FPBits<T>::FRACTION_LEN + 1) {
// Generate a subnormal number. Might lead to loss of precision.
// We round to nearest and round halfway cases to even.
- const StorageType shift_out_mask = (StorageType(1) << shift) - 1;
+ const StorageType shift_out_mask =
+ static_cast<StorageType>(StorageType(1) << shift) - 1;
const StorageType shift_out_value = mantissa & shift_out_mask;
- const StorageType halfway_value = StorageType(1) << (shift - 1);
+ const StorageType halfway_value =
+ static_cast<StorageType>(StorageType(1) << (shift - 1));
result.set_biased_exponent(0);
result.set_mantissa(mantissa >> shift);
StorageType new_mantissa = result.get_mantissa();
@@ -135,7 +137,8 @@ template <typename T> struct NormalFloat {
}
}
- result.set_biased_exponent(exponent + FPBits<T>::EXP_BIAS);
+ result.set_biased_exponent(
+ static_cast<StorageType>(exponent + FPBits<T>::EXP_BIAS));
result.set_mantissa(mantissa);
return result.get_val();
}
@@ -155,7 +158,7 @@ private:
// Normalize subnormal numbers.
if (bits.is_subnormal()) {
unsigned shift = evaluate_normalization_shift(bits.get_mantissa());
- mantissa = StorageType(bits.get_mantissa()) << shift;
+ mantissa = static_cast<StorageType>(bits.get_mantissa() << shift);
exponent = 1 - FPBits<T>::EXP_BIAS - shift;
} else {
exponent = bits.get_biased_exponent() - FPBits<T>::EXP_BIAS;
diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h
index e2061c4..40ad6ee 100644
--- a/libc/src/__support/big_int.h
+++ b/libc/src/__support/big_int.h
@@ -299,7 +299,8 @@ LIBC_INLINE constexpr cpp::array<word, N> shift(cpp::array<word, N> array,
if (bit_offset == 0)
dst = part1; // no crosstalk between parts.
else if constexpr (direction == LEFT)
- dst = (part1 << bit_offset) | (part2 >> (WORD_BITS - bit_offset));
+ dst = static_cast<word>((part1 << bit_offset) |
+ (part2 >> (WORD_BITS - bit_offset)));
else
dst = (part1 >> bit_offset) | (part2 << (WORD_BITS - bit_offset));
}
@@ -969,7 +970,8 @@ struct WordTypeSelector : cpp::type_identity<
#endif // LIBC_TYPES_HAS_INT64
> {
};
-// Except if we request 32 bits explicitly.
+// Except if we request 16 or 32 bits explicitly.
+template <> struct WordTypeSelector<16> : cpp::type_identity<uint16_t> {};
template <> struct WordTypeSelector<32> : cpp::type_identity<uint32_t> {};
template <size_t Bits>
using WordTypeSelectorT = typename WordTypeSelector<Bits>::type;
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 141f668..82dfdaf 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -180,6 +180,8 @@ add_math_entrypoint_object(fminimum_mag_numl)
add_math_entrypoint_object(fminimum_mag_numf16)
add_math_entrypoint_object(fminimum_mag_numf128)
+add_math_entrypoint_object(fmul)
+
add_math_entrypoint_object(fmod)
add_math_entrypoint_object(fmodf)
add_math_entrypoint_object(fmodl)
@@ -189,6 +191,7 @@ add_math_entrypoint_object(fmodf128)
add_math_entrypoint_object(frexp)
add_math_entrypoint_object(frexpf)
add_math_entrypoint_object(frexpl)
+add_math_entrypoint_object(frexpf16)
add_math_entrypoint_object(frexpf128)
add_math_entrypoint_object(fromfp)
@@ -209,11 +212,13 @@ add_math_entrypoint_object(hypotf)
add_math_entrypoint_object(ilogb)
add_math_entrypoint_object(ilogbf)
add_math_entrypoint_object(ilogbl)
+add_math_entrypoint_object(ilogbf16)
add_math_entrypoint_object(ilogbf128)
add_math_entrypoint_object(llogb)
add_math_entrypoint_object(llogbf)
add_math_entrypoint_object(llogbl)
+add_math_entrypoint_object(llogbf16)
add_math_entrypoint_object(llogbf128)
add_math_entrypoint_object(ldexp)
@@ -236,6 +241,7 @@ add_math_entrypoint_object(logf)
add_math_entrypoint_object(logb)
add_math_entrypoint_object(logbf)
add_math_entrypoint_object(logbl)
+add_math_entrypoint_object(logbf16)
add_math_entrypoint_object(logbf128)
add_math_entrypoint_object(llrint)
@@ -265,11 +271,13 @@ add_math_entrypoint_object(lroundf128)
add_math_entrypoint_object(modf)
add_math_entrypoint_object(modff)
add_math_entrypoint_object(modfl)
+add_math_entrypoint_object(modff16)
add_math_entrypoint_object(modff128)
add_math_entrypoint_object(nan)
add_math_entrypoint_object(nanf)
add_math_entrypoint_object(nanl)
+add_math_entrypoint_object(nanf16)
add_math_entrypoint_object(nanf128)
add_math_entrypoint_object(nearbyint)
@@ -307,10 +315,13 @@ add_math_entrypoint_object(powf)
add_math_entrypoint_object(remainder)
add_math_entrypoint_object(remainderf)
add_math_entrypoint_object(remainderl)
+add_math_entrypoint_object(remainderf16)
add_math_entrypoint_object(remquo)
add_math_entrypoint_object(remquof)
+add_math_entrypoint_object(remquof128)
add_math_entrypoint_object(remquol)
+add_math_entrypoint_object(remquof16)
add_math_entrypoint_object(rint)
add_math_entrypoint_object(rintf)
diff --git a/libc/src/math/fmul.h b/libc/src/math/fmul.h
new file mode 100644
index 0000000..fbc1069
--- /dev/null
+++ b/libc/src/math/fmul.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for fmul --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMUL_H
+#define LLVM_LIBC_SRC_MATH_FMUL_H
+
+namespace LIBC_NAMESPACE {
+
+float fmul(double x, double y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FMUL_H
diff --git a/libc/src/math/frexpf16.h b/libc/src/math/frexpf16.h
new file mode 100644
index 0000000..dc1898c
--- /dev/null
+++ b/libc/src/math/frexpf16.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for frexpf16 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FREXPF16_H
+#define LLVM_LIBC_SRC_MATH_FREXPF16_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float16 frexpf16(float16 x, int *exp);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FREXPF16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 9c9073c..f4f683e 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1302,6 +1302,19 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ frexpf16
+ SRCS
+ frexpf16.cpp
+ HDRS
+ ../frexpf16.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ libc.src.__support.macros.properties.types
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
frexpf128
SRCS
frexpf128.cpp
@@ -1351,6 +1364,19 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ ilogbf16
+ SRCS
+ ilogbf16.cpp
+ HDRS
+ ../ilogbf16.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ libc.src.__support.macros.properties.types
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
ilogbf128
SRCS
ilogbf128.cpp
@@ -1400,6 +1426,19 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ llogbf16
+ SRCS
+ llogbf16.cpp
+ HDRS
+ ../llogbf16.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ libc.src.__support.macros.properties.types
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
llogbf128
SRCS
llogbf128.cpp
@@ -1673,6 +1712,19 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ logbf16
+ SRCS
+ logbf16.cpp
+ HDRS
+ ../logbf16.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ libc.src.__support.macros.properties.types
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
logbf128
SRCS
logbf128.cpp
@@ -1681,6 +1733,7 @@ add_entrypoint_object(
COMPILE_OPTIONS
-O3
DEPENDS
+ libc.src.__support.macros.properties.types
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1721,6 +1774,19 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ modff16
+ SRCS
+ modff16.cpp
+ HDRS
+ ../modff16.h
+ DEPENDS
+ libc.src.__support.macros.properties.types
+ libc.src.__support.FPUtil.manipulation_functions
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
modff128
SRCS
modff128.cpp
@@ -2355,6 +2421,22 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ fmul
+ SRCS
+ fmul.cpp
+ HDRS
+ ../fmul.h
+ DEPENDS
+ libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.uint128
+ libc.src.__support.CPP.bit
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.rounding_mode
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
sqrt
SRCS
sqrt.cpp
@@ -2413,7 +2495,19 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.FPUtil.division_and_remainder_operations
COMPILE_OPTIONS
- -O2
+ -O3
+)
+
+add_entrypoint_object(
+ remquof128
+ SRCS
+ remquof128.cpp
+ HDRS
+ ../remquof128.h
+ DEPENDS
+ libc.src.__support.FPUtil.division_and_remainder_operations
+ COMPILE_OPTIONS
+ -O3
)
add_entrypoint_object(
@@ -2425,7 +2519,7 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.FPUtil.division_and_remainder_operations
COMPILE_OPTIONS
- -O2
+ -O3
)
add_entrypoint_object(
@@ -2437,7 +2531,20 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.FPUtil.division_and_remainder_operations
COMPILE_OPTIONS
- -O2
+ -O3
+)
+
+add_entrypoint_object(
+ remquof16
+ SRCS
+ remquof16.cpp
+ HDRS
+ ../remquof16.h
+ DEPENDS
+ libc.src.__support.macros.properties.types
+ libc.src.__support.FPUtil.division_and_remainder_operations
+ COMPILE_OPTIONS
+ -O3
)
add_entrypoint_object(
@@ -2449,7 +2556,7 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.FPUtil.division_and_remainder_operations
COMPILE_OPTIONS
- -O2
+ -O3
)
add_entrypoint_object(
@@ -2461,7 +2568,7 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.FPUtil.division_and_remainder_operations
COMPILE_OPTIONS
- -O2
+ -O3
)
add_entrypoint_object(
@@ -2473,7 +2580,20 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.FPUtil.division_and_remainder_operations
COMPILE_OPTIONS
- -O2
+ -O3
+)
+
+add_entrypoint_object(
+ remainderf16
+ SRCS
+ remainderf16.cpp
+ HDRS
+ ../remainderf16.h
+ DEPENDS
+ libc.src.__support.macros.properties.types
+ libc.src.__support.FPUtil.division_and_remainder_operations
+ COMPILE_OPTIONS
+ -O3
)
add_entrypoint_object(
@@ -2604,6 +2724,19 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ nanf16
+ SRCS
+ nanf16.cpp
+ HDRS
+ ../nanf16.h
+ DEPENDS
+ libc.src.__support.str_to_float
+ libc.src.errno.errno
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
nanf128
SRCS
nanf128.cpp
diff --git a/libc/src/math/generic/fmul.cpp b/libc/src/math/generic/fmul.cpp
new file mode 100644
index 0000000..40af204
--- /dev/null
+++ b/libc/src/math/generic/fmul.cpp
@@ -0,0 +1,128 @@
+//===-- Implementation of fmul function------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmul.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/uint128.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
+ auto x_bits = fputil::FPBits<double>(x);
+
+ auto y_bits = fputil::FPBits<double>(y);
+
+ auto output_sign = (x_bits.sign() != y_bits.sign()) ? Sign::NEG : Sign::POS;
+
+ if (LIBC_UNLIKELY(x_bits.is_inf_or_nan() || y_bits.is_inf_or_nan() ||
+ x_bits.is_zero() || y_bits.is_zero())) {
+ if (x_bits.is_nan())
+ return static_cast<float>(x);
+ if (y_bits.is_nan())
+ return static_cast<float>(y);
+ if (x_bits.is_inf())
+ return y_bits.is_zero()
+ ? fputil::FPBits<float>::quiet_nan().get_val()
+ : fputil::FPBits<float>::inf(output_sign).get_val();
+ if (y_bits.is_inf())
+ return x_bits.is_zero()
+ ? fputil::FPBits<float>::quiet_nan().get_val()
+ : fputil::FPBits<float>::inf(output_sign).get_val();
+ // Now either x or y is zero, and the other one is finite.
+ return fputil::FPBits<float>::zero(output_sign).get_val();
+ }
+
+ uint64_t mx, my;
+
+ // Get mantissa and append the hidden bit if needed.
+ mx = x_bits.get_explicit_mantissa();
+ my = y_bits.get_explicit_mantissa();
+
+ // Get the corresponding biased exponent.
+ int ex = x_bits.get_explicit_exponent();
+ int ey = y_bits.get_explicit_exponent();
+
+ // Count the number of leading zeros of the explicit mantissas.
+ int nx = cpp::countl_zero(mx);
+ int ny = cpp::countl_zero(my);
+ // Shift the leading 1 bit to the most significant bit.
+ mx <<= nx;
+ my <<= ny;
+
+ // Adjust exponent accordingly: If x or y are normal, we will only need to
+ // shift by (exponent length + sign bit = 11 bits. If x or y are denormal, we
+ // will need to shift more than 11 bits.
+ ex -= (nx - 11);
+ ey -= (ny - 11);
+
+ UInt128 product = static_cast<UInt128>(mx) * static_cast<UInt128>(my);
+ int32_t dm1;
+ uint64_t highs, lows;
+ uint64_t g, hight, lowt;
+ uint32_t m;
+ uint32_t b;
+ int c;
+
+ highs = static_cast<uint64_t>(product >> 64);
+ c = static_cast<int>(highs >= 0x8000000000000000);
+ lows = static_cast<uint64_t>(product);
+
+ lowt = (lows != 0);
+
+ dm1 = ex + ey + c + fputil::FPBits<float>::EXP_BIAS;
+
+ int round_mode = fputil::quick_get_round();
+ if (dm1 >= 255) {
+ if ((round_mode == FE_TOWARDZERO) ||
+ (round_mode == FE_UPWARD && output_sign.is_neg()) ||
+ (round_mode == FE_DOWNWARD && output_sign.is_pos())) {
+ return fputil::FPBits<float>::max_normal(output_sign).get_val();
+ }
+ return fputil::FPBits<float>::inf().get_val();
+ } else if (dm1 <= 0) {
+
+ int m_shift = 40 + c - dm1;
+ int g_shift = m_shift - 1;
+ int h_shift = 64 - g_shift;
+ m = (m_shift >= 64) ? 0 : static_cast<uint32_t>(highs >> m_shift);
+
+ g = g_shift >= 64 ? 0 : (highs >> g_shift) & 1;
+ hight = h_shift >= 64 ? highs : (highs << h_shift) != 0;
+
+ dm1 = 0;
+ } else {
+ m = static_cast<uint32_t>(highs >> (39 + c));
+ g = (highs >> (38 + c)) & 1;
+ hight = (highs << (26 - c)) != 0;
+ }
+
+ if (round_mode == FE_TONEAREST) {
+ b = g && ((hight && lowt) || ((m & 1) != 0));
+ } else if ((output_sign.is_neg() && round_mode == FE_DOWNWARD) ||
+ (output_sign.is_pos() && round_mode == FE_UPWARD)) {
+ b = (g == 0 && (hight && lowt) == 0) ? 0 : 1;
+ } else {
+ b = 0;
+ }
+
+ uint32_t exp16 = (dm1 << 23);
+
+ uint32_t m2 = m & fputil::FPBits<float>::FRACTION_MASK;
+
+ uint32_t result = (exp16 + m2) + b;
+
+ auto result_bits = fputil::FPBits<float>(result);
+ result_bits.set_sign(output_sign);
+ return result_bits.get_val();
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/frexpf16.cpp b/libc/src/math/generic/frexpf16.cpp
new file mode 100644
index 0000000..2d29c07
--- /dev/null
+++ b/libc/src/math/generic/frexpf16.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of frexpf16 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/frexpf16.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, frexpf16, (float16 x, int *exp)) {
+ return fputil::frexp(x, *exp);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/ilogbf16.cpp b/libc/src/math/generic/ilogbf16.cpp
new file mode 100644
index 0000000..87e43f8
--- /dev/null
+++ b/libc/src/math/generic/ilogbf16.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of ilogbf16 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/ilogbf16.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, ilogbf16, (float16 x)) {
+ return fputil::intlogb<int>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/llogbf16.cpp b/libc/src/math/generic/llogbf16.cpp
new file mode 100644
index 0000000..b7a21b9
--- /dev/null
+++ b/libc/src/math/generic/llogbf16.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of llogbf16 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/llogbf16.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long, llogbf16, (float16 x)) {
+ return fputil::intlogb<long>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/logbf16.cpp b/libc/src/math/generic/logbf16.cpp
new file mode 100644
index 0000000..52eb9ac
--- /dev/null
+++ b/libc/src/math/generic/logbf16.cpp
@@ -0,0 +1,17 @@
+//===-- Implementation of logbf16 function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/logbf16.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, logbf16, (float16 x)) { return fputil::logb(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/modff16.cpp b/libc/src/math/generic/modff16.cpp
new file mode 100644
index 0000000..50cc5b5
--- /dev/null
+++ b/libc/src/math/generic/modff16.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of modff16 function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/modff16.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, modff16, (float16 x, float16 *iptr)) {
+ return fputil::modf(x, *iptr);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nanf16.cpp b/libc/src/math/generic/nanf16.cpp
new file mode 100644
index 0000000..c42cd25
--- /dev/null
+++ b/libc/src/math/generic/nanf16.cpp
@@ -0,0 +1,23 @@
+//===-- Implementation of nanf16 function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nanf16.h"
+#include "src/__support/common.h"
+#include "src/__support/str_to_float.h"
+#include "src/errno/libc_errno.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, nanf16, (const char *arg)) {
+ auto result = internal::strtonan<float16>(arg);
+ if (result.has_error())
+ libc_errno = result.error;
+ return result.value;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/remainderf16.cpp b/libc/src/math/generic/remainderf16.cpp
new file mode 100644
index 0000000..3517722
--- /dev/null
+++ b/libc/src/math/generic/remainderf16.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of remainderf16 function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/remainderf16.h"
+#include "src/__support/FPUtil/DivisionAndRemainderOperations.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, remainderf16, (float16 x, float16 y)) {
+ int quotient;
+ return fputil::remquo(x, y, quotient);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/remquof128.cpp b/libc/src/math/generic/remquof128.cpp
new file mode 100644
index 0000000..e195c7b
--- /dev/null
+++ b/libc/src/math/generic/remquof128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of remquof128 function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/remquof128.h"
+#include "src/__support/FPUtil/DivisionAndRemainderOperations.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, remquof128, (float128 x, float128 y, int *exp)) {
+ return fputil::remquo(x, y, *exp);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/remquof16.cpp b/libc/src/math/generic/remquof16.cpp
new file mode 100644
index 0000000..a373bfa
--- /dev/null
+++ b/libc/src/math/generic/remquof16.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of remquof16 function ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/remquof16.h"
+#include "src/__support/FPUtil/DivisionAndRemainderOperations.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, remquof16, (float16 x, float16 y, int *exp)) {
+ return fputil::remquo(x, y, *exp);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/ilogbf16.h b/libc/src/math/ilogbf16.h
new file mode 100644
index 0000000..4884a14
--- /dev/null
+++ b/libc/src/math/ilogbf16.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for ilogbf16 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_ILOGBF16_H
+#define LLVM_LIBC_SRC_MATH_ILOGBF16_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+int ilogbf16(float16 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_ILOGBF16_H
diff --git a/libc/src/math/llogbf16.h b/libc/src/math/llogbf16.h
new file mode 100644
index 0000000..267ae41
--- /dev/null
+++ b/libc/src/math/llogbf16.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for llogbf16 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_LLOGBF16_H
+#define LLVM_LIBC_SRC_MATH_LLOGBF16_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+long llogbf16(float16 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_LLOGBF16_H
diff --git a/libc/src/math/logbf16.h b/libc/src/math/logbf16.h
new file mode 100644
index 0000000..8082e06
--- /dev/null
+++ b/libc/src/math/logbf16.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for logbf16 -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_LOGBF16_H
+#define LLVM_LIBC_SRC_MATH_LOGBF16_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float16 logbf16(float16 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_LOGBF16_H
diff --git a/libc/src/math/modff16.h b/libc/src/math/modff16.h
new file mode 100644
index 0000000..a3017c5
--- /dev/null
+++ b/libc/src/math/modff16.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for modff16 -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_MODFF16_H
+#define LLVM_LIBC_SRC_MATH_MODFF16_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float16 modff16(float16 x, float16 *iptr);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_MODFF16_H
diff --git a/libc/src/math/nanf16.h b/libc/src/math/nanf16.h
new file mode 100644
index 0000000..c2db4ba
--- /dev/null
+++ b/libc/src/math/nanf16.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for nanf16 ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NANF16_H
+#define LLVM_LIBC_SRC_MATH_NANF16_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float16 nanf16(const char *arg);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NANF16_H
diff --git a/libc/src/math/remainderf16.h b/libc/src/math/remainderf16.h
new file mode 100644
index 0000000..e23eead
--- /dev/null
+++ b/libc/src/math/remainderf16.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for remainderf16 ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_REMAINDERF16_H
+#define LLVM_LIBC_SRC_MATH_REMAINDERF16_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float16 remainderf16(float16 x, float16 y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_REMAINDERF16_H
diff --git a/libc/src/math/remquof128.h b/libc/src/math/remquof128.h
new file mode 100644
index 0000000..e9db1ef
--- /dev/null
+++ b/libc/src/math/remquof128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for remquof128 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_REMQUOF128_H
+#define LLVM_LIBC_SRC_MATH_REMQUOF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 remquof128(float128 x, float128 y, int *exp);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_REMQUOF128_H
diff --git a/libc/src/math/remquof16.h b/libc/src/math/remquof16.h
new file mode 100644
index 0000000..fee848c
--- /dev/null
+++ b/libc/src/math/remquof16.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for remquof16 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_REMQUOF16_H
+#define LLVM_LIBC_SRC_MATH_REMQUOF16_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float16 remquof16(float16 x, float16 y, int *exp);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_REMQUOF16_H
diff --git a/libc/test/src/__support/FPUtil/CMakeLists.txt b/libc/test/src/__support/FPUtil/CMakeLists.txt
index 1cbeec0..22fbd26 100644
--- a/libc/test/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/test/src/__support/FPUtil/CMakeLists.txt
@@ -9,6 +9,7 @@ add_fp_unittest(
dyadic_float_test.cpp
DEPENDS
libc.src.__support.FPUtil.dyadic_float
+ libc.src.__support.macros.properties.types
COMPILE_OPTIONS
# Prevent constant folding with a default rounding mode.
"-frounding-math"
diff --git a/libc/test/src/__support/FPUtil/dyadic_float_test.cpp b/libc/test/src/__support/FPUtil/dyadic_float_test.cpp
index 809381e..3b1f9de 100644
--- a/libc/test/src/__support/FPUtil/dyadic_float_test.cpp
+++ b/libc/test/src/__support/FPUtil/dyadic_float_test.cpp
@@ -8,6 +8,7 @@
#include "src/__support/FPUtil/dyadic_float.h"
#include "src/__support/big_int.h"
+#include "src/__support/macros/properties/types.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
#include "utils/MPFRWrapper/MPFRUtils.h"
@@ -89,3 +90,6 @@ TEST(LlvmLibcDyadicFloatTest, QuickMul) {
TEST_EDGE_RANGES(Float, float);
TEST_EDGE_RANGES(Double, double);
TEST_EDGE_RANGES(LongDouble, long double);
+#ifdef LIBC_TYPES_HAS_FLOAT16
+TEST_EDGE_RANGES(Float16, float16);
+#endif
diff --git a/libc/test/src/__support/big_int_test.cpp b/libc/test/src/__support/big_int_test.cpp
index 1c4f0ac..84cd206 100644
--- a/libc/test/src/__support/big_int_test.cpp
+++ b/libc/test/src/__support/big_int_test.cpp
@@ -205,6 +205,7 @@ TYPED_TEST(LlvmLibcUIntClassTest, CountBits, Types) {
}
}
+using LL_UInt16 = UInt<16>;
using LL_UInt64 = UInt<64>;
// We want to test UInt<128> explicitly. So, for
// convenience, we use a sugar which does not conflict with the UInt128 type
@@ -258,6 +259,19 @@ TEST(LlvmLibcUIntClassTest, BitCastToFromNativeFloat128) {
}
#endif // LIBC_TYPES_HAS_FLOAT128
+#ifdef LIBC_TYPES_HAS_FLOAT16
+TEST(LlvmLibcUIntClassTest, BitCastToFromNativeFloat16) {
+ static_assert(cpp::is_trivially_copyable<LL_UInt16>::value);
+ static_assert(sizeof(LL_UInt16) == sizeof(float16));
+ const float16 array[] = {0, 0.1, 1};
+ for (float16 value : array) {
+ LL_UInt16 back = cpp::bit_cast<LL_UInt16>(value);
+ float16 forth = cpp::bit_cast<float16>(back);
+ EXPECT_TRUE(value == forth);
+ }
+}
+#endif // LIBC_TYPES_HAS_FLOAT16
+
TEST(LlvmLibcUIntClassTest, BasicInit) {
LL_UInt128 half_val(12345);
LL_UInt128 full_val({12345, 67890});
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 07e8b5d..75e2bdd7 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -1092,6 +1092,18 @@ add_fp_unittest(
)
add_fp_unittest(
+ frexpf16_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ frexpf16_test.cpp
+ HDRS
+ FrexpTest.h
+ DEPENDS
+ libc.src.math.frexpf16
+)
+
+add_fp_unittest(
frexpf128_test
SUITE
libc-math-smoke-tests
@@ -1353,7 +1365,7 @@ add_fp_unittest(
ILogbTest.h
DEPENDS
libc.src.math.ilogb
- libc.src.__support.CPP.limits
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1368,7 +1380,7 @@ add_fp_unittest(
ILogbTest.h
DEPENDS
libc.src.math.ilogbf
- libc.src.__support.CPP.limits
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1383,7 +1395,22 @@ add_fp_unittest(
ILogbTest.h
DEPENDS
libc.src.math.ilogbl
- libc.src.__support.CPP.limits
+ libc.src.__support.CPP.algorithm
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+ ilogbf16_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ ilogbf16_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.src.math.ilogbf16
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1398,7 +1425,7 @@ add_fp_unittest(
ILogbTest.h
DEPENDS
libc.src.math.ilogbf128
- libc.src.__support.CPP.limits
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1413,7 +1440,7 @@ add_fp_unittest(
ILogbTest.h
DEPENDS
libc.src.math.llogb
- libc.src.__support.CPP.limits
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1428,7 +1455,7 @@ add_fp_unittest(
ILogbTest.h
DEPENDS
libc.src.math.llogbf
- libc.src.__support.CPP.limits
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1443,7 +1470,22 @@ add_fp_unittest(
ILogbTest.h
DEPENDS
libc.src.math.llogbl
- libc.src.__support.CPP.limits
+ libc.src.__support.CPP.algorithm
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+ llogbf16_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ llogbf16_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.src.math.llogbf16
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1458,7 +1500,7 @@ add_fp_unittest(
ILogbTest.h
DEPENDS
libc.src.math.llogbf128
- libc.src.__support.CPP.limits
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1529,8 +1571,11 @@ add_fp_unittest(
libc-math-smoke-tests
SRCS
logb_test.cpp
+ HDRS
+ LogbTest.h
DEPENDS
libc.src.math.logb
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1540,8 +1585,11 @@ add_fp_unittest(
libc-math-smoke-tests
SRCS
logbf_test.cpp
+ HDRS
+ LogbTest.h
DEPENDS
libc.src.math.logbf
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1555,6 +1603,21 @@ add_fp_unittest(
LogbTest.h
DEPENDS
libc.src.math.logbl
+ libc.src.__support.CPP.algorithm
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+ logbf16_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ logbf16_test.cpp
+ HDRS
+ LogbTest.h
+ DEPENDS
+ libc.src.math.logbf16
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1564,8 +1627,11 @@ add_fp_unittest(
libc-math-smoke-tests
SRCS
logbf128_test.cpp
+ HDRS
+ LogbTest.h
DEPENDS
libc.src.math.logbf128
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.manipulation_functions
)
@@ -1579,6 +1645,7 @@ add_fp_unittest(
ModfTest.h
DEPENDS
libc.src.math.modf
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.nearest_integer_operations
)
@@ -1593,6 +1660,7 @@ add_fp_unittest(
ModfTest.h
DEPENDS
libc.src.math.modff
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.nearest_integer_operations
)
@@ -1607,6 +1675,22 @@ add_fp_unittest(
ModfTest.h
DEPENDS
libc.src.math.modfl
+ libc.src.__support.CPP.algorithm
+ libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.FPUtil.nearest_integer_operations
+)
+
+add_fp_unittest(
+ modff16_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ modff16_test.cpp
+ HDRS
+ ModfTest.h
+ DEPENDS
+ libc.src.math.modff16
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.nearest_integer_operations
)
@@ -1621,6 +1705,7 @@ add_fp_unittest(
ModfTest.h
DEPENDS
libc.src.math.modff128
+ libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.nearest_integer_operations
)
@@ -2401,6 +2486,19 @@ add_fp_unittest(
)
add_fp_unittest(
+ fmul_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fmul_test.cpp
+ HDRS
+ FMulTest.h
+ DEPENDS
+ libc.src.math.fmul
+ libc.src.__support.FPUtil.fp_bits
+)
+
+add_fp_unittest(
sqrtf_test
SUITE
libc-math-smoke-tests
@@ -2510,6 +2608,19 @@ add_fp_unittest(
RemQuoTest.h
DEPENDS
libc.src.math.remquof
+ libc.src.__support.FPUtil.fp_bits
+)
+
+add_fp_unittest(
+ remquof128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ remquof128_test.cpp
+ HDRS
+ RemQuoTest.h
+ DEPENDS
+ libc.src.math.remquof128
libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.fp_bits
)
@@ -2524,7 +2635,6 @@ add_fp_unittest(
RemQuoTest.h
DEPENDS
libc.src.math.remquo
- libc.src.__support.FPUtil.basic_operations
libc.src.__support.FPUtil.fp_bits
)
@@ -2538,7 +2648,19 @@ add_fp_unittest(
RemQuoTest.h
DEPENDS
libc.src.math.remquol
- libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.FPUtil.fp_bits
+)
+
+add_fp_unittest(
+ remquof16_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ remquof16_test.cpp
+ HDRS
+ RemQuoTest.h
+ DEPENDS
+ libc.src.math.remquof16
libc.src.__support.FPUtil.fp_bits
)
@@ -2610,6 +2732,22 @@ add_fp_unittest(
)
add_fp_unittest(
+ nanf16_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ nanf16_test.cpp
+ DEPENDS
+ libc.include.signal
+ libc.src.math.nanf16
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.sanitizer
+ # FIXME: The nan tests currently have death tests, which aren't supported for
+ # hermetic tests.
+ UNIT_TEST_ONLY
+)
+
+add_fp_unittest(
nanf128_test
SUITE
libc-math-smoke-tests
diff --git a/libc/test/src/math/smoke/FMulTest.h b/libc/test/src/math/smoke/FMulTest.h
new file mode 100644
index 0000000..33fb82c
--- /dev/null
+++ b/libc/test/src/math/smoke/FMulTest.h
@@ -0,0 +1,104 @@
+//===-- Utility class to test fmul[f|l] -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMULTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMULTEST_H
+
+#include "test/UnitTest/FEnvSafeTest.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+template <typename T, typename R>
+class FmulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
+
+ DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+ typedef T (*FMulFunc)(R, R);
+
+ void testMul(FMulFunc func) {
+
+ EXPECT_FP_EQ_ALL_ROUNDING(T(15.0), func(3.0, 5.0));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0x1.0p-130), func(0x1.0p1, 0x1.0p-131));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0x1.0p-127), func(0x1.0p2, 0x1.0p-129));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(1.0), func(1.0, 1.0));
+
+ EXPECT_FP_EQ_ALL_ROUNDING(T(0.0), func(-0.0, -0.0));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.0), func(0.0, -0.0));
+ EXPECT_FP_EQ_ALL_ROUNDING(T(-0.0), func(-0.0, 0.0));
+
+ EXPECT_FP_EQ_ROUNDING_NEAREST(inf, func(0x1.0p100, 0x1.0p100));
+ EXPECT_FP_EQ_ROUNDING_UPWARD(inf, func(0x1.0p100, 0x1.0p100));
+ EXPECT_FP_EQ_ROUNDING_DOWNWARD(max_normal, func(0x1.0p100, 0x1.0p100));
+ EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(max_normal, func(0x1.0p100, 0x1.0p100));
+
+ EXPECT_FP_EQ_ROUNDING_NEAREST(
+ 0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
+ EXPECT_FP_EQ_ROUNDING_DOWNWARD(
+ 0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
+ EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(
+ 0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
+ EXPECT_FP_EQ_ROUNDING_UPWARD(
+ 0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
+
+ EXPECT_FP_EQ_ROUNDING_NEAREST(
+ 0x1.0p-128f + 0x1.0p-148f,
+ func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
+ EXPECT_FP_EQ_ROUNDING_UPWARD(
+ 0x1.0p-128f + 0x1.0p-148f,
+ func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
+ EXPECT_FP_EQ_ROUNDING_DOWNWARD(
+ 0x1.0p-128f + 0x1.0p-149f,
+ func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
+ EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(
+ 0x1.0p-128f + 0x1.0p-149f,
+ func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150));
+ }
+
+ void testSpecialInputs(FMulFunc func) {
+ EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, 0x1.0p-129));
+ EXPECT_FP_EQ_ALL_ROUNDING(inf, func(0x1.0p-129, inf));
+ EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, 2.0));
+ EXPECT_FP_EQ_ALL_ROUNDING(inf, func(3.0, inf));
+ EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, 0.0));
+
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(neg_inf, aNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, neg_inf));
+ EXPECT_FP_EQ_ALL_ROUNDING(inf, func(neg_inf, neg_inf));
+
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, neg_inf));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(neg_inf, 0.0));
+
+ EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, 1.0));
+ EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(1.0, neg_inf));
+
+ EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, 0x1.0p-129));
+ EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(0x1.0p-129, neg_inf));
+
+ EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, 0x1.0p-129));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(inf, 0.0));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, inf));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, aNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(2.0, aNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0x1.0p-129, aNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(inf, aNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, aNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, sNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(2.0, sNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0x1.0p-129, sNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(inf, sNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(sNaN, sNaN));
+ }
+};
+
+#define LIST_FMUL_TESTS(T, R, func) \
+ using LlvmLibcFmulTest = FmulTest<T, R>; \
+ TEST_F(LlvmLibcFmulTest, Mul) { testMul(&func); } \
+ TEST_F(LlvmLibcFmulTest, NaNInf) { testSpecialInputs(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMULTEST_H
diff --git a/libc/test/src/math/smoke/FrexpTest.h b/libc/test/src/math/smoke/FrexpTest.h
index e9e4964..fc2313a 100644
--- a/libc/test/src/math/smoke/FrexpTest.h
+++ b/libc/test/src/math/smoke/FrexpTest.h
@@ -6,7 +6,6 @@
//
//===----------------------------------------------------------------------===//
-#include "src/__support/FPUtil/BasicOperations.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h
index 05f906b..3315ac2 100644
--- a/libc/test/src/math/smoke/ILogbTest.h
+++ b/libc/test/src/math/smoke/ILogbTest.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
-#include "src/__support/CPP/limits.h" // INT_MAX
+#include "src/__support/CPP/algorithm.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/ManipulationFunctions.h"
#include "test/UnitTest/FEnvSafeTest.h"
@@ -76,10 +76,12 @@ public:
void test_subnormal_range(Func func) {
constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval();
constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval();
- constexpr StorageType COUNT = 10'001;
- constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
+ constexpr int COUNT = 10'001;
+ constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max(
+ static_cast<StorageType>((MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT),
+ StorageType(1));
for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) {
- FPBits x_bits = FPBits(v);
+ FPBits x_bits(v);
if (x_bits.is_zero() || x_bits.is_inf_or_nan())
continue;
@@ -94,10 +96,12 @@ public:
void test_normal_range(Func func) {
constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
- constexpr StorageType COUNT = 10'001;
- constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
+ constexpr int COUNT = 10'001;
+ constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max(
+ static_cast<StorageType>((MAX_NORMAL - MIN_NORMAL) / COUNT),
+ StorageType(1));
for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) {
- FPBits x_bits = FPBits(v);
+ FPBits x_bits(v);
if (x_bits.is_zero() || x_bits.is_inf_or_nan())
continue;
diff --git a/libc/test/src/math/smoke/LogbTest.h b/libc/test/src/math/smoke/LogbTest.h
index 4938fcf..0bb6e12 100644
--- a/libc/test/src/math/smoke/LogbTest.h
+++ b/libc/test/src/math/smoke/LogbTest.h
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "src/__support/CPP/algorithm.h"
#include "src/__support/FPUtil/ManipulationFunctions.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
@@ -69,10 +70,12 @@ public:
void testRange(LogbFunc func) {
using StorageType = typename FPBits::StorageType;
- constexpr StorageType COUNT = 100'000;
- constexpr StorageType STEP = STORAGE_MAX / COUNT;
- for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
- FPBits x_bits = FPBits(v);
+ constexpr int COUNT = 100'000;
+ constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max(
+ static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1));
+ StorageType v = 0;
+ for (int i = 0; i <= COUNT; ++i, v += STEP) {
+ FPBits x_bits(v);
if (x_bits.is_zero() || x_bits.is_inf_or_nan())
continue;
diff --git a/libc/test/src/math/smoke/ModfTest.h b/libc/test/src/math/smoke/ModfTest.h
index 85db2d6..6226e5d 100644
--- a/libc/test/src/math/smoke/ModfTest.h
+++ b/libc/test/src/math/smoke/ModfTest.h
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "src/__support/CPP/algorithm.h"
#include "src/__support/FPUtil/BasicOperations.h"
#include "src/__support/FPUtil/NearestIntegerOperations.h"
#include "test/UnitTest/FEnvSafeTest.h"
@@ -83,10 +84,12 @@ public:
}
void testRange(ModfFunc func) {
- constexpr StorageType COUNT = 100'000;
- constexpr StorageType STEP = STORAGE_MAX / COUNT;
- for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
- FPBits x_bits = FPBits(v);
+ constexpr int COUNT = 100'000;
+ constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max(
+ static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1));
+ StorageType v = 0;
+ for (int i = 0; i <= COUNT; ++i, v += STEP) {
+ FPBits x_bits(v);
if (x_bits.is_zero() || x_bits.is_inf_or_nan())
continue;
diff --git a/libc/test/src/math/smoke/RemQuoTest.h b/libc/test/src/math/smoke/RemQuoTest.h
index 43eee3d..e926326 100644
--- a/libc/test/src/math/smoke/RemQuoTest.h
+++ b/libc/test/src/math/smoke/RemQuoTest.h
@@ -9,8 +9,6 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
-#include "hdr/math_macros.h"
-#include "src/__support/FPUtil/BasicOperations.h"
#include "src/__support/FPUtil/FPBits.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
diff --git a/libc/test/src/math/smoke/fmul_test.cpp b/libc/test/src/math/smoke/fmul_test.cpp
new file mode 100644
index 0000000..0eb664f
--- /dev/null
+++ b/libc/test/src/math/smoke/fmul_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmul ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FMulTest.h"
+
+#include "src/math/fmul.h"
+
+LIST_FMUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
diff --git a/libc/test/src/math/smoke/frexpf16_test.cpp b/libc/test/src/math/smoke/frexpf16_test.cpp
new file mode 100644
index 0000000..4d5492c
--- /dev/null
+++ b/libc/test/src/math/smoke/frexpf16_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for frexpf16 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FrexpTest.h"
+
+#include "src/math/frexpf16.h"
+
+LIST_FREXP_TESTS(float16, LIBC_NAMESPACE::frexpf16);
diff --git a/libc/test/src/math/smoke/ilogbf16_test.cpp b/libc/test/src/math/smoke/ilogbf16_test.cpp
new file mode 100644
index 0000000..e046709
--- /dev/null
+++ b/libc/test/src/math/smoke/ilogbf16_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for ilogbf16 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/math/ilogbf16.h"
+
+LIST_INTLOGB_TESTS(int, float16, LIBC_NAMESPACE::ilogbf16);
diff --git a/libc/test/src/math/smoke/llogbf16_test.cpp b/libc/test/src/math/smoke/llogbf16_test.cpp
new file mode 100644
index 0000000..8907681
--- /dev/null
+++ b/libc/test/src/math/smoke/llogbf16_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for llogbf16 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/math/llogbf16.h"
+
+LIST_INTLOGB_TESTS(long, float16, LIBC_NAMESPACE::llogbf16);
diff --git a/libc/test/src/math/smoke/logbf16_test.cpp b/libc/test/src/math/smoke/logbf16_test.cpp
new file mode 100644
index 0000000..cfc1a05
--- /dev/null
+++ b/libc/test/src/math/smoke/logbf16_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for logbf16 ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LogbTest.h"
+
+#include "src/math/logbf16.h"
+
+LIST_LOGB_TESTS(float16, LIBC_NAMESPACE::logbf16)
diff --git a/libc/test/src/math/smoke/modff16_test.cpp b/libc/test/src/math/smoke/modff16_test.cpp
new file mode 100644
index 0000000..7093377
--- /dev/null
+++ b/libc/test/src/math/smoke/modff16_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for modff16 ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ModfTest.h"
+
+#include "src/math/modff16.h"
+
+LIST_MODF_TESTS(float16, LIBC_NAMESPACE::modff16)
diff --git a/libc/test/src/math/smoke/nanf16_test.cpp b/libc/test/src/math/smoke/nanf16_test.cpp
new file mode 100644
index 0000000..ec17a73
--- /dev/null
+++ b/libc/test/src/math/smoke/nanf16_test.cpp
@@ -0,0 +1,51 @@
+//===-- Unittests for nanf16 ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/sanitizer.h"
+#include "src/math/nanf16.h"
+#include "test/UnitTest/FEnvSafeTest.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <signal.h>
+
+class LlvmLibcNanf16Test : public LIBC_NAMESPACE::testing::FEnvSafeTest {
+public:
+ using StorageType = LIBC_NAMESPACE::fputil::FPBits<float16>::StorageType;
+
+ void run_test(const char *input_str, StorageType bits) {
+ float16 result = LIBC_NAMESPACE::nanf16(input_str);
+ auto actual_fp = LIBC_NAMESPACE::fputil::FPBits<float16>(result);
+ auto expected_fp = LIBC_NAMESPACE::fputil::FPBits<float16>(bits);
+ EXPECT_EQ(actual_fp.uintval(), expected_fp.uintval());
+ };
+};
+
+TEST_F(LlvmLibcNanf16Test, NCharSeq) {
+ run_test("", 0x7e00);
+ run_test("123", 0x7e7b);
+ run_test("0x123", 0x7f23);
+ run_test("1a", 0x7e00);
+ run_test("1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM_",
+ 0x7e00);
+ run_test("10000000000000000000000000000000000000000000000000", 0x7e00);
+}
+
+TEST_F(LlvmLibcNanf16Test, RandomString) {
+ run_test(" 1234", 0x7e00);
+ run_test("-1234", 0x7e00);
+ run_test("asd&f", 0x7e00);
+ run_test("123 ", 0x7e00);
+}
+
+#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+TEST_F(LlvmLibcNanf16Test, InvalidInput) {
+ EXPECT_DEATH([] { LIBC_NAMESPACE::nanf16(nullptr); }, WITH_SIGNAL(SIGSEGV));
+}
+#endif // LIBC_HAVE_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/remquof128_test.cpp b/libc/test/src/math/smoke/remquof128_test.cpp
new file mode 100644
index 0000000..8ef6c3b
--- /dev/null
+++ b/libc/test/src/math/smoke/remquof128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for remquof128 ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RemQuoTest.h"
+
+#include "src/math/remquof128.h"
+
+LIST_REMQUO_TESTS(float128, LIBC_NAMESPACE::remquof128)
diff --git a/libc/test/src/math/smoke/remquof16_test.cpp b/libc/test/src/math/smoke/remquof16_test.cpp
new file mode 100644
index 0000000..18f2aba
--- /dev/null
+++ b/libc/test/src/math/smoke/remquof16_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for remquof16 -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RemQuoTest.h"
+
+#include "src/math/remquof16.h"
+
+LIST_REMQUO_TESTS(float16, LIBC_NAMESPACE::remquof16)
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index cfe1f44..d65b7ce 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -250,6 +250,7 @@ set(files
__chrono/convert_to_tm.h
__chrono/day.h
__chrono/duration.h
+ __chrono/exception.h
__chrono/file_clock.h
__chrono/formatter.h
__chrono/hh_mm_ss.h
@@ -276,6 +277,7 @@ set(files
__chrono/year_month.h
__chrono/year_month_day.h
__chrono/year_month_weekday.h
+ __chrono/zoned_time.h
__compare/common_comparison_category.h
__compare/compare_partial_order_fallback.h
__compare/compare_strong_order_fallback.h
diff --git a/libcxx/include/__chrono/exception.h b/libcxx/include/__chrono/exception.h
new file mode 100644
index 0000000..75fd061
--- /dev/null
+++ b/libcxx/include/__chrono/exception.h
@@ -0,0 +1,129 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// For information see https://libcxx.llvm.org/DesignDocs/TimeZone.html
+
+#ifndef _LIBCPP___CHRONO_EXCEPTION_H
+#define _LIBCPP___CHRONO_EXCEPTION_H
+
+#include <version>
+// Enable the contents of the header only when libc++ was built with experimental features enabled.
+#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+# include <__chrono/calendar.h>
+# include <__chrono/local_info.h>
+# include <__chrono/time_point.h>
+# include <__config>
+# include <__configuration/availability.h>
+# include <__verbose_abort>
+# include <format>
+# include <stdexcept>
+# include <string>
+
+# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+# endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+# if _LIBCPP_STD_VER >= 20
+
+namespace chrono {
+
+class nonexistent_local_time : public runtime_error {
+public:
+ template <class _Duration>
+ _LIBCPP_HIDE_FROM_ABI nonexistent_local_time(const local_time<_Duration>& __time, const local_info& __info)
+ : runtime_error{__create_message(__time, __info)} {
+ // [time.zone.exception.nonexist]/2
+ // Preconditions: i.result == local_info::nonexistent is true.
+ // The value of __info.result is not used.
+ _LIBCPP_ASSERT_PEDANTIC(__info.result == local_info::nonexistent,
+ "creating an nonexistent_local_time from a local_info that is not non-existent");
+ }
+
+ _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI ~nonexistent_local_time() override; // exported as key function
+
+private:
+ template <class _Duration>
+ _LIBCPP_HIDE_FROM_ABI string __create_message(const local_time<_Duration>& __time, const local_info& __info) {
+ return std::format(
+ R"({} is in a gap between
+{} {} and
+{} {} which are both equivalent to
+{} UTC)",
+ __time,
+ local_seconds{__info.first.end.time_since_epoch()} + __info.first.offset,
+ __info.first.abbrev,
+ local_seconds{__info.second.begin.time_since_epoch()} + __info.second.offset,
+ __info.second.abbrev,
+ __info.first.end);
+ }
+};
+
+template <class _Duration>
+_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_nonexistent_local_time(
+ [[maybe_unused]] const local_time<_Duration>& __time, [[maybe_unused]] const local_info& __info) {
+# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+ throw nonexistent_local_time(__time, __info);
+# else
+ _LIBCPP_VERBOSE_ABORT("nonexistent_local_time was thrown in -fno-exceptions mode");
+# endif
+}
+
+class ambiguous_local_time : public runtime_error {
+public:
+ template <class _Duration>
+ _LIBCPP_HIDE_FROM_ABI ambiguous_local_time(const local_time<_Duration>& __time, const local_info& __info)
+ : runtime_error{__create_message(__time, __info)} {
+ // [time.zone.exception.ambig]/2
+ // Preconditions: i.result == local_info::ambiguous is true.
+ // The value of __info.result is not used.
+ _LIBCPP_ASSERT_PEDANTIC(__info.result == local_info::ambiguous,
+ "creating an ambiguous_local_time from a local_info that is not ambiguous");
+ }
+
+ _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI ~ambiguous_local_time() override; // exported as key function
+
+private:
+ template <class _Duration>
+ _LIBCPP_HIDE_FROM_ABI string __create_message(const local_time<_Duration>& __time, const local_info& __info) {
+ return std::format(
+ // There are two spaces after the full-stop; this has been verified
+ // in the sources of the Standard.
+ R"({0} is ambiguous. It could be
+{0} {1} == {2} UTC or
+{0} {3} == {4} UTC)",
+ __time,
+ __info.first.abbrev,
+ __time - __info.first.offset,
+ __info.second.abbrev,
+ __time - __info.second.offset);
+ }
+};
+
+template <class _Duration>
+_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_ambiguous_local_time(
+ [[maybe_unused]] const local_time<_Duration>& __time, [[maybe_unused]] const local_info& __info) {
+# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+ throw ambiguous_local_time(__time, __info);
+# else
+ _LIBCPP_VERBOSE_ABORT("ambiguous_local_time was thrown in -fno-exceptions mode");
+# endif
+}
+
+} // namespace chrono
+
+# endif // _LIBCPP_STD_VER >= 20
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+#endif // _LIBCPP___CHRONO_EXCEPTION_H
diff --git a/libcxx/include/__chrono/time_zone.h b/libcxx/include/__chrono/time_zone.h
index 91ddab8..de11dac 100644
--- a/libcxx/include/__chrono/time_zone.h
+++ b/libcxx/include/__chrono/time_zone.h
@@ -16,12 +16,16 @@
// Enable the contents of the header only when libc++ was built with experimental features enabled.
#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+# include <__chrono/calendar.h>
# include <__chrono/duration.h>
+# include <__chrono/exception.h>
+# include <__chrono/local_info.h>
# include <__chrono/sys_info.h>
# include <__chrono/system_clock.h>
# include <__compare/strong_order.h>
# include <__config>
# include <__memory/unique_ptr.h>
+# include <__type_traits/common_type.h>
# include <string_view>
# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -38,6 +42,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
namespace chrono {
+enum class choose { earliest, latest };
+
class _LIBCPP_AVAILABILITY_TZDB time_zone {
_LIBCPP_HIDE_FROM_ABI time_zone() = default;
@@ -63,12 +69,91 @@ public:
return __get_info(chrono::time_point_cast<seconds>(__time));
}
+ template <class _Duration>
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI local_info get_info(const local_time<_Duration>& __time) const {
+ return __get_info(chrono::time_point_cast<seconds>(__time));
+ }
+
+ // We don't apply nodiscard here since this function throws on many inputs,
+ // so it could be used as a validation.
+ template <class _Duration>
+ _LIBCPP_HIDE_FROM_ABI sys_time<common_type_t<_Duration, seconds>> to_sys(const local_time<_Duration>& __time) const {
+ local_info __info = get_info(__time);
+ switch (__info.result) {
+ case local_info::unique:
+ return sys_time<common_type_t<_Duration, seconds>>{__time.time_since_epoch() - __info.first.offset};
+
+ case local_info::nonexistent:
+ chrono::__throw_nonexistent_local_time(__time, __info);
+
+ case local_info::ambiguous:
+ chrono::__throw_ambiguous_local_time(__time, __info);
+ }
+
+ // TODO TZDB The Standard does not specify anything in these cases.
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __info.result != -1, "cannot convert the local time; it would be before the minimum system clock value");
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __info.result != -2, "cannot convert the local time; it would be after the maximum system clock value");
+
+ return {};
+ }
+
+ template <class _Duration>
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI sys_time<common_type_t<_Duration, seconds>>
+ to_sys(const local_time<_Duration>& __time, choose __z) const {
+ local_info __info = get_info(__time);
+ switch (__info.result) {
+ case local_info::unique:
+ case local_info::nonexistent: // first and second are the same
+ return sys_time<common_type_t<_Duration, seconds>>{__time.time_since_epoch() - __info.first.offset};
+
+ case local_info::ambiguous:
+ switch (__z) {
+ case choose::earliest:
+ return sys_time<common_type_t<_Duration, seconds>>{__time.time_since_epoch() - __info.first.offset};
+
+ case choose::latest:
+ return sys_time<common_type_t<_Duration, seconds>>{__time.time_since_epoch() - __info.second.offset};
+
+ // Note a value out of bounds is not specified.
+ }
+ }
+
+ // TODO TZDB The standard does not specify anything in these cases.
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __info.result != -1, "cannot convert the local time; it would be before the minimum system clock value");
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __info.result != -2, "cannot convert the local time; it would be after the maximum system clock value");
+
+ return {};
+ }
+
+ template <class _Duration>
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI local_time<common_type_t<_Duration, seconds>>
+ to_local(const sys_time<_Duration>& __time) const {
+ using _Dp = common_type_t<_Duration, seconds>;
+
+ sys_info __info = get_info(__time);
+
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __info.offset >= chrono::seconds{0} || __time.time_since_epoch() >= _Dp::min() - __info.offset,
+ "cannot convert the system time; it would be before the minimum local clock value");
+
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __info.offset <= chrono::seconds{0} || __time.time_since_epoch() <= _Dp::max() - __info.offset,
+ "cannot convert the system time; it would be after the maximum local clock value");
+
+ return local_time<_Dp>{__time.time_since_epoch() + __info.offset};
+ }
+
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI const __impl& __implementation() const noexcept { return *__impl_; }
private:
[[nodiscard]] _LIBCPP_EXPORTED_FROM_ABI string_view __name() const noexcept;
[[nodiscard]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI sys_info __get_info(sys_seconds __time) const;
+ [[nodiscard]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI local_info __get_info(local_seconds __time) const;
unique_ptr<__impl> __impl_;
};
diff --git a/libcxx/include/__chrono/zoned_time.h b/libcxx/include/__chrono/zoned_time.h
new file mode 100644
index 0000000..c608442
--- /dev/null
+++ b/libcxx/include/__chrono/zoned_time.h
@@ -0,0 +1,55 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// For information see https://libcxx.llvm.org/DesignDocs/TimeZone.html
+
+#ifndef _LIBCPP___CHRONO_ZONED_TIME_H
+#define _LIBCPP___CHRONO_ZONED_TIME_H
+
+#include <version>
+// Enable the contents of the header only when libc++ was built with experimental features enabled.
+#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+# include <__chrono/time_zone.h>
+# include <__chrono/tzdb_list.h>
+# include <__config>
+# include <__fwd/string_view.h>
+
+# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+# endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+# if _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) && \
+ !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+
+namespace chrono {
+
+template <class>
+struct zoned_traits {};
+
+template <>
+struct zoned_traits<const time_zone*> {
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static const time_zone* default_zone() { return chrono::locate_zone("UTC"); }
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static const time_zone* locate_zone(string_view __name) {
+ return chrono::locate_zone(__name);
+ }
+};
+
+} // namespace chrono
+
+# endif // _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM)
+ // && !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+#endif // _LIBCPP___CHRONO_ZONED_TIME_H
diff --git a/libcxx/include/__format/escaped_output_table.h b/libcxx/include/__format/escaped_output_table.h
index 6aa91c8..f7be2dc 100644
--- a/libcxx/include/__format/escaped_output_table.h
+++ b/libcxx/include/__format/escaped_output_table.h
@@ -833,7 +833,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[711] = {
/// more details.
///
-/// \pre The code point is a valid Unicode code point.
+/// \\pre The code point is a valid Unicode code point.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __needs_escape(const char32_t __code_point) noexcept {
// The entries in the gap at the end.
diff --git a/libcxx/include/__format/width_estimation_table.h b/libcxx/include/__format/width_estimation_table.h
index c9a9f671..11f61de 100644
--- a/libcxx/include/__format/width_estimation_table.h
+++ b/libcxx/include/__format/width_estimation_table.h
@@ -237,7 +237,7 @@ inline constexpr uint32_t __table_upper_bound = 0x0003fffd;
/// Returns the estimated width of a Unicode code point.
///
-/// \pre The code point is a valid Unicode code point.
+/// \\pre The code point is a valid Unicode code point.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int __estimated_width(const char32_t __code_point) noexcept {
// Since __table_upper_bound contains the unshifted range do the
// comparison without shifting.
diff --git a/libcxx/include/__type_traits/promote.h b/libcxx/include/__type_traits/promote.h
index e22b4a4..2b2a684 100644
--- a/libcxx/include/__type_traits/promote.h
+++ b/libcxx/include/__type_traits/promote.h
@@ -11,8 +11,12 @@
#include <__config>
#include <__type_traits/integral_constant.h>
-#include <__type_traits/is_same.h>
-#include <__utility/declval.h>
+#include <__type_traits/is_arithmetic.h>
+
+#if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER == 1700
+# include <__type_traits/is_same.h>
+# include <__utility/declval.h>
+#endif
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -20,6 +24,34 @@
_LIBCPP_BEGIN_NAMESPACE_STD
+// TODO(LLVM-20): Remove this workaround
+#if !defined(_LIBCPP_CLANG_VER) || _LIBCPP_CLANG_VER != 1700
+
+template <class... _Args>
+class __promote {
+ static_assert((is_arithmetic<_Args>::value && ...));
+
+ static float __test(float);
+ static double __test(char);
+ static double __test(int);
+ static double __test(unsigned);
+ static double __test(long);
+ static double __test(unsigned long);
+ static double __test(long long);
+ static double __test(unsigned long long);
+# ifndef _LIBCPP_HAS_NO_INT128
+ static double __test(__int128_t);
+ static double __test(__uint128_t);
+# endif
+ static double __test(double);
+ static long double __test(long double);
+
+public:
+ using type = decltype((__test(_Args()) + ...));
+};
+
+#else
+
template <class _Tp>
struct __numeric_type {
static void __test(...);
@@ -31,10 +63,10 @@ struct __numeric_type {
static double __test(unsigned long);
static double __test(long long);
static double __test(unsigned long long);
-#ifndef _LIBCPP_HAS_NO_INT128
+# ifndef _LIBCPP_HAS_NO_INT128
static double __test(__int128_t);
static double __test(__uint128_t);
-#endif
+# endif
static double __test(double);
static long double __test(long double);
@@ -89,6 +121,8 @@ public:
template <class _A1, class _A2 = void, class _A3 = void>
class __promote : public __promote_imp<_A1, _A2, _A3> {};
+#endif // !defined(_LIBCPP_CLANG_VER) || _LIBCPP_CLANG_VER != 1700
+
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___TYPE_TRAITS_PROMOTE_H
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index 96a3e92..c1a92595 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -724,6 +724,10 @@ const time_zone* current_zone()
const tzdb& reload_tzdb(); // C++20
string remote_version(); // C++20
+// [time.zone.exception], exception classes
+class nonexistent_local_time; // C++20
+class ambiguous_local_time; // C++20
+
// [time.zone.info], information classes
struct sys_info { // C++20
sys_seconds begin;
@@ -763,10 +767,28 @@ class time_zone {
template<class Duration>
sys_info get_info(const sys_time<Duration>& st) const;
+
+ template<class Duration>
+ local_info get_info(const local_time<Duration>& tp) const;
+
+ template<class Duration>
+ sys_time<common_type_t<Duration, seconds>>
+ to_sys(const local_time<Duration>& tp) const;
+
+ template<class Duration>
+ sys_time<common_type_t<Duration, seconds>>
+ to_sys(const local_time<Duration>& tp, choose z) const;
+
+ template<class Duration>
+ local_time<common_type_t<Duration, seconds>>
+ to_local(const sys_time<Duration>& tp) const;
};
bool operator==(const time_zone& x, const time_zone& y) noexcept; // C++20
strong_ordering operator<=>(const time_zone& x, const time_zone& y) noexcept; // C++20
+// [time.zone.zonedtraits], class template zoned_traits
+template<class T> struct zoned_traits; // C++20
+
// [time.zone.leap], leap second support
class leap_second { // C++20
public:
@@ -912,6 +934,7 @@ constexpr chrono::year operator ""y(unsigned lo
#if _LIBCPP_STD_VER >= 20
# include <__chrono/calendar.h>
# include <__chrono/day.h>
+# include <__chrono/exception.h>
# include <__chrono/hh_mm_ss.h>
# include <__chrono/literals.h>
# include <__chrono/local_info.h>
@@ -939,6 +962,7 @@ constexpr chrono::year operator ""y(unsigned lo
# include <__chrono/time_zone_link.h>
# include <__chrono/tzdb.h>
# include <__chrono/tzdb_list.h>
+# include <__chrono/zoned_time.h>
# endif
#endif
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 48391b2..892d2c6 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1101,6 +1101,7 @@ module std_private_chrono_duration [system] {
header "__chrono/duration.h"
export std_private_type_traits_is_convertible
}
+module std_private_chrono_exception [system] { header "__chrono/exception.h" }
module std_private_chrono_file_clock [system] { header "__chrono/file_clock.h" }
module std_private_chrono_formatter [system] {
header "__chrono/formatter.h"
@@ -1113,7 +1114,10 @@ module std_private_chrono_high_resolution_clock [system] {
}
module std_private_chrono_leap_second [system] { header "__chrono/leap_second.h" }
module std_private_chrono_literals [system] { header "__chrono/literals.h" }
-module std_private_chrono_local_info [system] { header "__chrono/local_info.h" }
+module std_private_chrono_local_info [system] {
+ header "__chrono/local_info.h"
+ export std_private_chrono_sys_info
+}
module std_private_chrono_month [system] { header "__chrono/month.h" }
module std_private_chrono_month_weekday [system] { header "__chrono/month_weekday.h" }
module std_private_chrono_monthday [system] { header "__chrono/monthday.h" }
@@ -1155,6 +1159,7 @@ module std_private_chrono_year [system] { header "__chrono/yea
module std_private_chrono_year_month [system] { header "__chrono/year_month.h" }
module std_private_chrono_year_month_day [system] { header "__chrono/year_month_day.h" }
module std_private_chrono_year_month_weekday [system] { header "__chrono/year_month_weekday.h" }
+module std_private_chrono_zoned_time [system] { header "__chrono/zoned_time.h" }
module std_private_compare_common_comparison_category [system] { header "__compare/common_comparison_category.h" }
module std_private_compare_compare_partial_order_fallback [system] { header "__compare/compare_partial_order_fallback.h" }
diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc
index 813322a..87e32af 100644
--- a/libcxx/modules/std/chrono.inc
+++ b/libcxx/modules/std/chrono.inc
@@ -209,13 +209,12 @@ export namespace std {
using std::chrono::reload_tzdb;
using std::chrono::remote_version;
-# if 0
+# endif // !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&
+ // !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+
// [time.zone.exception], exception classes
using std::chrono::ambiguous_local_time;
using std::chrono::nonexistent_local_time;
-# endif // if 0
-# endif // !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&
- // !defined(_LIBCPP_HAS_NO_LOCALIZATION)
// [time.zone.info], information classes
using std::chrono::local_info;
@@ -224,18 +223,14 @@ export namespace std {
# if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) && \
!defined(_LIBCPP_HAS_NO_LOCALIZATION)
-# if 0
// [time.zone.timezone], class time_zone
using std::chrono::choose;
-# endif // if 0
-
using std::chrono::time_zone;
-# if 0
-
// [time.zone.zonedtraits], class template zoned_traits
using std::chrono::zoned_traits;
+# if 0
// [time.zone.zonedtime], class template zoned_time
using std::chrono::zoned_time;
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 65e6ce2..9e6c703 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -340,6 +340,9 @@ if (LIBCXX_ENABLE_LOCALIZATION AND LIBCXX_ENABLE_FILESYSTEM AND LIBCXX_ENABLE_TI
include/tzdb/types_private.h
include/tzdb/tzdb_list_private.h
include/tzdb/tzdb_private.h
+ # TODO TZDB The exception could be moved in chrono once the TZDB library
+ # is no longer experimental.
+ chrono_exception.cpp
time_zone.cpp
tzdb.cpp
tzdb_list.cpp
diff --git a/libcxx/src/chrono_exception.cpp b/libcxx/src/chrono_exception.cpp
new file mode 100644
index 0000000..bea2ad1
--- /dev/null
+++ b/libcxx/src/chrono_exception.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <chrono>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace chrono {
+
+_LIBCPP_AVAILABILITY_TZDB
+_LIBCPP_EXPORTED_FROM_ABI nonexistent_local_time::~nonexistent_local_time() = default; // key function
+_LIBCPP_AVAILABILITY_TZDB
+_LIBCPP_EXPORTED_FROM_ABI ambiguous_local_time::~ambiguous_local_time() = default; // key function
+
+} // namespace chrono
+
+_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/src/time_zone.cpp b/libcxx/src/time_zone.cpp
index 928f3d2..764a89a 100644
--- a/libcxx/src/time_zone.cpp
+++ b/libcxx/src/time_zone.cpp
@@ -34,6 +34,7 @@
#include <chrono>
#include <expected>
#include <map>
+#include <numeric>
#include <ranges>
#include "include/tzdb/time_zone_private.h"
@@ -903,6 +904,152 @@ time_zone::__get_info(sys_seconds __time) const {
std::__throw_runtime_error("tzdb: corrupt db");
}
+// Is "__local_time" present in both "__first" and "__second"? If so, the
+// local_info has an ambiguous result.
+[[nodiscard]] static bool
+__is_ambiguous(local_seconds __local_time, const sys_info& __first, const sys_info& __second) {
+ std::chrono::local_seconds __end_first{__first.end.time_since_epoch() + __first.offset};
+ std::chrono::local_seconds __begin_second{__second.begin.time_since_epoch() + __second.offset};
+
+ return __local_time < __end_first && __local_time >= __begin_second;
+}
+
+// Determines the result of the "__local_time". This expects the object
+// "__first" to be earlier in time than "__second".
+[[nodiscard]] static local_info
+__get_info(local_seconds __local_time, const sys_info& __first, const sys_info& __second) {
+ std::chrono::local_seconds __end_first{__first.end.time_since_epoch() + __first.offset};
+ std::chrono::local_seconds __begin_second{__second.begin.time_since_epoch() + __second.offset};
+
+ if (__local_time < __end_first) {
+ if (__local_time >= __begin_second)
+ // |--------|
+ // |------|
+ // ^
+ return {local_info::ambiguous, __first, __second};
+
+ // |--------|
+ // |------|
+ // ^
+ return {local_info::unique, __first, sys_info{}};
+ }
+
+ if (__local_time < __begin_second)
+ // |--------|
+ // |------|
+ // ^
+ return {local_info::nonexistent, __first, __second};
+
+ // |--------|
+ // |------|
+ // ^
+ return {local_info::unique, __second, sys_info{}};
+}
+
+[[nodiscard]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI local_info
+time_zone::__get_info(local_seconds __local_time) const {
+ seconds __local_seconds = __local_time.time_since_epoch();
+
+ /* An example of a typical year with a DST switch displayed in local time.
+ *
+ * At the first of April the time goes forward one hour. This means the
+ * time marked with ~~ is not a valid local time. This is represented by the
+ * nonexistent value in local_info.result.
+ *
+ * At the first of November the time goes backward one hour. This means the
+ * time marked with ^^ happens twice. This is represented by the ambiguous
+ * value in local_info.result.
+ *
+ * 2020.11.01 2021.04.01 2021.11.01
+ * offset +05 offset +05 offset +05
+ * save 0s save 1h save 0s
+ * |------------//----------|
+ * |---------//--------------|
+ * |-------------
+ * ~~ ^^
+ *
+ * These shifts can happen due to changes in the current time zone for a
+ * location. For example, Indian/Kerguelen switched only once. In 1950 from an
+ * offset of 0 hours to an offset of +05 hours.
+ *
+ * During all these shifts the UTC time will not have gaps.
+ */
+
+ // The code needs to determine the system time for the local time. There is no
+ // information available. Assume the offset between system time and local time
+ // is 0s. This gives an initial estimate.
+ sys_seconds __guess{__local_seconds};
+ sys_info __info = __get_info(__guess);
+
+ // At this point the offset can be used to determine an estimate for the local
+ // time. Before doing that, determine the offset and validate whether the
+ // local time is in the range [chrono::local_seconds::min(),
+ // chrono::local_seconds::max()).
+ if (__local_seconds < 0s && __info.offset > 0s)
+ if (__local_seconds - chrono::local_seconds::min().time_since_epoch() < __info.offset)
+ return {-1, __info, {}};
+
+ if (__local_seconds > 0s && __info.offset < 0s)
+ if (chrono::local_seconds::max().time_since_epoch() - __local_seconds < -__info.offset)
+ return {-2, __info, {}};
+
+ // Based on the information found in the sys_info, the local time can be
+ // converted to a system time. This resulting time can be in the following
+ // locations of the sys_info:
+ //
+ // |---------//--------------|
+ // 1 2.1 2.2 2.3 3
+ //
+ // 1. The estimate is before the returned sys_info object.
+ // The result is either non-existent or unique in the previous sys_info.
+ // 2. The estimate is in the sys_info object
+ // - If the sys_info begin is not sys_seconds::min(), then it might be at
+ // 2.1 and could be ambiguous with the previous or unique.
+ // - If sys_info end is not sys_seconds::max(), then it might be at 2.3
+ // and could be ambiguous with the next or unique.
+ // - Else it is at 2.2 and always unique. This case happens when a
+ // time zone has no transitions. For example, UTC or GMT+1.
+ // 3. The estimate is after the returned sys_info object.
+ // The result is either non-existent or unique in the next sys_info.
+ //
+ // There is no specification where the "middle" starts. Similar issues can
+ // happen when sys_info objects are "short", then "unique in the next" could
+ // become "ambiguous in the next and the one following". Theoretically there
+ // is the option of the following time-line
+ //
+ // |------------|
+ // |----|
+ // |-----------------|
+ //
+ // However the local_info object only has 2 sys_info objects, so this option
+ // is not tested.
+
+ sys_seconds __sys_time{__local_seconds - __info.offset};
+ if (__sys_time < __info.begin)
+ // Case 1 before __info
+ return chrono::__get_info(__local_time, __get_info(__info.begin - 1s), __info);
+
+ if (__sys_time >= __info.end)
+ // Case 3 after __info
+ return chrono::__get_info(__local_time, __info, __get_info(__info.end));
+
+ // Case 2 in __info
+ if (__info.begin != sys_seconds::min()) {
+ // Case 2.1: not at the beginning; when the result is not ambiguous here,
+ // fall through to test case 2.3.
+ sys_info __prev = __get_info(__info.begin - 1s);
+ if (__is_ambiguous(__local_time, __prev, __info))
+ return {local_info::ambiguous, __prev, __info};
+ }
+
+ if (__info.end == sys_seconds::max())
+ // At the end so it's case 2.2
+ return {local_info::unique, __info, sys_info{}};
+
+ // This tests case 2.2 or case 2.3.
+ return chrono::__get_info(__local_time, __info, __get_info(__info.end));
+}
+
} // namespace chrono
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp
index a5ce5d1..6fed41b 100644
--- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp
@@ -48,8 +48,13 @@ void test() {
{
std::chrono::sys_seconds s{};
+ std::chrono::local_seconds l{};
+ std::chrono::choose z = std::chrono::choose::earliest;
tz.name(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
tz.get_info(s); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ tz.get_info(l); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ tz.to_sys(l); // not nodiscard
+ tz.to_sys(l, z); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
operator==(tz, tz); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
operator<=>(tz, tz); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
}
@@ -67,4 +72,10 @@ void test() {
leap.date(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
leap.value(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
}
+
+ {
+ using t = std::chrono::zoned_traits<const std::chrono::time_zone*>;
+ t::default_zone(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ t::locate_zone(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ }
}
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.ambig/assert.ctor.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.ambig/assert.ctor.pass.cpp
new file mode 100644
index 0000000..73e6bf2
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.ambig/assert.ctor.pass.cpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// REQUIRES: has-unix-headers
+// REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+
+// <chrono>
+
+// class ambiguous_local_time
+//
+// template<class Duration>
+// ambiguous_local_time(const local_time<Duration>& tp, const local_info& i);
+
+#include <chrono>
+
+#include "check_assertion.h"
+
+// [time.zone.exception.ambig]/2
+// Preconditions: i.result == local_info::ambiguous is true.
+int main(int, char**) {
+ TEST_LIBCPP_ASSERT_FAILURE(
+ (std::chrono::ambiguous_local_time{
+ std::chrono::local_seconds{},
+ std::chrono::local_info{-1, // this is not one of the "named" result values
+ std::chrono::sys_info{},
+ std::chrono::sys_info{}}}),
+ "creating an ambiguous_local_time from a local_info that is not ambiguous");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ (std::chrono::ambiguous_local_time{
+ std::chrono::local_seconds{},
+ std::chrono::local_info{std::chrono::local_info::unique, std::chrono::sys_info{}, std::chrono::sys_info{}}}),
+ "creating an ambiguous_local_time from a local_info that is not ambiguous");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ (std::chrono::ambiguous_local_time{
+ std::chrono::local_seconds{},
+ std::chrono::local_info{
+ std::chrono::local_info::nonexistent, std::chrono::sys_info{}, std::chrono::sys_info{}}}),
+ "creating an ambiguous_local_time from a local_info that is not ambiguous");
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.nonexist/assert.ctor.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.nonexist/assert.ctor.pass.cpp
new file mode 100644
index 0000000..fdd9f79
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.nonexist/assert.ctor.pass.cpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// REQUIRES: has-unix-headers
+// REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+
+// <chrono>
+
+// class nonexistent_local_time
+//
+// template<class Duration>
+// nonexistent_local_time(const local_time<Duration>& tp, const local_info& i);
+
+#include <chrono>
+
+#include "check_assertion.h"
+
+// [time.zone.exception.nonexist]/2
+// Preconditions: i.result == local_info::nonexistent is true.
+int main(int, char**) {
+ TEST_LIBCPP_ASSERT_FAILURE(
+ (std::chrono::nonexistent_local_time{
+ std::chrono::local_seconds{},
+ std::chrono::local_info{-1, // this is not one of the "named" result values
+ std::chrono::sys_info{},
+ std::chrono::sys_info{}}}),
+ "creating an nonexistent_local_time from a local_info that is not non-existent");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ (std::chrono::nonexistent_local_time{
+ std::chrono::local_seconds{},
+ std::chrono::local_info{std::chrono::local_info::unique, std::chrono::sys_info{}, std::chrono::sys_info{}}}),
+ "creating an nonexistent_local_time from a local_info that is not non-existent");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ (std::chrono::nonexistent_local_time{
+ std::chrono::local_seconds{},
+ std::chrono::local_info{
+ std::chrono::local_info::ambiguous, std::chrono::sys_info{}, std::chrono::sys_info{}}}),
+ "creating an nonexistent_local_time from a local_info that is not non-existent");
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/choose.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/choose.pass.cpp
new file mode 100644
index 0000000..23ef9c8
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/choose.pass.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// enum class choose;
+
+#include <chrono>
+#include <type_traits>
+#include <cassert>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+ using E = std::chrono::choose;
+ static_assert(std::is_enum_v<E>);
+
+ // Check that E is a scoped enum by checking for conversions.
+ using UT = std::underlying_type_t<E>;
+ static_assert(!std::is_convertible_v<E, UT>);
+
+ [[maybe_unused]] const E& early = E::earliest;
+ [[maybe_unused]] const E& late = E::latest;
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_local.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_local.pass.cpp
new file mode 100644
index 0000000..d9ca1c8
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_local.pass.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// REQUIRES: has-unix-headers
+// REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// template <class _Duration>
+// local_time<common_type_t<Duration, seconds>>
+// to_local(const sys_time<Duration>& tp) const;
+
+#include <chrono>
+
+#include "check_assertion.h"
+
+// Tests values that cannot be converted. To make sure the test does not depend on changes
+// in the database it uses a time zone with a fixed offset.
+int main(int, char**) {
+ TEST_LIBCPP_ASSERT_FAILURE(std::chrono::locate_zone("Etc/GMT+1")->to_local(std::chrono::sys_seconds::min()),
+ "cannot convert the system time; it would be before the minimum local clock value");
+
+ // TODO TZDB look why std::chrono::sys_seconds::max() fails
+ TEST_LIBCPP_ASSERT_FAILURE(
+ std::chrono::locate_zone("Etc/GMT-1")->to_local(std::chrono::sys_seconds::max() - std::chrono::seconds(1)),
+ "cannot convert the system time; it would be after the maximum local clock value");
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys.pass.cpp
new file mode 100644
index 0000000..3a2ff00
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys.pass.cpp
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// REQUIRES: has-unix-headers
+// REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+
+// <chrono>
+
+// template <class _Duration>
+// sys_time<common_type_t<Duration, seconds>>
+// to_sys(const local_time<Duration>& tp) const;
+
+#include <chrono>
+
+#include "check_assertion.h"
+
+// Tests values that cannot be converted. To make sure the test does not depend on changes
+// in the database it uses a time zone with a fixed offset.
+int main(int, char**) {
+ TEST_LIBCPP_ASSERT_FAILURE(std::chrono::locate_zone("Etc/GMT-1")->to_sys(std::chrono::local_seconds::min()),
+ "cannot convert the local time; it would be before the minimum system clock value");
+
+ // TODO TZDB look why std::chrono::local_seconds::max() fails
+ TEST_LIBCPP_ASSERT_FAILURE(
+ std::chrono::locate_zone("Etc/GMT+1")->to_sys(std::chrono::local_seconds::max() - std::chrono::seconds(1)),
+ "cannot convert the local time; it would be after the maximum system clock value");
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys_choose.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys_choose.pass.cpp
new file mode 100644
index 0000000..6542934
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys_choose.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// REQUIRES: has-unix-headers
+// REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+
+// <chrono>
+
+// template <class _Duration>
+// sys_time<common_type_t<Duration, seconds>>
+// to_sys(const local_time<Duration>& tp, choose z) const;
+
+#include <chrono>
+
+#include "check_assertion.h"
+
+// Tests values that cannot be converted. To make sure the test does not depend on changes
+// in the database it uses a time zone with a fixed offset.
+int main(int, char**) {
+ TEST_LIBCPP_ASSERT_FAILURE(
+ std::chrono::locate_zone("Etc/GMT-1")->to_sys(std::chrono::local_seconds::min(), std::chrono::choose::earliest),
+ "cannot convert the local time; it would be before the minimum system clock value");
+
+ // TODO TZDB look why std::chrono::local_seconds::max() fails
+ TEST_LIBCPP_ASSERT_FAILURE(
+ std::chrono::locate_zone("Etc/GMT+1")
+ ->to_sys(std::chrono::local_seconds::max() - std::chrono::seconds(1), std::chrono::choose::latest),
+ "cannot convert the local time; it would be after the maximum system clock value");
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index 92601fa..b0431d9 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -187,9 +187,6 @@ condition_variable type_traits
condition_variable typeinfo
condition_variable version
coroutine compare
-coroutine cstddef
-coroutine cstdint
-coroutine cstring
coroutine iosfwd
coroutine limits
coroutine type_traits
diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index c05eb42..6fc8fe5 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -188,9 +188,6 @@ condition_variable type_traits
condition_variable typeinfo
condition_variable version
coroutine compare
-coroutine cstddef
-coroutine cstdint
-coroutine cstring
coroutine iosfwd
coroutine limits
coroutine type_traits
diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index 09252b7..5771e2b 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -189,9 +189,6 @@ condition_variable type_traits
condition_variable typeinfo
condition_variable version
coroutine compare
-coroutine cstddef
-coroutine cstdint
-coroutine cstring
coroutine iosfwd
coroutine limits
coroutine type_traits
diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index 09252b7..5771e2b 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -189,9 +189,6 @@ condition_variable type_traits
condition_variable typeinfo
condition_variable version
coroutine compare
-coroutine cstddef
-coroutine cstdint
-coroutine cstring
coroutine iosfwd
coroutine limits
coroutine type_traits
diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index ce4ccc3..406665a 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -111,26 +111,19 @@ charconv limits
charconv new
charconv type_traits
charconv version
-chrono array
chrono bit
-chrono cctype
-chrono cerrno
chrono charconv
-chrono clocale
chrono cmath
chrono compare
chrono concepts
chrono cstddef
chrono cstdint
-chrono cstdlib
chrono cstring
chrono ctime
-chrono cwchar
+chrono format
chrono forward_list
chrono limits
chrono locale
-chrono new
-chrono optional
chrono ostream
chrono ratio
chrono sstream
diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index 62d931c..05c7a6e 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -67,30 +67,20 @@ charconv initializer_list
charconv limits
charconv new
charconv version
-chrono array
-chrono cctype
-chrono cerrno
-chrono clocale
chrono cmath
chrono compare
chrono cstddef
chrono cstdint
-chrono cstdlib
-chrono cstring
chrono ctime
-chrono cwchar
+chrono format
chrono forward_list
chrono limits
-chrono new
-chrono optional
chrono ostream
chrono ratio
chrono sstream
chrono stdexcept
chrono string
chrono string_view
-chrono tuple
-chrono typeinfo
chrono vector
chrono version
cinttypes cstdint
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index f68249a..05c7a6e 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -67,30 +67,20 @@ charconv initializer_list
charconv limits
charconv new
charconv version
-chrono array
-chrono cctype
-chrono cerrno
-chrono clocale
chrono cmath
chrono compare
chrono cstddef
chrono cstdint
-chrono cstdlib
-chrono cstring
chrono ctime
-chrono cwchar
+chrono format
chrono forward_list
chrono limits
-chrono new
-chrono optional
chrono ostream
chrono ratio
chrono sstream
chrono stdexcept
chrono string
chrono string_view
-chrono tuple
-chrono typeinfo
chrono vector
chrono version
cinttypes cstdint
@@ -176,29 +166,6 @@ experimental/simd limits
experimental/type_traits initializer_list
experimental/type_traits type_traits
experimental/utility utility
-experimental/vector experimental/memory_resource
-experimental/vector vector
-ext/hash_map algorithm
-ext/hash_map cmath
-ext/hash_map cstddef
-ext/hash_map cstdint
-ext/hash_map cstring
-ext/hash_map functional
-ext/hash_map initializer_list
-ext/hash_map limits
-ext/hash_map new
-ext/hash_map stdexcept
-ext/hash_map string
-ext/hash_set algorithm
-ext/hash_set cmath
-ext/hash_set cstddef
-ext/hash_set cstdint
-ext/hash_set cstring
-ext/hash_set functional
-ext/hash_set initializer_list
-ext/hash_set limits
-ext/hash_set new
-ext/hash_set string
filesystem compare
filesystem cstddef
filesystem cstdint
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp
index b278419..7a8d096 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp
@@ -19,9 +19,9 @@
#include "test_macros.h"
-int main(int, char**)
-{
- std::shared_timed_mutex m;
+int main(int, char**) {
+ std::shared_timed_mutex m;
+ (void)m;
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp
index d13a0ad..c7f2073 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp
@@ -18,9 +18,9 @@
#include "test_macros.h"
-int main(int, char**)
-{
- std::timed_mutex m;
+int main(int, char**) {
+ std::timed_mutex m;
+ (void)m;
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/default.pass.cpp
index 73e2e7a..3096e03 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/default.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/default.pass.cpp
@@ -18,9 +18,9 @@
#include "test_macros.h"
-int main(int, char**)
-{
- std::recursive_timed_mutex m;
+int main(int, char**) {
+ std::recursive_timed_mutex m;
+ (void)m;
return 0;
}
diff --git a/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.ambig/ctor.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.ambig/ctor.pass.cpp
new file mode 100644
index 0000000..a0b1416
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.ambig/ctor.pass.cpp
@@ -0,0 +1,171 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// class ambiguous_local_time
+//
+// template<class Duration>
+// ambiguous_local_time(const local_time<Duration>& tp, const local_info& i);
+
+#include <chrono>
+#include <string_view>
+
+#include "assert_macros.h"
+#include "concat_macros.h"
+
+template <class Duration>
+static void
+test(const std::chrono::local_time<Duration>& tp, const std::chrono::local_info& i, std::string_view expected) {
+ std::chrono::ambiguous_local_time exception{tp, i};
+ std::string_view result = exception.what();
+ TEST_REQUIRE(result == expected,
+ TEST_WRITE_CONCATENATED("Expected output\n", expected, "\n\nActual output\n", result, '\n'));
+}
+
+// The constructor constructs the runtime_error base class with a specific
+// message. This implicitly tests what() too, since that is inherited from
+// runtime_error there is no separate test for what().
+int main(int, char**) {
+ using namespace std::literals::chrono_literals;
+
+ // There is no requirement on the ordering of PREV and NEXT so an "invalid"
+ // overlap is allowed. All tests with negative dates use the same order as
+ // positive tests.
+
+ test(std::chrono::local_time<std::chrono::nanoseconds>{-1ns},
+ std::chrono::local_info{
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 1970},
+ std::chrono::sys_days{std::chrono::March / 1 / 1970},
+ 1h,
+ 60min,
+ "PREV"},
+
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 1969},
+ std::chrono::sys_days{std::chrono::December / 31 / 1969} + 23h,
+ 0s,
+ 0min,
+ "NEXT"}
+
+ },
+ R"(1969-12-31 23:59:59.999999999 is ambiguous. It could be
+1969-12-31 23:59:59.999999999 PREV == 1969-12-31 22:59:59.999999999 UTC or
+1969-12-31 23:59:59.999999999 NEXT == 1969-12-31 23:59:59.999999999 UTC)");
+
+ test(std::chrono::local_time<std::chrono::microseconds>{0us},
+ std::chrono::local_info{
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 1970},
+ std::chrono::sys_days{std::chrono::March / 1 / 1970},
+ 1h,
+ 60min,
+ "PREV"},
+
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 1969},
+ std::chrono::sys_days{std::chrono::December / 31 / 1969} + 23h,
+ 0s,
+ 0min,
+ "NEXT"}},
+ R"(1970-01-01 00:00:00.000000 is ambiguous. It could be
+1970-01-01 00:00:00.000000 PREV == 1969-12-31 23:00:00.000000 UTC or
+1970-01-01 00:00:00.000000 NEXT == 1970-01-01 00:00:00.000000 UTC)");
+
+ test(std::chrono::local_time<std::chrono::milliseconds>{1ms},
+ std::chrono::local_info{
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 1970},
+ std::chrono::sys_days{std::chrono::March / 1 / 1970},
+ 1h,
+ 60min,
+ "PREV"},
+
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 1969},
+ std::chrono::sys_days{std::chrono::December / 31 / 1969} + 23h,
+ 0s,
+ 0min,
+ "NEXT"}},
+ R"(1970-01-01 00:00:00.001 is ambiguous. It could be
+1970-01-01 00:00:00.001 PREV == 1969-12-31 23:00:00.001 UTC or
+1970-01-01 00:00:00.001 NEXT == 1970-01-01 00:00:00.001 UTC)");
+
+ test(std::chrono::local_seconds{(std::chrono::sys_days{std::chrono::January / 1 / -21970}).time_since_epoch()},
+ std::chrono::local_info{
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / -21969},
+ std::chrono::sys_days{std::chrono::December / 31 / -21969},
+ 0s,
+ 0min,
+ "PREV"},
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / -21970},
+ std::chrono::sys_days{std::chrono::March / 1 / -21970} + 23h,
+ 1h,
+ 60min,
+ "NEXT"}},
+ R"(-21970-01-01 00:00:00 is ambiguous. It could be
+-21970-01-01 00:00:00 PREV == -21970-01-01 00:00:00 UTC or
+-21970-01-01 00:00:00 NEXT == -21971-12-31 23:00:00 UTC)");
+
+ test(
+ std::chrono::local_time<std::chrono::days>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 21970}).time_since_epoch()},
+ std::chrono::local_info{
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 21969},
+ std::chrono::sys_days{std::chrono::December / 31 / 21969},
+ 0s,
+ 0min,
+ "PREV"},
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 21970},
+ std::chrono::sys_days{std::chrono::March / 1 / 21970} + 23h,
+ 1h,
+ 60min,
+ "NEXT"}},
+ R"(21970-01-01 is ambiguous. It could be
+21970-01-01 PREV == 21970-01-01 00:00:00 UTC or
+21970-01-01 NEXT == 21969-12-31 23:00:00 UTC)");
+
+ test(std::chrono::local_time<std::chrono::weeks>{},
+ std::chrono::local_info{
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 1969},
+ std::chrono::sys_days{std::chrono::December / 31 / 1969},
+ 0s,
+ 0min,
+ "PREV"},
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 1970},
+ std::chrono::sys_days{std::chrono::March / 1 / 1970} + 23h,
+ 1h,
+ 60min,
+ "NEXT"}},
+ R"(1970-01-01 is ambiguous. It could be
+1970-01-01 PREV == 1970-01-01 00:00:00 UTC or
+1970-01-01 NEXT == 1969-12-31 23:00:00 UTC)");
+
+  // Note months and years cannot be streamed.
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.ambig/types.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.ambig/types.pass.cpp
new file mode 100644
index 0000000..be54aed
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.ambig/types.pass.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+// ADDITIONAL_COMPILE_FLAGS(clang): -Wno-deprecated
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-deprecated-copy-dtor
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// class ambiguous_local_time : public runtime_error {
+// public:
+// template<class Duration>
+// ambiguous_local_time(const local_time<Duration>& tp, const local_info& i);
+// };
+
+#include <chrono>
+#include <stdexcept>
+#include <type_traits>
+
+// Basic properties
+static_assert(std::is_base_of_v<std::runtime_error, std::chrono::ambiguous_local_time>);
+static_assert(!std::is_default_constructible_v<std::chrono::ambiguous_local_time>);
+static_assert(std::is_destructible_v<std::chrono::ambiguous_local_time>);
+static_assert(std::is_copy_constructible_v<std::chrono::ambiguous_local_time>);
+static_assert(std::is_move_constructible_v<std::chrono::ambiguous_local_time>);
+static_assert(std::is_copy_assignable_v<std::chrono::ambiguous_local_time>);
+static_assert(std::is_move_assignable_v<std::chrono::ambiguous_local_time>);
+
+int main(int, char**) {
+ std::chrono::ambiguous_local_time e{
+ std::chrono::local_seconds{}, std::chrono::local_info{std::chrono::local_info::ambiguous, {}, {}}};
+
+ std::chrono::ambiguous_local_time copy = e;
+ copy = e;
+
+ std::chrono::ambiguous_local_time move = std::move(e);
+ e = move;
+ move = std::move(e);
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.nonexist/ctor.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.nonexist/ctor.pass.cpp
new file mode 100644
index 0000000..ca03d83
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.nonexist/ctor.pass.cpp
@@ -0,0 +1,172 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// class nonexistent_local_time
+//
+// template<class Duration>
+// nonexistent_local_time(const local_time<Duration>& tp, const local_info& i);
+
+#include <chrono>
+#include <string_view>
+
+#include "assert_macros.h"
+#include "concat_macros.h"
+
+template <class Duration>
+static void
+test(const std::chrono::local_time<Duration>& tp, const std::chrono::local_info& i, std::string_view expected) {
+ std::chrono::nonexistent_local_time exception{tp, i};
+ std::string_view result = exception.what();
+ TEST_REQUIRE(result == expected,
+ TEST_WRITE_CONCATENATED("Expected output\n", expected, "\n\nActual output\n", result, '\n'));
+}
+
+// The constructor constructs the runtime_error base class with a specific
+// message. This implicitly tests what() too, since that is inherited from
+// runtime_error there is no separate test for what().
+int main(int, char**) {
+ using namespace std::literals::chrono_literals;
+
+ // There is no requirement on the ordering of PREV and NEXT so an "invalid"
+ // gap is allowed. All tests with negative dates use the same order as
+ // positive tests.
+
+ test(std::chrono::local_time<std::chrono::nanoseconds>{-1ns},
+ std::chrono::local_info{
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 1969},
+ std::chrono::sys_days{std::chrono::December / 31 / 1969} + 23h,
+ 0s,
+ 0min,
+ "PREV"},
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 1970},
+ std::chrono::sys_days{std::chrono::March / 1 / 1970},
+ 1h,
+ 60min,
+ "NEXT"}},
+ R"(1969-12-31 23:59:59.999999999 is in a gap between
+1969-12-31 23:00:00 PREV and
+1970-01-01 01:00:00 NEXT which are both equivalent to
+1969-12-31 23:00:00 UTC)");
+
+ test(std::chrono::local_time<std::chrono::microseconds>{0us},
+ std::chrono::local_info{
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 1969},
+ std::chrono::sys_days{std::chrono::December / 31 / 1969} + 23h,
+ 0s,
+ 0min,
+ "PREV"},
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 1970},
+ std::chrono::sys_days{std::chrono::March / 1 / 1970},
+ 1h,
+ 60min,
+ "NEXT"}},
+ R"(1970-01-01 00:00:00.000000 is in a gap between
+1969-12-31 23:00:00 PREV and
+1970-01-01 01:00:00 NEXT which are both equivalent to
+1969-12-31 23:00:00 UTC)");
+
+ test(std::chrono::local_time<std::chrono::milliseconds>{1ms},
+ std::chrono::local_info{
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 1969},
+ std::chrono::sys_days{std::chrono::December / 31 / 1969} + 23h,
+ 0s,
+ 0min,
+ "PREV"},
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 1970},
+ std::chrono::sys_days{std::chrono::March / 1 / 1970},
+ 1h,
+ 60min,
+ "NEXT"}},
+ R"(1970-01-01 00:00:00.001 is in a gap between
+1969-12-31 23:00:00 PREV and
+1970-01-01 01:00:00 NEXT which are both equivalent to
+1969-12-31 23:00:00 UTC)");
+
+ test(std::chrono::local_seconds{(std::chrono::sys_days{std::chrono::January / 1 / -21970}).time_since_epoch()},
+ std::chrono::local_info{
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / -21969},
+ std::chrono::sys_days{std::chrono::December / 31 / -21969} + 23h,
+ 0s,
+ 0min,
+ "PREV"},
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / -21970},
+ std::chrono::sys_days{std::chrono::March / 1 / -21970},
+ 1h,
+ 60min,
+ "NEXT"}},
+ R"(-21970-01-01 00:00:00 is in a gap between
+-21969-12-31 23:00:00 PREV and
+-21970-01-01 01:00:00 NEXT which are both equivalent to
+-21969-12-31 23:00:00 UTC)");
+
+ test(
+ std::chrono::local_time<std::chrono::days>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 21970}).time_since_epoch()},
+ std::chrono::local_info{
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 21969},
+ std::chrono::sys_days{std::chrono::December / 31 / 21969} + 23h,
+ 0s,
+ 0min,
+ "PREV"},
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 21970},
+ std::chrono::sys_days{std::chrono::March / 1 / 21970},
+ 1h,
+ 60min,
+ "NEXT"}},
+ R"(21970-01-01 is in a gap between
+21969-12-31 23:00:00 PREV and
+21970-01-01 01:00:00 NEXT which are both equivalent to
+21969-12-31 23:00:00 UTC)");
+
+ test(std::chrono::local_time<std::chrono::weeks>{},
+ std::chrono::local_info{
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::September / 1 / 1969},
+ std::chrono::sys_days{std::chrono::December / 31 / 1969} + 23h,
+ 0s,
+ 0min,
+ "PREV"},
+ std::chrono::sys_info{
+ std::chrono::sys_days{std::chrono::January / 1 / 1970},
+ std::chrono::sys_days{std::chrono::March / 1 / 1970},
+ 1h,
+ 60min,
+ "NEXT"}},
+ R"(1970-01-01 is in a gap between
+1969-12-31 23:00:00 PREV and
+1970-01-01 01:00:00 NEXT which are both equivalent to
+1969-12-31 23:00:00 UTC)");
+
+  // Note months and years cannot be streamed.
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.nonexist/types.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.nonexist/types.pass.cpp
new file mode 100644
index 0000000..85ebfab
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.exception/time.zone.exception.nonexist/types.pass.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+// ADDITIONAL_COMPILE_FLAGS(clang): -Wno-deprecated
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-deprecated-copy-dtor
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// class nonexistent_local_time : public runtime_error {
+// public:
+// template<class Duration>
+// nonexistent_local_time(const local_time<Duration>& tp, const local_info& i);
+// };
+
+#include <chrono>
+#include <stdexcept>
+#include <type_traits>
+
+// Basic properties
+static_assert(std::is_base_of_v<std::runtime_error, std::chrono::nonexistent_local_time>);
+static_assert(!std::is_default_constructible_v<std::chrono::nonexistent_local_time>);
+static_assert(std::is_destructible_v<std::chrono::nonexistent_local_time>);
+static_assert(std::is_copy_constructible_v<std::chrono::nonexistent_local_time>);
+static_assert(std::is_move_constructible_v<std::chrono::nonexistent_local_time>);
+static_assert(std::is_copy_assignable_v<std::chrono::nonexistent_local_time>);
+static_assert(std::is_move_assignable_v<std::chrono::nonexistent_local_time>);
+
+int main(int, char**) {
+ std::chrono::nonexistent_local_time e{
+ std::chrono::local_seconds{}, std::chrono::local_info{std::chrono::local_info::nonexistent, {}, {}}};
+
+ std::chrono::nonexistent_local_time copy = e;
+ copy = e;
+
+ std::chrono::nonexistent_local_time move = std::move(e);
+ e = move;
+ move = std::move(e);
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.local_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.local_time.pass.cpp
new file mode 100644
index 0000000..a8c468a
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.local_time.pass.cpp
@@ -0,0 +1,1304 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+// Times out under HWASan
+// XFAIL: hwasan
+
+// <chrono>
+
+// class time_zone;
+
+// template <class _Duration>
+// local_info get_info(const local_time<_Duration>& time) const;
+
+// This test uses the system provided database. This makes the test portable,
+// but may cause failures when the database information changes. Historic data
+// may change if new facts are uncovered, future data may change when regions
+// change their time zone or daylight saving time. Most tests will not look in
+// the future to attempt to avoid issues. All tests list the data on which they
+// are based, which makes debugging easier upon failure, including seeing whether
+// the provided data has not been changed.
+//
+// The first part of the test is manually crafted, the second part compares the
+// transitions for all time zones in the database.
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <format>
+
+#include "test_macros.h"
+#include "assert_macros.h"
+#include "concat_macros.h"
+
+// The year range to validate. The dates used in practice are expected to be
+// inside the tested range.
+constexpr std::chrono::year first{1800};
+constexpr std::chrono::year last{2100};
+
+/***** ***** HELPERS ***** *****/
+
+[[nodiscard]] static std::chrono::sys_seconds to_sys_seconds(
+ std::chrono::year year,
+ std::chrono::month month,
+ std::chrono::day day,
+ std::chrono::hours h = std::chrono::hours(0),
+ std::chrono::minutes m = std::chrono::minutes{0},
+ std::chrono::seconds s = std::chrono::seconds{0}) {
+ std::chrono::year_month_day result{year, month, day};
+
+ return std::chrono::time_point_cast<std::chrono::seconds>(static_cast<std::chrono::sys_days>(result)) + h + m + s;
+}
+
+[[nodiscard]] static std::chrono::local_seconds to_local_seconds(
+ std::chrono::year year,
+ std::chrono::month month,
+ std::chrono::day day,
+ std::chrono::hours h = std::chrono::hours(0),
+ std::chrono::minutes m = std::chrono::minutes{0},
+ std::chrono::seconds s = std::chrono::seconds{0}) {
+ std::chrono::year_month_day result{year, month, day};
+
+ return std::chrono::time_point_cast<std::chrono::seconds>(static_cast<std::chrono::local_days>(result)) + h + m + s;
+}
+
+static void assert_equal(const std::chrono::sys_info& lhs, const std::chrono::sys_info& rhs) {
+ TEST_REQUIRE(lhs.begin == rhs.begin,
+ TEST_WRITE_CONCATENATED("\nBegin:\nExpected output ", lhs.begin, "\nActual output ", rhs.begin, '\n'));
+ TEST_REQUIRE(lhs.end == rhs.end,
+ TEST_WRITE_CONCATENATED("\nEnd:\nExpected output ", lhs.end, "\nActual output ", rhs.end, '\n'));
+ TEST_REQUIRE(
+ lhs.offset == rhs.offset,
+ TEST_WRITE_CONCATENATED("\nOffset:\nExpected output ", lhs.offset, "\nActual output ", rhs.offset, '\n'));
+ TEST_REQUIRE(lhs.save == rhs.save,
+ TEST_WRITE_CONCATENATED("\nSave:\nExpected output ", lhs.save, "\nActual output ", rhs.save, '\n'));
+ TEST_REQUIRE(
+ lhs.abbrev == rhs.abbrev,
+ TEST_WRITE_CONCATENATED("\nAbbrev:\nExpected output ", lhs.abbrev, "\nActual output ", rhs.abbrev, '\n'));
+}
+
+static void assert_equal(const std::chrono::local_info& lhs, const std::chrono::local_info& rhs) {
+ TEST_REQUIRE(
+ lhs.result == rhs.result,
+ TEST_WRITE_CONCATENATED("\nResult:\nExpected output ", lhs.result, "\nActual output ", rhs.result, '\n'));
+
+ assert_equal(lhs.first, rhs.first);
+ assert_equal(lhs.second, rhs.second);
+}
+
+/***** ***** TESTS ***** *****/
+
+static void test_gmt() {
+ // Simple zone always valid, no rule entries, lookup using a link.
+ // L Etc/GMT GMT
+ // Z Etc/GMT 0 - GMT
+
+ using namespace std::literals::chrono_literals;
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("GMT");
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(std::chrono::sys_seconds::min(), std::chrono::sys_seconds::max(), 0s, 0min, "GMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min()));
+}
+
+static void test_local_time_out_of_range() {
+ // Fixed positive offset
+ // Etc/GMT-1 1 - +01
+
+ using namespace std::literals::chrono_literals;
+ { // lower bound
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Etc/GMT-1");
+
+ assert_equal(
+ std::chrono::local_info(
+ -1,
+ std::chrono::sys_info(std::chrono::sys_seconds::min(), std::chrono::sys_seconds::max(), 1h, 0min, "+01"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min()));
+
+ assert_equal(
+ std::chrono::local_info(
+ -1,
+ std::chrono::sys_info(std::chrono::sys_seconds::min(), std::chrono::sys_seconds::max(), 1h, 0min, "+01"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min() + 59min + 59s));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(std::chrono::sys_seconds::min(), std::chrono::sys_seconds::max(), 1h, 0min, "+01"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min() + 1h));
+ }
+
+ { // upper bound
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Etc/GMT+1");
+
+ assert_equal(
+ std::chrono::local_info(
+ -2,
+ std::chrono::sys_info(std::chrono::sys_seconds::min(), std::chrono::sys_seconds::max(), -1h, 0min, "-01"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::max() - 1s));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(std::chrono::sys_seconds::min(), std::chrono::sys_seconds::max(), -1h, 0min, "-01"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::max() - 1h - 1s));
+ }
+}
+
+static void test_indian_kerguelen() {
+ // One change, no rules, no dst changes.
+
+ // Z Indian/Kerguelen 0 - -00 1950
+ // 5 - +05
+
+ using namespace std::literals::chrono_literals;
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Indian/Kerguelen");
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(), to_sys_seconds(1950y, std::chrono::January, 1d), 0s, 0min, "-00"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min()));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(), to_sys_seconds(1950y, std::chrono::January, 1d), 0s, 0min, "-00"),
+ std::chrono::sys_info(
+ to_sys_seconds(1950y, std::chrono::January, 1d), std::chrono::sys_seconds::max(), 5h, 0min, "+05")),
+ tz->get_info(to_local_seconds(1950y, std::chrono::January, 1d)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1950y, std::chrono::January, 1d), std::chrono::sys_seconds::max(), 5h, 0min, "+05"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1950y, std::chrono::January, 1d, 5h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1950y, std::chrono::January, 1d), std::chrono::sys_seconds::max(), 5h, 0min, "+05"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::max() - 1s));
+}
+
+static void test_antarctica_rothera() {
+ // One change, no rules, no dst changes
+
+ // Z Antarctica/Rothera 0 - -00 1976 D
+ // -3 - -03
+
+ using namespace std::literals::chrono_literals;
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Antarctica/Rothera");
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(), to_sys_seconds(1976y, std::chrono::December, 1d), 0s, 0min, "-00"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min()));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(), to_sys_seconds(1976y, std::chrono::December, 1d), 0s, 0min, "-00"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1976y, std::chrono::November, 30d, 20h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(), to_sys_seconds(1976y, std::chrono::December, 1d), 0s, 0min, "-00"),
+ std::chrono::sys_info(
+ to_sys_seconds(1976y, std::chrono::December, 1d), std::chrono::sys_seconds::max(), -3h, 0min, "-03")),
+ tz->get_info(to_local_seconds(1976y, std::chrono::November, 30d, 21h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(), to_sys_seconds(1976y, std::chrono::December, 1d), 0s, 0min, "-00"),
+ std::chrono::sys_info(
+ to_sys_seconds(1976y, std::chrono::December, 1d), std::chrono::sys_seconds::max(), -3h, 0min, "-03")),
+ tz->get_info(to_local_seconds(1976y, std::chrono::November, 30d, 23h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1976y, std::chrono::December, 1d), std::chrono::sys_seconds::max(), -3h, 0min, "-03"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1976y, std::chrono::December, 1d)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1976y, std::chrono::December, 1d), std::chrono::sys_seconds::max(), -3h, 0min, "-03"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::max() - 3h - 1s));
+
+ assert_equal(
+ std::chrono::local_info(
+ -2,
+ std::chrono::sys_info(
+ to_sys_seconds(1976y, std::chrono::December, 1d), std::chrono::sys_seconds::max(), -3h, 0min, "-03"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::max() - 1s));
+}
+
+static void test_asia_hong_kong() {
+  // A more typical entry, first some hard-coded entries and then at the
+ // end a rules based entry. This rule is valid for its entire period
+ //
+ // Z Asia/Hong_Kong 7:36:42 - LMT 1904 O 30 0:36:42
+ // 8 - HKT 1941 Jun 15 3
+ // 8 1 HKST 1941 O 1 4
+ // 8 0:30 HKWT 1941 D 25
+ // 9 - JST 1945 N 18 2
+ // 8 HK HK%sT
+ //
+ // R HK 1946 o - Ap 21 0 1 S
+ // R HK 1946 o - D 1 3:30s 0 -
+ // R HK 1947 o - Ap 13 3:30s 1 S
+ // R HK 1947 o - N 30 3:30s 0 -
+ // R HK 1948 o - May 2 3:30s 1 S
+ // R HK 1948 1952 - O Su>=28 3:30s 0 -
+ // R HK 1949 1953 - Ap Su>=1 3:30 1 S
+ // R HK 1953 1964 - O Su>=31 3:30 0 -
+ // R HK 1954 1964 - Mar Su>=18 3:30 1 S
+ // R HK 1965 1976 - Ap Su>=16 3:30 1 S
+ // R HK 1965 1976 - O Su>=16 3:30 0 -
+ // R HK 1973 o - D 30 3:30 1 S
+ // R HK 1979 o - May 13 3:30 1 S
+ // R HK 1979 o - O 21 3:30 0 -
+
+ using namespace std::literals::chrono_literals;
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Asia/Hong_Kong");
+
+ assert_equal(
+ std::chrono::local_info(
+ -1,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ 7h + 36min + 42s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min()));
+
+ assert_equal(
+ std::chrono::local_info(
+ -1,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ 7h + 36min + 42s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min() + 7h + 36min + 41s));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ 7h + 36min + 42s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min() + 7h + 36min + 42s));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ 7h + 36min + 42s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1904y, std::chrono::October, 30d, 0h, 36min, 41s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ 7h + 36min + 42s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ 8h,
+ 0min,
+ "HKT")),
+ tz->get_info(to_local_seconds(1904y, std::chrono::October, 30d, 0h, 36min, 42s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ 7h + 36min + 42s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ 8h,
+ 0min,
+ "HKT")),
+ tz->get_info(to_local_seconds(1904y, std::chrono::October, 30d, 0h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ 8h,
+ 0min,
+ "HKT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1904y, std::chrono::October, 30d, 1h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ 8h,
+ 0min,
+ "HKT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1941y, std::chrono::June, 15d, 2h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ 8h,
+ 0min,
+ "HKT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ to_sys_seconds(1941y, std::chrono::September, 30d, 19h),
+ 9h,
+ 60min,
+ "HKST")),
+ tz->get_info(to_local_seconds(1941y, std::chrono::June, 15d, 3h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1904y, std::chrono::October, 29d, 17h),
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ 8h,
+ 0min,
+ "HKT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ to_sys_seconds(1941y, std::chrono::September, 30d, 19h),
+ 9h,
+ 60min,
+ "HKST")),
+ tz->get_info(to_local_seconds(1941y, std::chrono::June, 15d, 3h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ to_sys_seconds(1941y, std::chrono::September, 30d, 19h),
+ 9h,
+ 60min,
+ "HKST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1941y, std::chrono::June, 15d, 4h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ to_sys_seconds(1941y, std::chrono::September, 30d, 19h),
+ 9h,
+ 60min,
+ "HKST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1941y, std::chrono::October, 1d, 3h, 29min, 29s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ to_sys_seconds(1941y, std::chrono::September, 30d, 19h),
+ 9h,
+ 60min,
+ "HKST"),
+ std::chrono::sys_info(
+ to_sys_seconds(1941y, std::chrono::September, 30d, 19h),
+ to_sys_seconds(1941y, std::chrono::December, 24d, 15h, 30min),
+ 8h + 30min,
+ 30min,
+ "HKWT")),
+ tz->get_info(to_local_seconds(1941y, std::chrono::October, 1d, 3h, 30min)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ to_sys_seconds(1941y, std::chrono::June, 14d, 19h),
+ to_sys_seconds(1941y, std::chrono::September, 30d, 19h),
+ 9h,
+ 60min,
+ "HKST"),
+ std::chrono::sys_info(
+ to_sys_seconds(1941y, std::chrono::September, 30d, 19h),
+ to_sys_seconds(1941y, std::chrono::December, 24d, 15h, 30min),
+ 8h + 30min,
+ 30min,
+ "HKWT")),
+ tz->get_info(to_local_seconds(1941y, std::chrono::October, 1d, 3h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1941y, std::chrono::September, 30d, 19h),
+ to_sys_seconds(1941y, std::chrono::December, 24d, 15h, 30min),
+ 8h + 30min,
+ 30min,
+ "HKWT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1941y, std::chrono::October, 1d, 4h)));
+}
+
+static void test_europe_berlin() {
+  // A more typical entry, first some hard-coded entries and then at the
+ // end a rules based entry. This rule is valid for its entire period
+ //
+
+ // Z Europe/Berlin 0:53:28 - LMT 1893 Ap
+ // 1 c CE%sT 1945 May 24 2
+ // 1 So CE%sT 1946
+ // 1 DE CE%sT 1980
+ // 1 E CE%sT
+ //
+ // R c 1916 o - Ap 30 23 1 S
+ // R c 1916 o - O 1 1 0 -
+ // R c 1917 1918 - Ap M>=15 2s 1 S
+ // R c 1917 1918 - S M>=15 2s 0 -
+ // R c 1940 o - Ap 1 2s 1 S
+ // R c 1942 o - N 2 2s 0 -
+ // R c 1943 o - Mar 29 2s 1 S
+ // R c 1943 o - O 4 2s 0 -
+ // R c 1944 1945 - Ap M>=1 2s 1 S
+ // R c 1944 o - O 2 2s 0 -
+ // R c 1945 o - S 16 2s 0 -
+ // R c 1977 1980 - Ap Su>=1 2s 1 S
+ // R c 1977 o - S lastSu 2s 0 -
+ // R c 1978 o - O 1 2s 0 -
+ // R c 1979 1995 - S lastSu 2s 0 -
+ // R c 1981 ma - Mar lastSu 2s 1 S
+ // R c 1996 ma - O lastSu 2s 0 -
+ //
+ // R So 1945 o - May 24 2 2 M
+ // R So 1945 o - S 24 3 1 S
+ // R So 1945 o - N 18 2s 0 -
+ //
+ // R DE 1946 o - Ap 14 2s 1 S
+ // R DE 1946 o - O 7 2s 0 -
+ // R DE 1947 1949 - O Su>=1 2s 0 -
+ // R DE 1947 o - Ap 6 3s 1 S
+ // R DE 1947 o - May 11 2s 2 M
+ // R DE 1947 o - Jun 29 3 1 S
+ // R DE 1948 o - Ap 18 2s 1 S
+ // R DE 1949 o - Ap 10 2s 1 S
+ //
+ // R E 1977 1980 - Ap Su>=1 1u 1 S
+ // R E 1977 o - S lastSu 1u 0 -
+ // R E 1978 o - O 1 1u 0 -
+ // R E 1979 1995 - S lastSu 1u 0 -
+ // R E 1981 ma - Mar lastSu 1u 1 S
+ // R E 1996 ma - O lastSu 1u 0 -
+ //
+ // Note the European Union decided to stop the seasonal change in
+ // 2021. In 2023 seasonal changes are still in effect.
+
+ using namespace std::literals::chrono_literals;
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Europe/Berlin");
+
+ assert_equal(
+ std::chrono::local_info(
+ -1,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1893y, std::chrono::March, 31d, 23h, 6min, 32s),
+ 53min + 28s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min()));
+
+ assert_equal(
+ std::chrono::local_info(
+ -1,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1893y, std::chrono::March, 31d, 23h, 6min, 32s),
+ 53min + 28s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min() + 53min + 27s));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1893y, std::chrono::March, 31d, 23h, 6min, 32s),
+ 53min + 28s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min() + 53min + 28s));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1893y, std::chrono::March, 31d, 23h, 6min, 32s),
+ 53min + 28s,
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1893y, std::chrono::March, 31d, 23h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1946y, std::chrono::October, 7d, 1h),
+ to_sys_seconds(1947y, std::chrono::April, 6d, 2h),
+ 1h,
+ 0min,
+ "CET"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::April, 6d, 2h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1946y, std::chrono::October, 7d, 1h),
+ to_sys_seconds(1947y, std::chrono::April, 6d, 2h),
+ 1h,
+ 0min,
+ "CET"),
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::April, 6d, 2h),
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ 2h,
+ 60min,
+ "CEST")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::April, 6d, 3h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1946y, std::chrono::October, 7d, 1h),
+ to_sys_seconds(1947y, std::chrono::April, 6d, 2h),
+ 1h,
+ 0min,
+ "CET"),
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::April, 6d, 2h),
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ 2h,
+ 60min,
+ "CEST")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::April, 6d, 3h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::April, 6d, 2h),
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ 2h,
+ 60min,
+ "CEST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::April, 6d, 4h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::April, 6d, 2h),
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ 2h,
+ 60min,
+ "CEST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::May, 11d, 2h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::April, 6d, 2h),
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ 2h,
+ 60min,
+ "CEST"),
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ 3h,
+ 120min,
+ "CEMT")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::May, 11d, 3h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::April, 6d, 2h),
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ 2h,
+ 60min,
+ "CEST"),
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ 3h,
+ 120min,
+ "CEMT")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::May, 11d, 3h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ 3h,
+ 120min,
+ "CEMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::May, 11d, 4h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ 3h,
+ 120min,
+ "CEMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::June, 29d, 1h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ 3h,
+ 120min,
+ "CEMT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ to_sys_seconds(1947y, std::chrono::October, 5d, 1h),
+ 2h,
+ 60min,
+ "CEST")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::June, 29d, 2h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::May, 11d, 1h),
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ 3h,
+ 120min,
+ "CEMT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ to_sys_seconds(1947y, std::chrono::October, 5d, 1h),
+ 2h,
+ 60min,
+ "CEST")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::June, 29d, 2h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ to_sys_seconds(1947y, std::chrono::October, 5d, 1h),
+ 2h,
+ 60min,
+ "CEST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::June, 29d, 3h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ to_sys_seconds(1947y, std::chrono::October, 5d, 1h),
+ 2h,
+ 60min,
+ "CEST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::October, 5d, 1h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ to_sys_seconds(1947y, std::chrono::October, 5d, 1h),
+ 2h,
+ 60min,
+ "CEST"),
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::October, 5d, 1h),
+ to_sys_seconds(1948y, std::chrono::April, 18d, 1h),
+ 1h,
+ 0min,
+ "CET")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::October, 5d, 2h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::June, 29d),
+ to_sys_seconds(1947y, std::chrono::October, 5d, 1h),
+ 2h,
+ 60min,
+ "CEST"),
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::October, 5d, 1h),
+ to_sys_seconds(1948y, std::chrono::April, 18d, 1h),
+ 1h,
+ 0min,
+ "CET")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::October, 5d, 2h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1947y, std::chrono::October, 5d, 1h),
+ to_sys_seconds(1948y, std::chrono::April, 18d, 1h),
+ 1h,
+ 0min,
+ "CET"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1947y, std::chrono::October, 5d, 3h)));
+}
+
+static void test_europe_dublin() {
+ // Z Europe/Dublin -0:25:21 - LMT 1880 Au 2
+ // -0:25:21 - DMT 1916 May 21 2s
+ // -0:25:21 1 IST 1916 O 1 2s
+ // 0 G %s 1921 D 6
+ // ...
+ //
+ // R G 1916 o - May 21 2s 1 BST
+ // R G 1916 o - O 1 2s 0 GMT
+ // R G 1917 o - Ap 8 2s 1 BST
+ // ...
+
+ using namespace std::literals::chrono_literals;
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Europe/Dublin");
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1880y, std::chrono::August, 2d, 0h, 25min, 21s),
+ -(25min + 21s),
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min()));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1880y, std::chrono::August, 2d, 0h, 25min, 21s),
+ -(25min + 21s),
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1880y, std::chrono::August, 1d, 23h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1880y, std::chrono::August, 2d, 0h, 25min, 21s),
+ to_sys_seconds(1916y, std::chrono::May, 21d, 2h, 25min, 21s),
+ -(25min + 21s),
+ 0min,
+ "DMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1880y, std::chrono::August, 2d)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1880y, std::chrono::August, 2d, 0h, 25min, 21s),
+ to_sys_seconds(1916y, std::chrono::May, 21d, 2h, 25min, 21s),
+ -(25min + 21s),
+ 0min,
+ "DMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1916y, std::chrono::May, 21d, 1h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1880y, std::chrono::August, 2d, 0h, 25min, 21s),
+ to_sys_seconds(1916y, std::chrono::May, 21d, 2h, 25min, 21s),
+ -(25min + 21s),
+ 0min,
+ "DMT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1916y, std::chrono::May, 21d, 2h, 25min, 21s),
+ to_sys_seconds(1916y, std::chrono::October, 1d, 02h, 25min, 21s),
+ 34min + 39s,
+ 60min,
+ "IST")),
+ tz->get_info(to_local_seconds(1916y, std::chrono::May, 21d, 2h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1880y, std::chrono::August, 2d, 0h, 25min, 21s),
+ to_sys_seconds(1916y, std::chrono::May, 21d, 2h, 25min, 21s),
+ -(25min + 21s),
+ 0min,
+ "DMT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1916y, std::chrono::May, 21d, 2h, 25min, 21s),
+ to_sys_seconds(1916y, std::chrono::October, 1d, 02h, 25min, 21s),
+ 34min + 39s,
+ 60min,
+ "IST")),
+ tz->get_info(to_local_seconds(1916y, std::chrono::May, 21d, 2h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1916y, std::chrono::May, 21d, 2h, 25min, 21s),
+ to_sys_seconds(1916y, std::chrono::October, 1d, 02h, 25min, 21s),
+ 34min + 39s,
+ 60min,
+ "IST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1916y, std::chrono::May, 21d, 6h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1916y, std::chrono::May, 21d, 2h, 25min, 21s),
+ to_sys_seconds(1916y, std::chrono::October, 1d, 02h, 25min, 21s),
+ 34min + 39s,
+ 60min,
+ "IST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1916y, std::chrono::October, 1d, 2h, 25min, 20s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ to_sys_seconds(1916y, std::chrono::May, 21d, 2h, 25min, 21s),
+ to_sys_seconds(1916y, std::chrono::October, 1d, 02h, 25min, 21s),
+ 34min + 39s,
+ 60min,
+ "IST"),
+ std::chrono::sys_info(
+ to_sys_seconds(1916y, std::chrono::October, 1d, 02h, 25min, 21s),
+ to_sys_seconds(1917y, std::chrono::April, 8d, 2h),
+ 0s,
+ 0min,
+ "GMT")),
+ tz->get_info(to_local_seconds(1916y, std::chrono::October, 1d, 2h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1916y, std::chrono::October, 1d, 02h, 25min, 21s),
+ to_sys_seconds(1917y, std::chrono::April, 8d, 2h),
+ 0s,
+ 0min,
+ "GMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1916y, std::chrono::October, 1d, 3h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1916y, std::chrono::October, 1d, 02h, 25min, 21s),
+ to_sys_seconds(1917y, std::chrono::April, 8d, 2h),
+ 0s,
+ 0min,
+ "GMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1917y, std::chrono::April, 8d, 1h, 59min, 59s)));
+}
+
+static void test_america_st_johns() {
+  // A more typical entry.
+  // Uses letters both when DST is active and when it is not, and has
+  // multiple letters. Uses negative offsets.
+  // Switches several times between its own and the Canadian rules.
+  // Switches the stdoff from -3:30:52 to -3:30 while observing the same rule.
+
+ // Z America/St_Johns -3:30:52 - LMT 1884
+ // -3:30:52 j N%sT 1918
+ // -3:30:52 C N%sT 1919
+ // ...
+ //
+ // R j 1917 o - Ap 8 2 1 D
+ // R j 1917 o - S 17 2 0 S
+ // R j 1919 o - May 5 23 1 D
+ // R j 1919 o - Au 12 23 0 S
+ // R j 1920 1935 - May Su>=1 23 1 D
+ // ...
+ //
+ // R C 1918 o - Ap 14 2 1 D
+ // R C 1918 o - O 27 2 0 S
+ // R C 1942 o - F 9 2 1 W
+ // ...
+
+ using namespace std::literals::chrono_literals;
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("America/St_Johns");
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1884y, std::chrono::January, 1d, 3h, 30min, 52s),
+ -(3h + 30min + 52s),
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(std::chrono::local_seconds::min()));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ std::chrono::sys_seconds::min(),
+ to_sys_seconds(1884y, std::chrono::January, 1d, 3h, 30min, 52s),
+ -(3h + 30min + 52s),
+ 0min,
+ "LMT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1883y, std::chrono::December, 31d, 23h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1884y, std::chrono::January, 1d, 3h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ -(3h + 30min + 52s),
+ 0min,
+ "NST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1884y, std::chrono::January, 1d)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1884y, std::chrono::January, 1d, 3h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ -(3h + 30min + 52s),
+ 0min,
+ "NST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1917y, std::chrono::April, 8d, 1h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1884y, std::chrono::January, 1d, 3h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ -(3h + 30min + 52s),
+ 0min,
+ "NST"),
+ std::chrono::sys_info(
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::September, 17d, 4h, 30min, 52s),
+ -(2h + 30min + 52s),
+ 60min,
+ "NDT")),
+ tz->get_info(to_local_seconds(1917y, std::chrono::April, 8d, 2h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::nonexistent,
+ std::chrono::sys_info(
+ to_sys_seconds(1884y, std::chrono::January, 1d, 3h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ -(3h + 30min + 52s),
+ 0min,
+ "NST"),
+ std::chrono::sys_info(
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::September, 17d, 4h, 30min, 52s),
+ -(2h + 30min + 52s),
+ 60min,
+ "NDT")),
+ tz->get_info(to_local_seconds(1917y, std::chrono::April, 8d, 2h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::September, 17d, 4h, 30min, 52s),
+ -(2h + 30min + 52s),
+ 60min,
+ "NDT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1917y, std::chrono::April, 8d, 3h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::September, 17d, 4h, 30min, 52s),
+ -(2h + 30min + 52s),
+ 60min,
+ "NDT"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1917y, std::chrono::September, 17d, 0h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::September, 17d, 4h, 30min, 52s),
+ -(2h + 30min + 52s),
+ 60min,
+ "NDT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1917y, std::chrono::September, 17d, 4h, 30min, 52s),
+ to_sys_seconds(1918y, std::chrono::April, 14d, 5h, 30min, 52s),
+ -(3h + 30min + 52s),
+ 0min,
+ "NST")),
+ tz->get_info(to_local_seconds(1917y, std::chrono::September, 17d, 1h)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::ambiguous,
+ std::chrono::sys_info(
+ to_sys_seconds(1917y, std::chrono::April, 8d, 5h, 30min, 52s),
+ to_sys_seconds(1917y, std::chrono::September, 17d, 4h, 30min, 52s),
+ -(2h + 30min + 52s),
+ 60min,
+ "NDT"),
+ std::chrono::sys_info(
+ to_sys_seconds(1917y, std::chrono::September, 17d, 4h, 30min, 52s),
+ to_sys_seconds(1918y, std::chrono::April, 14d, 5h, 30min, 52s),
+ -(3h + 30min + 52s),
+ 0min,
+ "NST")),
+ tz->get_info(to_local_seconds(1917y, std::chrono::September, 17d, 1h, 59min, 59s)));
+
+ assert_equal(
+ std::chrono::local_info(
+ std::chrono::local_info::unique,
+ std::chrono::sys_info(
+ to_sys_seconds(1917y, std::chrono::September, 17d, 4h, 30min, 52s),
+ to_sys_seconds(1918y, std::chrono::April, 14d, 5h, 30min, 52s),
+ -(3h + 30min + 52s),
+ 0min,
+ "NST"),
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ tz->get_info(to_local_seconds(1917y, std::chrono::September, 17d, 2h)));
+}
+
+static void validate_transitions(const std::chrono::time_zone& zone) {
+ using namespace std::literals::chrono_literals;
+
+ constexpr auto begin = std::chrono::time_point_cast<std::chrono::seconds>(
+ static_cast<std::chrono::sys_days>(std::chrono::year_month_day{first, std::chrono::January, 1d}));
+ constexpr auto end = std::chrono::time_point_cast<std::chrono::seconds>(
+ static_cast<std::chrono::sys_days>(std::chrono::year_month_day{last, std::chrono::January, 1d}));
+
+ // Builds the set of sys_info objects for the selected time range.
+ std::vector<std::chrono::sys_info> input;
+ std::chrono::sys_seconds s = begin;
+ do {
+ input.emplace_back(zone.get_info(s));
+ s = input.back().end;
+ } while (s < end);
+
+ for (auto previous = input.begin(), next = previous + 1; next != input.end(); ++previous, ++next) {
+    // Now iterates over all adjacent pairs of objects.
+    // For every transition gets the local time of the
+    // - end of the first (a)
+    // - start of the second (b)
+    // Depending on the difference between 'a' and 'b' different tests are done.
+ std::chrono::local_seconds end_previous{previous->end.time_since_epoch() + previous->offset};
+ std::chrono::local_seconds begin_next{next->begin.time_since_epoch() + next->offset};
+
+ if (end_previous == begin_next) {
+ // unique transition
+ // a |------------|
+ // b |----------|
+ // T
+ assert_equal(std::chrono::local_info(
+ std::chrono::local_info::unique,
+ *previous,
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ zone.get_info(end_previous - 1s));
+
+ assert_equal(std::chrono::local_info(
+ std::chrono::local_info::unique,
+ *next,
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ zone.get_info(begin_next));
+
+ } else if (end_previous < begin_next) {
+ // non-existent transition
+ // a |------------|
+ // b |----------|
+ // T T
+ assert_equal(std::chrono::local_info(
+ std::chrono::local_info::unique,
+ *previous,
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ zone.get_info(end_previous - 1s));
+
+ assert_equal(std::chrono::local_info(std::chrono::local_info::nonexistent, *previous, *next),
+ zone.get_info(end_previous));
+
+ assert_equal(std::chrono::local_info(std::chrono::local_info::nonexistent, *previous, *next),
+ zone.get_info(begin_next - 1s));
+
+ assert_equal(std::chrono::local_info(
+ std::chrono::local_info::unique,
+ *next,
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ zone.get_info(begin_next));
+
+ } else {
+ // ambiguous transition
+ // a |------------|
+ // b |----------|
+ // T T
+ assert_equal(std::chrono::local_info(
+ std::chrono::local_info::unique,
+ *previous,
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ zone.get_info(begin_next - 1s));
+
+ assert_equal(std::chrono::local_info(std::chrono::local_info::ambiguous, *previous, *next),
+ zone.get_info(begin_next));
+
+ assert_equal(std::chrono::local_info(std::chrono::local_info::ambiguous, *previous, *next),
+ zone.get_info(end_previous - 1s));
+
+ assert_equal(std::chrono::local_info(
+ std::chrono::local_info::unique,
+ *next,
+ std::chrono::sys_info(std::chrono::sys_seconds(0s), std::chrono::sys_seconds(0s), 0s, 0min, "")),
+ zone.get_info(end_previous));
+ }
+ }
+}
+
+int main(int, const char**) {
+ test_gmt();
+ test_local_time_out_of_range();
+ test_indian_kerguelen();
+ test_antarctica_rothera();
+
+ test_asia_hong_kong();
+ test_europe_berlin();
+ test_europe_dublin();
+ test_america_st_johns();
+
+ const std::chrono::tzdb& tzdb = std::chrono::get_tzdb();
+ for (const auto& zone : tzdb.zones) {
+ validate_transitions(zone);
+ }
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_local.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_local.pass.cpp
new file mode 100644
index 0000000..28d6145
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_local.pass.cpp
@@ -0,0 +1,68 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// class time_zone;
+
+// template <class _Duration>
+// local_time<common_type_t<Duration, seconds>>
+// to_local(const sys_time<Duration>& tp) const;
+
+#include <chrono>
+#include <format>
+#include <cassert>
+#include <string_view>
+
+#include "test_macros.h"
+#include "assert_macros.h"
+#include "concat_macros.h"
+
+int main(int, char**) {
+ // To make sure the test does not depend on changes in the database it uses a
+ // time zone with a fixed offset.
+ using namespace std::literals::chrono_literals;
+
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Etc/GMT+1");
+
+ assert(tz->to_local(std::chrono::sys_time<std::chrono::nanoseconds>{-1ns}) ==
+ std::chrono::local_time<std::chrono::nanoseconds>{-1ns - 1h});
+
+ assert(tz->to_local(std::chrono::sys_time<std::chrono::microseconds>{0us}) ==
+ std::chrono::local_time<std::chrono::microseconds>{0us - 1h});
+
+ assert(tz->to_local(
+ std::chrono::sys_time<std::chrono::seconds>{std::chrono::sys_days{std::chrono::January / 1 / -21970}}) ==
+ std::chrono::local_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / -21970}).time_since_epoch() - 1h});
+
+ assert(
+ tz->to_local(std::chrono::sys_time<std::chrono::days>{std::chrono::sys_days{std::chrono::January / 1 / 21970}}) ==
+ std::chrono::local_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 21970}).time_since_epoch() - 1h});
+
+ assert(tz->to_local(std::chrono::sys_time<std::chrono::weeks>{}) ==
+ std::chrono::local_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 1970}).time_since_epoch() - 1h});
+
+ assert(tz->to_local(std::chrono::sys_time<std::chrono::months>{}) ==
+ std::chrono::local_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 1970}).time_since_epoch() - 1h});
+
+ assert(tz->to_local(std::chrono::sys_time<std::chrono::years>{}) ==
+ std::chrono::local_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 1970}).time_since_epoch() - 1h});
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp
new file mode 100644
index 0000000..874c3d5
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp
@@ -0,0 +1,237 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// class time_zone;
+
+// template <class _Duration>
+// sys_time<common_type_t<Duration, seconds>>
+// to_sys(const local_time<Duration>& tp) const;
+
+#include <chrono>
+#include <format>
+#include <cassert>
+#include <string_view>
+
+#include "test_macros.h"
+#include "assert_macros.h"
+#include "concat_macros.h"
+
+// Tests unique conversions. To make sure the test does not depend on changes
+// in the database it uses a time zone with a fixed offset.
+static void test_unique() {
+ using namespace std::literals::chrono_literals;
+
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Etc/GMT+1");
+
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::nanoseconds>{-1ns}) ==
+ std::chrono::sys_time<std::chrono::nanoseconds>{-1ns + 1h});
+
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::microseconds>{0us}) ==
+ std::chrono::sys_time<std::chrono::microseconds>{1h});
+
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / -21970}).time_since_epoch()}) ==
+ std::chrono::sys_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / -21970}).time_since_epoch() + 1h});
+
+ // sys_time<common_type_t<Duration, seconds>> is seconds for the larger types
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::days>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 21970}).time_since_epoch()}) ==
+ std::chrono::sys_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 21970}).time_since_epoch() + 1h});
+
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::weeks>{}) ==
+ std::chrono::sys_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 1970}).time_since_epoch() + 1h});
+
+  // Note months and years cannot be streamed; thus the function cannot be
+  // instantiated for these types. (Even when there is no exception thrown.)
+}
+
+// Tests non-existent conversions.
+static void test_nonexistent() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+ using namespace std::literals::chrono_literals;
+
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Europe/Berlin");
+
+ // Z Europe/Berlin 0:53:28 - LMT 1893 Ap
+ // ...
+ // 1 DE CE%sT 1980
+ // 1 E CE%sT
+ //
+ // ...
+ // R E 1981 ma - Mar lastSu 1u 1 S
+ // R E 1996 ma - O lastSu 1u 0 -
+
+ // Pick an historic date where it's well known what the time zone rules were.
+ // This makes it unlikely updates to the database change these rules.
+ std::chrono::local_time<std::chrono::seconds> time{
+ (std::chrono::sys_days{std::chrono::March / 30 / 1986} + 2h + 30min).time_since_epoch()};
+
+ // Validates whether the database did not change.
+ std::chrono::local_info info = tz->get_info(time);
+ assert(info.result == std::chrono::local_info::nonexistent);
+
+ TEST_VALIDATE_EXCEPTION(
+ std::chrono::nonexistent_local_time,
+ [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
+ std::string_view what =
+ R"(1986-03-30 02:30:00.000000000 is in a gap between
+1986-03-30 02:00:00 CET and
+1986-03-30 03:00:00 CEST which are both equivalent to
+1986-03-30 01:00:00 UTC)";
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("Expected exception\n", what, "\n\nActual exception\n", e.what(), '\n'));
+ },
+ tz->to_sys(time + 0ns));
+
+ TEST_VALIDATE_EXCEPTION(
+ std::chrono::nonexistent_local_time,
+ [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
+ std::string_view what =
+ R"(1986-03-30 02:30:00.000000 is in a gap between
+1986-03-30 02:00:00 CET and
+1986-03-30 03:00:00 CEST which are both equivalent to
+1986-03-30 01:00:00 UTC)";
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("Expected exception\n", what, "\n\nActual exception\n", e.what(), '\n'));
+ },
+ tz->to_sys(time + 0us));
+
+ TEST_VALIDATE_EXCEPTION(
+ std::chrono::nonexistent_local_time,
+ [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
+ std::string_view what =
+ R"(1986-03-30 02:30:00.000 is in a gap between
+1986-03-30 02:00:00 CET and
+1986-03-30 03:00:00 CEST which are both equivalent to
+1986-03-30 01:00:00 UTC)";
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("Expected exception\n", what, "\n\nActual exception\n", e.what(), '\n'));
+ },
+ tz->to_sys(time + 0ms));
+
+ TEST_VALIDATE_EXCEPTION(
+ std::chrono::nonexistent_local_time,
+ [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
+ std::string_view what =
+ R"(1986-03-30 02:30:00 is in a gap between
+1986-03-30 02:00:00 CET and
+1986-03-30 03:00:00 CEST which are both equivalent to
+1986-03-30 01:00:00 UTC)";
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("Expected exception\n", what, "\n\nActual exception\n", e.what(), '\n'));
+ },
+ tz->to_sys(time + 0s));
+
+#endif // TEST_HAS_NO_EXCEPTIONS
+}
+
+// Tests ambiguous conversions.
+static void test_ambiguous() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+ using namespace std::literals::chrono_literals;
+
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Europe/Berlin");
+
+ // Z Europe/Berlin 0:53:28 - LMT 1893 Ap
+ // ...
+ // 1 DE CE%sT 1980
+ // 1 E CE%sT
+ //
+ // ...
+ // R E 1981 ma - Mar lastSu 1u 1 S
+ // R E 1996 ma - O lastSu 1u 0 -
+
+ // Pick an historic date where it's well known what the time zone rules were.
+ // This makes it unlikely updates to the database change these rules.
+ std::chrono::local_time<std::chrono::seconds> time{
+ (std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h + 30min).time_since_epoch()};
+
+ // Validates whether the database did not change.
+ std::chrono::local_info info = tz->get_info(time);
+ assert(info.result == std::chrono::local_info::ambiguous);
+
+ TEST_VALIDATE_EXCEPTION(
+ std::chrono::ambiguous_local_time,
+ [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
+ std::string_view what =
+ R"(1986-09-28 02:30:00.000000000 is ambiguous. It could be
+1986-09-28 02:30:00.000000000 CEST == 1986-09-28 00:30:00.000000000 UTC or
+1986-09-28 02:30:00.000000000 CET == 1986-09-28 01:30:00.000000000 UTC)";
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("Expected exception\n", what, "\n\nActual exception\n", e.what(), '\n'));
+ },
+ tz->to_sys(time + 0ns));
+
+ TEST_VALIDATE_EXCEPTION(
+ std::chrono::ambiguous_local_time,
+ [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
+ std::string_view what =
+ R"(1986-09-28 02:30:00.000000 is ambiguous. It could be
+1986-09-28 02:30:00.000000 CEST == 1986-09-28 00:30:00.000000 UTC or
+1986-09-28 02:30:00.000000 CET == 1986-09-28 01:30:00.000000 UTC)";
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("Expected exception\n", what, "\n\nActual exception\n", e.what(), '\n'));
+ },
+ tz->to_sys(time + 0us));
+
+ TEST_VALIDATE_EXCEPTION(
+ std::chrono::ambiguous_local_time,
+ [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
+ std::string_view what =
+ R"(1986-09-28 02:30:00.000 is ambiguous. It could be
+1986-09-28 02:30:00.000 CEST == 1986-09-28 00:30:00.000 UTC or
+1986-09-28 02:30:00.000 CET == 1986-09-28 01:30:00.000 UTC)";
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("Expected exception\n", what, "\n\nActual exception\n", e.what(), '\n'));
+ },
+ tz->to_sys(time + 0ms));
+
+ TEST_VALIDATE_EXCEPTION(
+ std::chrono::ambiguous_local_time,
+ [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
+ std::string_view what =
+ R"(1986-09-28 02:30:00 is ambiguous. It could be
+1986-09-28 02:30:00 CEST == 1986-09-28 00:30:00 UTC or
+1986-09-28 02:30:00 CET == 1986-09-28 01:30:00 UTC)";
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("Expected exception\n", what, "\n\nActual exception\n", e.what(), '\n'));
+ },
+ tz->to_sys(time + 0s));
+
+#endif // TEST_HAS_NO_EXCEPTIONS
+}
+
+// This test does the basic validations of this function. The library function
+// uses `local_info get_info(const local_time<Duration>& tp)` as implementation
+// detail. The get_info function does extensive testing of the data.
+int main(int, char**) {
+ test_unique();
+ test_nonexistent();
+ test_ambiguous();
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp
new file mode 100644
index 0000000..bad4ef3
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp
@@ -0,0 +1,147 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// class time_zone;
+
+// template <class _Duration>
+// sys_time<common_type_t<Duration, seconds>>
+// to_sys(const local_time<Duration>& tp, choose z) const;
+
+#include <chrono>
+#include <format>
+#include <cassert>
+#include <string_view>
+
+#include "test_macros.h"
+
+// Tests unique conversions. To make sure the test does not depend on changes
+// in the database it uses a time zone with a fixed offset.
+static void test_unique() {
+ using namespace std::literals::chrono_literals;
+
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Etc/GMT+1");
+
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::nanoseconds>{-1ns}, std::chrono::choose::earliest) ==
+ std::chrono::sys_time<std::chrono::nanoseconds>{-1ns + 1h});
+
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::microseconds>{0us}, std::chrono::choose::latest) ==
+ std::chrono::sys_time<std::chrono::microseconds>{1h});
+
+ assert(tz->to_sys(
+ std::chrono::local_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / -21970}).time_since_epoch()},
+ std::chrono::choose::earliest) ==
+ std::chrono::sys_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / -21970}).time_since_epoch() + 1h});
+
+ // sys_time<common_type_t<Duration, seconds>> is seconds for the larger types
+ assert(tz->to_sys(
+ std::chrono::local_time<std::chrono::days>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 21970}).time_since_epoch()},
+ std::chrono::choose::latest) ==
+ std::chrono::sys_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 21970}).time_since_epoch() + 1h});
+
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::weeks>{}, std::chrono::choose::earliest) ==
+ std::chrono::sys_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 1970}).time_since_epoch() + 1h});
+
+ // Note months and years cannot be streamed; however these functions don't
+ // throw an exception and thus can be used.
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::months>{}, std::chrono::choose::latest) ==
+ std::chrono::sys_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 1970}).time_since_epoch() + 1h});
+
+ assert(tz->to_sys(std::chrono::local_time<std::chrono::years>{}, std::chrono::choose::earliest) ==
+ std::chrono::sys_time<std::chrono::seconds>{
+ (std::chrono::sys_days{std::chrono::January / 1 / 1970}).time_since_epoch() + 1h});
+}
+
+// Tests non-existent conversions.
+static void test_nonexistent() {
+ using namespace std::literals::chrono_literals;
+
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Europe/Berlin");
+
+ // Z Europe/Berlin 0:53:28 - LMT 1893 Ap
+ // ...
+ // 1 DE CE%sT 1980
+ // 1 E CE%sT
+ //
+ // ...
+ // R E 1981 ma - Mar lastSu 1u 1 S
+ // R E 1996 ma - O lastSu 1u 0 -
+
+ // Pick an historic date where it's well known what the time zone rules were.
+ // This makes it unlikely updates to the database change these rules.
+ std::chrono::local_time<std::chrono::seconds> time{
+ (std::chrono::sys_days{std::chrono::March / 30 / 1986} + 2h + 30min).time_since_epoch()};
+
+ std::chrono::sys_seconds expected{time.time_since_epoch() - 1h};
+
+ // Validates whether the database did not change.
+ std::chrono::local_info info = tz->get_info(time);
+ assert(info.result == std::chrono::local_info::nonexistent);
+
+ assert(tz->to_sys(time + 0ns, std::chrono::choose::earliest) == expected);
+ assert(tz->to_sys(time + 0us, std::chrono::choose::latest) == expected);
+ assert(tz->to_sys(time + 0ms, std::chrono::choose::earliest) == expected);
+ assert(tz->to_sys(time + 0s, std::chrono::choose::latest) == expected);
+}
+
+// Tests ambiguous conversions.
+static void test_ambiguous() {
+ using namespace std::literals::chrono_literals;
+
+ const std::chrono::time_zone* tz = std::chrono::locate_zone("Europe/Berlin");
+
+ // Z Europe/Berlin 0:53:28 - LMT 1893 Ap
+ // ...
+ // 1 DE CE%sT 1980
+ // 1 E CE%sT
+ //
+ // ...
+ // R E 1981 ma - Mar lastSu 1u 1 S
+ // R E 1996 ma - O lastSu 1u 0 -
+
+ // Pick an historic date where it's well known what the time zone rules were.
+ // This makes it unlikely updates to the database change these rules.
+ std::chrono::local_time<std::chrono::seconds> time{
+ (std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h + 30min).time_since_epoch()};
+
+ std::chrono::sys_seconds earlier{time.time_since_epoch() - 2h};
+ std::chrono::sys_seconds later{time.time_since_epoch() - 1h};
+
+ // Validates whether the database did not change.
+ std::chrono::local_info info = tz->get_info(time);
+ assert(info.result == std::chrono::local_info::ambiguous);
+
+ assert(tz->to_sys(time + 0ns, std::chrono::choose::earliest) == earlier);
+ assert(tz->to_sys(time + 0us, std::chrono::choose::latest) == later);
+ assert(tz->to_sys(time + 0ms, std::chrono::choose::earliest) == earlier);
+ assert(tz->to_sys(time + 0s, std::chrono::choose::latest) == later);
+}
+
+// This test does the basic validations of this function. The library function
+// uses `local_info get_info(const local_time<Duration>& tp)` as implementation
+// detail. The get_info function does extensive testing of the data.
+int main(int, char**) {
+ test_unique();
+ test_nonexistent();
+ test_ambiguous();
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtraits/const_time_zone_default_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtraits/const_time_zone_default_zone.pass.cpp
new file mode 100644
index 0000000..c652709
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.zonedtraits/const_time_zone_default_zone.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// template<> struct zoned_traits<const time_zone*>;
+
+// static const time_zone* default_zone();
+
+#include <chrono>
+#include <cassert>
+
+int main(int, char**) {
+ std::same_as<const std::chrono::time_zone*> decltype(auto) tz =
+ std::chrono::zoned_traits<const std::chrono::time_zone*>::default_zone();
+ assert(tz);
+
+  // The time zone "UTC" can be a link, which means tz->name() can be something
+  // different. For example, "Etc/UTC". Instead validate whether the same time
+ // zone is returned by comparing the addresses.
+ const std::chrono::time_zone* expected = std::chrono::locate_zone("UTC");
+ assert(tz == expected);
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtraits/const_time_zone_locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtraits/const_time_zone_locate_zone.pass.cpp
new file mode 100644
index 0000000..1c81844
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.zonedtraits/const_time_zone_locate_zone.pass.cpp
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// template<> struct zoned_traits<const time_zone*>;
+
+// static const time_zone* locate_zone(string_view name);
+
+#include <chrono>
+#include <cassert>
+#include <concepts>
+
+#include "assert_macros.h"
+
+static void test(std::string_view name) {
+ std::same_as<const std::chrono::time_zone*> decltype(auto) tz =
+ std::chrono::zoned_traits<const std::chrono::time_zone*>::locate_zone(name);
+
+ const std::chrono::time_zone* expected = std::chrono::locate_zone(name);
+ assert(tz == expected);
+}
+
+int main(int, char**) {
+ test("UTC");
+ test("Europe/Berlin");
+ test("Asia/Hong_Kong");
+
+ TEST_THROWS_TYPE(std::runtime_error,
+ TEST_IGNORE_NODISCARD std::chrono::zoned_traits<const std::chrono::time_zone*>::locate_zone(
+ "there_is_no_time_zone_with_this_name"));
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.zonedtraits/types.compile.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.zonedtraits/types.compile.pass.cpp
new file mode 100644
index 0000000..6e34107
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.zonedtraits/types.compile.pass.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// template<class T> struct zoned_traits {};
+//
+// A specialization for const time_zone* is provided by the implementation:
+// template<> struct zoned_traits<const time_zone*> { ... }
+
+#include <chrono>
+#include <type_traits>
+
+// This test tests whether non-specialized versions exhibit the expected
+// behavior. (Note these specializations are not really useful.)
+static_assert(std::is_trivial_v<std::chrono::zoned_traits<int>>);
+static_assert(std::is_trivial_v<std::chrono::zoned_traits<float>>);
+static_assert(std::is_trivial_v<std::chrono::zoned_traits<void*>>);
+
+struct foo {};
+static_assert(std::is_empty_v<std::chrono::zoned_traits<foo>>);
+static_assert(std::is_trivial_v<std::chrono::zoned_traits<foo>>);
diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py
index c0b21f7..9dcecaa 100755
--- a/libcxx/utils/generate_escaped_output_table.py
+++ b/libcxx/utils/generate_escaped_output_table.py
@@ -84,7 +84,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
return result
-DATA_ARRAY_TEMPLATE = """
+DATA_ARRAY_TEMPLATE = r"""
/// The entries of the characters to escape in format's debug string.
///
/// Contains the entries for [format.string.escaped]/2.2.1.2.1
diff --git a/libcxx/utils/generate_width_estimation_table.py b/libcxx/utils/generate_width_estimation_table.py
index 2fe5149..f4cce10 100644
--- a/libcxx/utils/generate_width_estimation_table.py
+++ b/libcxx/utils/generate_width_estimation_table.py
@@ -99,7 +99,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
return result
-DATA_ARRAY_TEMPLATE = """
+DATA_ARRAY_TEMPLATE = r"""
/// The entries of the characters with an estimated width of 2.
///
/// Contains the entries for [format.string.std]/12
diff --git a/libunwind/test/floatregister.pass.cpp b/libunwind/test/floatregister.pass.cpp
index 64107e6..ce4481b 100644
--- a/libunwind/test/floatregister.pass.cpp
+++ b/libunwind/test/floatregister.pass.cpp
@@ -11,20 +11,27 @@
// Basic test for float registers number are accepted.
-#include <dlfcn.h>
#include <stdlib.h>
#include <string.h>
#include <unwind.h>
+// Using __attribute__((section("main_func"))) is ELF specific, but then
+// this entire test is marked as requiring Linux, so we should be good.
+//
+// We don't use dladdr() because on musl it's a no-op when statically linked.
+extern char __start_main_func;
+extern char __stop_main_func;
+
_Unwind_Reason_Code frame_handler(struct _Unwind_Context *ctx, void *arg) {
(void)arg;
- Dl_info info = {0, 0, 0, 0};
- // Unwind util the main is reached, above frames depend on the platform and
+ // Unwind until the main is reached, above frames depend on the platform and
// architecture.
- if (dladdr(reinterpret_cast<void *>(_Unwind_GetIP(ctx)), &info) &&
- info.dli_sname && !strcmp("main", info.dli_sname))
+ uintptr_t ip = _Unwind_GetIP(ctx);
+ if (ip >= (uintptr_t)&__start_main_func &&
+ ip < (uintptr_t)&__stop_main_func) {
_Exit(0);
+ }
return _URC_NO_REASON;
}
@@ -45,7 +52,7 @@ __attribute__((noinline)) void foo() {
_Unwind_Backtrace(frame_handler, NULL);
}
-int main() {
+__attribute__((section("main_func"))) int main() {
foo();
return -2;
}
diff --git a/libunwind/test/forceunwind.pass.cpp b/libunwind/test/forceunwind.pass.cpp
index db499d8..344034e 100644
--- a/libunwind/test/forceunwind.pass.cpp
+++ b/libunwind/test/forceunwind.pass.cpp
@@ -17,7 +17,6 @@
#undef NDEBUG
#include <assert.h>
-#include <dlfcn.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
@@ -27,6 +26,13 @@
#include <unistd.h>
#include <unwind.h>
+// Using __attribute__((section("main_func"))) is ELF specific, but then
+// this entire test is marked as requiring Linux, so we should be good.
+//
+// We don't use dladdr() because on musl it's a no-op when statically linked.
+extern char __start_main_func;
+extern char __stop_main_func;
+
void foo();
_Unwind_Exception ex;
@@ -41,14 +47,14 @@ _Unwind_Reason_Code stop(int version, _Unwind_Action actions,
assert(exceptionObject == &ex);
assert(stop_parameter == &foo);
- Dl_info info = {0, 0, 0, 0};
-
- // Unwind util the main is reached, above frames depend on the platform and
+ // Unwind until the main is reached, above frames depend on the platform and
// architecture.
- if (dladdr(reinterpret_cast<void *>(_Unwind_GetIP(context)), &info) &&
- info.dli_sname && !strcmp("main", info.dli_sname)) {
+ uintptr_t ip = _Unwind_GetIP(context);
+ if (ip >= (uintptr_t)&__start_main_func &&
+ ip < (uintptr_t)&__stop_main_func) {
_Exit(0);
}
+
return _URC_NO_REASON;
}
@@ -66,7 +72,7 @@ __attribute__((noinline)) void foo() {
_Unwind_ForcedUnwind(e, stop, (void *)&foo);
}
-int main() {
+__attribute__((section("main_func"))) int main() {
foo();
return -2;
}
diff --git a/libunwind/test/signal_unwind.pass.cpp b/libunwind/test/signal_unwind.pass.cpp
index 954a5d4..1c15664 100644
--- a/libunwind/test/signal_unwind.pass.cpp
+++ b/libunwind/test/signal_unwind.pass.cpp
@@ -13,9 +13,15 @@
// TODO: Figure out why this fails with Memory Sanitizer.
// XFAIL: msan
+// Note: this test fails on musl because:
+//
+// (a) musl disables emission of unwind information for its build, and
+// (b) musl's signal trampolines don't include unwind information
+//
+// XFAIL: target={{.*}}-musl
+
#undef NDEBUG
#include <assert.h>
-#include <dlfcn.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
@@ -24,16 +30,24 @@
#include <unistd.h>
#include <unwind.h>
+// Using __attribute__((section("main_func"))) is ELF specific, but then
+// this entire test is marked as requiring Linux, so we should be good.
+//
+// We don't use dladdr() because on musl it's a no-op when statically linked.
+extern char __start_main_func;
+extern char __stop_main_func;
+
_Unwind_Reason_Code frame_handler(struct _Unwind_Context* ctx, void* arg) {
(void)arg;
- Dl_info info = { 0, 0, 0, 0 };
- // Unwind util the main is reached, above frames depend on the platform and
+ // Unwind until the main is reached, above frames depend on the platform and
// architecture.
- if (dladdr(reinterpret_cast<void *>(_Unwind_GetIP(ctx)), &info) &&
- info.dli_sname && !strcmp("main", info.dli_sname)) {
+ uintptr_t ip = _Unwind_GetIP(ctx);
+ if (ip >= (uintptr_t)&__start_main_func &&
+ ip < (uintptr_t)&__stop_main_func) {
_Exit(0);
}
+
return _URC_NO_REASON;
}
@@ -43,7 +57,7 @@ void signal_handler(int signum) {
_Exit(-1);
}
-int main(int, char**) {
+__attribute__((section("main_func"))) int main(int, char **) {
signal(SIGUSR1, signal_handler);
kill(getpid(), SIGUSR1);
return -2;
diff --git a/libunwind/test/unwind_leaffunction.pass.cpp b/libunwind/test/unwind_leaffunction.pass.cpp
index 112a596..98de7dc 100644
--- a/libunwind/test/unwind_leaffunction.pass.cpp
+++ b/libunwind/test/unwind_leaffunction.pass.cpp
@@ -13,9 +13,15 @@
// TODO: Figure out why this fails with Memory Sanitizer.
// XFAIL: msan
+// Note: this test fails on musl because:
+//
+// (a) musl disables emission of unwind information for its build, and
+// (b) musl's signal trampolines don't include unwind information
+//
+// XFAIL: target={{.*}}-musl
+
#undef NDEBUG
#include <assert.h>
-#include <dlfcn.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
@@ -24,16 +30,24 @@
#include <unistd.h>
#include <unwind.h>
+// Using __attribute__((section("main_func"))) is ELF specific, but then
+// this entire test is marked as requiring Linux, so we should be good.
+//
+// We don't use dladdr() because on musl it's a no-op when statically linked.
+extern char __start_main_func;
+extern char __stop_main_func;
+
_Unwind_Reason_Code frame_handler(struct _Unwind_Context* ctx, void* arg) {
(void)arg;
- Dl_info info = { 0, 0, 0, 0 };
// Unwind until the main is reached, above frames depend on the platform and
// architecture.
- if (dladdr(reinterpret_cast<void *>(_Unwind_GetIP(ctx)), &info) &&
- info.dli_sname && !strcmp("main", info.dli_sname)) {
+ uintptr_t ip = _Unwind_GetIP(ctx);
+ if (ip >= (uintptr_t)&__start_main_func &&
+ ip < (uintptr_t)&__stop_main_func) {
_Exit(0);
}
+
return _URC_NO_REASON;
}
@@ -56,7 +70,7 @@ __attribute__((noinline)) void crashing_leaf_func(int do_trap) {
__builtin_trap();
}
-int main(int, char**) {
+__attribute__((section("main_func"))) int main(int, char **) {
signal(SIGTRAP, signal_handler);
signal(SIGILL, signal_handler);
crashing_leaf_func(1);
diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h
index ca69fb2..5ef46f5 100644
--- a/lld/COFF/Symbols.h
+++ b/lld/COFF/Symbols.h
@@ -106,7 +106,7 @@ protected:
"If the name is empty, the Symbol must be a DefinedCOFF.");
}
- const unsigned symbolKind : 8;
+ unsigned symbolKind : 8;
unsigned isExternal : 1;
public:
diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index cf5c238..47e6ea1 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -429,19 +429,6 @@ void AArch64::relocate(uint8_t *loc, const Relocation &rel,
case R_AARCH64_PREL64:
write64(loc, val);
break;
- case R_AARCH64_AUTH_ABS64:
- // If val is wider than 32 bits, the relocation must have been moved from
- // .relr.auth.dyn to .rela.dyn, and the addend write is not needed.
- //
- // If val fits in 32 bits, we have two potential scenarios:
- // * True RELR: Write the 32-bit `val`.
- // * RELA: Even if the value now fits in 32 bits, it might have been
- // converted from RELR during an iteration in
- // finalizeAddressDependentContent(). Writing the value is harmless
- // because dynamic linking ignores it.
- if (isInt<32>(val))
- write32(loc, val);
- break;
case R_AARCH64_ADD_ABS_LO12_NC:
or32AArch64Imm(loc, val);
break;
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 9021bbd..e6a0a5b 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -844,6 +844,16 @@ void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
this->sections[i] =
createInputSection(i, sec, check(obj.getSectionName(sec, shstrtab)));
break;
+ case SHT_LLVM_LTO:
+ // Discard .llvm.lto in a relocatable link that does not use the bitcode.
+ // The concatenated output does not properly reflect the linking
+ // semantics. In addition, since we do not use the bitcode wrapper format,
+ // the concatenated raw bitcode would be invalid.
+ if (config->relocatable && !config->fatLTOObjects) {
+ sections[i] = &InputSection::discarded;
+ break;
+ }
+ [[fallthrough]];
default:
this->sections[i] =
createInputSection(i, sec, check(obj.getSectionName(sec, shstrtab)));
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index d1017468..7d91b02 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -57,7 +57,7 @@ def Bstatic: F<"Bstatic">, HelpText<"Do not link against shared libraries">;
def build_id: J<"build-id=">, HelpText<"Generate build ID note">,
MetaVarName<"[fast,md5,sha1,uuid,0x<hexstring>]">;
-def : F<"build-id">, Alias<build_id>, AliasArgs<["fast"]>, HelpText<"Alias for --build-id=fast">;
+def : F<"build-id">, Alias<build_id>, AliasArgs<["sha1"]>, HelpText<"Alias for --build-id=sha1">;
defm check_sections: B<"check-sections",
"Check section addresses for overlaps (default)",
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 2c02c2e..04db413 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -898,9 +898,9 @@ static void addRelativeReloc(InputSectionBase &isec, uint64_t offsetInSec,
isec.addReloc({expr, type, offsetInSec, addend, &sym});
if (shard)
part.relrDyn->relocsVec[parallel::getThreadIndex()].push_back(
- {&isec, isec.relocs().size() - 1});
+ {&isec, offsetInSec});
else
- part.relrDyn->relocs.push_back({&isec, isec.relocs().size() - 1});
+ part.relrDyn->relocs.push_back({&isec, offsetInSec});
return;
}
part.relaDyn->addRelativeReloc<shard>(target->relativeRel, isec, offsetInSec,
@@ -1154,12 +1154,6 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset,
// relative relocation. Use a symbolic relocation instead.
if (sym.isPreemptible) {
part.relaDyn->addSymbolReloc(type, *sec, offset, sym, addend, type);
- } else if (part.relrAuthDyn && sec->addralign >= 2 && offset % 2 == 0) {
- // When symbol values are determined in
- // finalizeAddressDependentContent, some .relr.auth.dyn relocations
- // may be moved to .rela.dyn.
- sec->addReloc({expr, type, offset, addend, &sym});
- part.relrAuthDyn->relocs.push_back({sec, sec->relocs().size() - 1});
} else {
part.relaDyn->addReloc({R_AARCH64_AUTH_RELATIVE, sec, offset,
DynamicReloc::AddendOnlyWithTargetVA, sym,
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index ad28028..cc423d15 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -1420,12 +1420,6 @@ DynamicSection<ELFT>::computeContents() {
addInt(config->useAndroidRelrTags ? DT_ANDROID_RELRENT : DT_RELRENT,
sizeof(Elf_Relr));
}
- if (part.relrAuthDyn && part.relrAuthDyn->getParent() &&
- !part.relrAuthDyn->relocs.empty()) {
- addInSec(DT_AARCH64_AUTH_RELR, *part.relrAuthDyn);
- addInt(DT_AARCH64_AUTH_RELRSZ, part.relrAuthDyn->getParent()->size);
- addInt(DT_AARCH64_AUTH_RELRENT, sizeof(Elf_Relr));
- }
if (isMain && in.relaPlt->isNeeded()) {
addInSec(DT_JMPREL, *in.relaPlt);
entries.emplace_back(DT_PLTRELSZ, addPltRelSz());
@@ -1737,13 +1731,10 @@ template <class ELFT> void RelocationSection<ELFT>::writeTo(uint8_t *buf) {
}
}
-RelrBaseSection::RelrBaseSection(unsigned concurrency, bool isAArch64Auth)
- : SyntheticSection(
- SHF_ALLOC,
- isAArch64Auth
- ? SHT_AARCH64_AUTH_RELR
- : (config->useAndroidRelrTags ? SHT_ANDROID_RELR : SHT_RELR),
- config->wordsize, isAArch64Auth ? ".relr.auth.dyn" : ".relr.dyn"),
+RelrBaseSection::RelrBaseSection(unsigned concurrency)
+ : SyntheticSection(SHF_ALLOC,
+ config->useAndroidRelrTags ? SHT_ANDROID_RELR : SHT_RELR,
+ config->wordsize, ".relr.dyn"),
relocsVec(concurrency) {}
void RelrBaseSection::mergeRels() {
@@ -2011,8 +2002,8 @@ bool AndroidPackedRelocationSection<ELFT>::updateAllocSize() {
}
template <class ELFT>
-RelrSection<ELFT>::RelrSection(unsigned concurrency, bool isAArch64Auth)
- : RelrBaseSection(concurrency, isAArch64Auth) {
+RelrSection<ELFT>::RelrSection(unsigned concurrency)
+ : RelrBaseSection(concurrency) {
this->entsize = config->wordsize;
}
@@ -4783,9 +4774,6 @@ template <class ELFT> void elf::createSyntheticSections() {
if (config->relrPackDynRelocs) {
part.relrDyn = std::make_unique<RelrSection<ELFT>>(threadCount);
add(*part.relrDyn);
- part.relrAuthDyn = std::make_unique<RelrSection<ELFT>>(
- threadCount, /*isAArch64Auth=*/true);
- add(*part.relrAuthDyn);
}
if (!config->relocatable) {
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index eaa09ea..34949025 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -548,9 +548,7 @@ public:
static bool classof(const SectionBase *d) {
return SyntheticSection::classof(d) &&
(d->type == llvm::ELF::SHT_RELA || d->type == llvm::ELF::SHT_REL ||
- d->type == llvm::ELF::SHT_RELR ||
- (d->type == llvm::ELF::SHT_AARCH64_AUTH_RELR &&
- config->emachine == llvm::ELF::EM_AARCH64));
+ d->type == llvm::ELF::SHT_RELR);
}
int32_t dynamicTag, sizeDynamicTag;
SmallVector<DynamicReloc, 0> relocs;
@@ -598,17 +596,15 @@ private:
};
struct RelativeReloc {
- uint64_t getOffset() const {
- return inputSec->getVA(inputSec->relocs()[relocIdx].offset);
- }
+ uint64_t getOffset() const { return inputSec->getVA(offsetInSec); }
const InputSectionBase *inputSec;
- size_t relocIdx;
+ uint64_t offsetInSec;
};
class RelrBaseSection : public SyntheticSection {
public:
- RelrBaseSection(unsigned concurrency, bool isAArch64Auth = false);
+ RelrBaseSection(unsigned concurrency);
void mergeRels();
bool isNeeded() const override {
return !relocs.empty() ||
@@ -626,7 +622,7 @@ template <class ELFT> class RelrSection final : public RelrBaseSection {
using Elf_Relr = typename ELFT::Relr;
public:
- RelrSection(unsigned concurrency, bool isAArch64Auth = false);
+ RelrSection(unsigned concurrency);
bool updateAllocSize() override;
size_t getSize() const override { return relrRelocs.size() * this->entsize; }
@@ -1464,7 +1460,6 @@ struct Partition {
std::unique_ptr<PackageMetadataNote> packageMetadataNote;
std::unique_ptr<RelocationBaseSection> relaDyn;
std::unique_ptr<RelrBaseSection> relrDyn;
- std::unique_ptr<RelrBaseSection> relrAuthDyn;
std::unique_ptr<VersionDefinitionSection> verDef;
std::unique_ptr<SyntheticSection> verNeed;
std::unique_ptr<VersionTableSection> verSym;
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 0aceb94..640cb2a 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1458,32 +1458,9 @@ template <class ELFT> void Writer<ELFT>::finalizeAddressDependentContent() {
in.mipsGot->updateAllocSize();
for (Partition &part : partitions) {
- // The R_AARCH64_AUTH_RELATIVE has a smaller addend field as bits [63:32]
- // encode the signing schema. We've put relocations in .relr.auth.dyn
- // during RelocationScanner::processAux, but the target VA for some of
- // them might be wider than 32 bits. We can only know the final VA at this
- // point, so move relocations with large values from .relr.auth.dyn to
- // .rela.dyn. See also AArch64::relocate.
- if (part.relrAuthDyn) {
- auto it = llvm::remove_if(
- part.relrAuthDyn->relocs, [&part](const RelativeReloc &elem) {
- const Relocation &reloc = elem.inputSec->relocs()[elem.relocIdx];
- if (isInt<32>(reloc.sym->getVA(reloc.addend)))
- return false;
- part.relaDyn->addReloc({R_AARCH64_AUTH_RELATIVE, elem.inputSec,
- reloc.offset,
- DynamicReloc::AddendOnlyWithTargetVA,
- *reloc.sym, reloc.addend, R_ABS});
- return true;
- });
- changed |= (it != part.relrAuthDyn->relocs.end());
- part.relrAuthDyn->relocs.erase(it, part.relrAuthDyn->relocs.end());
- }
changed |= part.relaDyn->updateAllocSize();
if (part.relrDyn)
changed |= part.relrDyn->updateAllocSize();
- if (part.relrAuthDyn)
- changed |= part.relrAuthDyn->updateAllocSize();
if (part.memtagGlobalDescriptors)
changed |= part.memtagGlobalDescriptors->updateAllocSize();
}
@@ -1647,14 +1624,6 @@ static void removeUnusedSyntheticSections() {
auto *sec = cast<SyntheticSection>(s);
if (sec->getParent() && sec->isNeeded())
return false;
- // .relr.auth.dyn relocations may be moved to .rela.dyn in
- // finalizeAddressDependentContent, making .rela.dyn no longer empty.
- // Conservatively keep .rela.dyn. .relr.auth.dyn can be made empty, but
- // we would fail to remove it here.
- if (config->emachine == EM_AARCH64 && config->relrPackDynRelocs)
- if (auto *relSec = dyn_cast<RelocationBaseSection>(sec))
- if (relSec == mainPart->relaDyn.get())
- return false;
unused.insert(sec);
return true;
});
@@ -1967,10 +1936,6 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
part.relrDyn->mergeRels();
finalizeSynthetic(part.relrDyn.get());
}
- if (part.relrAuthDyn) {
- part.relrAuthDyn->mergeRels();
- finalizeSynthetic(part.relrAuthDyn.get());
- }
finalizeSynthetic(part.dynSymTab.get());
finalizeSynthetic(part.gnuHashTab.get());
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 79ddc15..12ea6de 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -48,6 +48,9 @@ ELF Improvements
and combine relocation sections if their relocated section group members are
placed to the same output section.
(`#94704 <https://github.com/llvm/llvm-project/pull/94704>`_)
+* ``--build-id`` now defaults to generating a 20-byte digest ("sha1") instead
+ of 8-byte ("fast"). This improves compatibility with RPM packaging tools.
+ (`#93943 <https://github.com/llvm/llvm-project/pull/93943>`_)
Breaking changes
----------------
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index bdc35c0..f9a00b7 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -119,7 +119,7 @@ are calculated from the object contents.
is not intended to be cryptographically secure.
.It Fl -build-id
Synonym for
-.Fl -build-id Ns = Ns Cm fast .
+.Fl -build-id Ns = Ns Cm sha1 .
.It Fl -call-graph-profile-sort Ns = Ns Ar algorithm
.Ar algorithm
may be:
diff --git a/lld/test/ELF/aarch64-reloc-pauth.s b/lld/test/ELF/aarch64-reloc-pauth.s
index f1ce29e..0cfcb16 100644
--- a/lld/test/ELF/aarch64-reloc-pauth.s
+++ b/lld/test/ELF/aarch64-reloc-pauth.s
@@ -1,13 +1,12 @@
# REQUIRES: aarch64
-# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: rm -rf %t
+# RUN: llvm-mc -filetype=obj -triple=aarch64 %p/Inputs/shared2.s -o %t.a.o
+# RUN: ld.lld -shared %t.a.o -soname=so -o %t.a.so
+# RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o
-# RUN: llvm-mc -filetype=obj -triple=aarch64 %p/Inputs/shared2.s -o a.o
-# RUN: ld.lld -shared a.o -soname=so -o a.so
-# RUN: llvm-mc -filetype=obj -triple=aarch64 main.s -o main.o
-
-# RUN: ld.lld -pie main.o a.so -o main
-# RUN: llvm-readobj -r main | FileCheck --check-prefix=UNPACKED %s
+# RUN: ld.lld -pie %t.o %t.a.so -o %t
+# RUN: llvm-readobj -r %t | FileCheck --check-prefix=UNPACKED %s
# UNPACKED: Section ({{.+}}) .rela.dyn {
# UNPACKED-NEXT: 0x30470 R_AARCH64_AUTH_RELATIVE - 0x1
@@ -23,8 +22,8 @@
# UNPACKED-NEXT: 0x304B0 R_AARCH64_AUTH_ABS64 bar2 0x0
# UNPACKED-NEXT: }
-# RUN: ld.lld main.o a.so -o main.nopie
-# RUN: llvm-readobj -r main.nopie | FileCheck --check-prefix=NOPIE %s
+# RUN: ld.lld %t.o %t.a.so -o %t.nopie
+# RUN: llvm-readobj -r %t.nopie | FileCheck --check-prefix=NOPIE %s
# NOPIE: Section ({{.+}}) .rela.dyn {
# NOPIE: 0x230460 R_AARCH64_AUTH_RELATIVE - 0x200001
@@ -40,95 +39,67 @@
# NOPIE-NEXT: 0x2304A0 R_AARCH64_AUTH_ABS64 bar2 0x0
# NOPIE-NEXT: }
-# RUN: ld.lld -pie -z pack-relative-relocs main.o a.so -o main.pie
-# RUN: llvm-readelf -S -d -r -x .test main.pie | FileCheck --check-prefixes=RELR,HEX %s
-
-# RELR: Section Headers:
-# RELR-NEXT: Name Type Address Off Size ES Flg Lk Inf Al
-# RELR: .rela.dyn RELA {{0*}}[[ADDR1:.+]] {{0*}}[[ADDR1]] 000090 18 A 1 0 8
-# RELR: .relr.auth.dyn AARCH64_AUTH_RELR {{0*}}[[ADDR2:.+]] {{0*}}[[ADDR2]] 000018 08 A 0 0 8
-
-# RELR: Dynamic section at offset {{.+}} contains 16 entries
-# RELR: 0x0000000070000012 (AARCH64_AUTH_RELR) 0x[[ADDR2]]
-# RELR-NEXT: 0x0000000070000011 (AARCH64_AUTH_RELRSZ) 24 (bytes)
-# RELR-NEXT: 0x0000000070000013 (AARCH64_AUTH_RELRENT) 8 (bytes)
-
-## Decoded SHT_RELR section is same as UNPACKED,
-## but contains only the relative relocations.
-## Any relative relocations with odd offset or value wider than 32 bits stay in SHT_RELA.
-
-# RELR: Relocation section '.rela.dyn' at offset 0x[[ADDR1]] contains 6 entries:
-# RELR-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend
-# RELR-NEXT: 0000000000030460 0000000000000411 R_AARCH64_AUTH_RELATIVE 123456789a
-# RELR-NEXT: 0000000000030468 0000000000000411 R_AARCH64_AUTH_RELATIVE ffffffedcba98766
-# RELR-NEXT: 0000000000030470 0000000000000411 R_AARCH64_AUTH_RELATIVE 8003043f
-# RELR-NEXT: 0000000000030489 0000000000000411 R_AARCH64_AUTH_RELATIVE 4
-# RELR-NEXT: 0000000000030478 0000000100000244 R_AARCH64_AUTH_ABS64 0000000000000000 zed2 + 1111
-# RELR-NEXT: 0000000000030480 0000000200000244 R_AARCH64_AUTH_ABS64 0000000000000000 bar2 + 0
-# RELR-EMPTY:
-# RELR-NEXT: Relocation section '.relr.auth.dyn' at offset 0x[[ADDR2]] contains 5 entries:
-# RELR-NEXT: Index: Entry Address Symbolic Address
-# RELR-NEXT: 0000: 0000000000030440 0000000000030440 $d.0
-# RELR-NEXT: 0001: 000000000000000f 0000000000030448 $d.0 + 0x8
-# RELR-NEXT: 0000000000030450 $d.0 + 0x10
-# RELR-NEXT: 0000000000030458 $d.0 + 0x18
-# RELR-NEXT: 0002: 0000000000030492 0000000000030492 $d.0 + 0x52
+# RUN: ld.lld -pie %t.o %t.a.so -o %t.pie
+# RUN: llvm-readelf -S -d -r -x .test %t.pie | FileCheck --check-prefixes=PIE,HEX %s
+
+# PIE: Section Headers:
+# PIE-NEXT: Name Type Address Off Size ES Flg Lk Inf Al
+# PIE: .rela.dyn RELA {{0*}}[[#%x,ADDR1:]]
+# PIE-SAME: {{0*}}[[#ADDR1]] 000108 18 A 1 0 8
+
+# PIE: Relocation section '.rela.dyn' at offset 0x[[#ADDR1]] contains 11 entries:
+# PIE-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend
+# PIE-NEXT: 0000000000030470 0000000000000411 R_AARCH64_AUTH_RELATIVE 1
+# PIE-NEXT: 0000000000030478 0000000000000411 R_AARCH64_AUTH_RELATIVE 30472
+# PIE-NEXT: 0000000000030480 0000000000000411 R_AARCH64_AUTH_RELATIVE fffffffffffffffd
+# PIE-NEXT: 0000000000030488 0000000000000411 R_AARCH64_AUTH_RELATIVE 12345678
+# PIE-NEXT: 0000000000030490 0000000000000411 R_AARCH64_AUTH_RELATIVE 123456789a
+# PIE-NEXT: 0000000000030498 0000000000000411 R_AARCH64_AUTH_RELATIVE ffffffedcba98766
+# PIE-NEXT: 00000000000304a0 0000000000000411 R_AARCH64_AUTH_RELATIVE 8003046f
+# PIE-NEXT: 00000000000304b9 0000000000000411 R_AARCH64_AUTH_RELATIVE 4
+# PIE-NEXT: 00000000000304c2 0000000000000411 R_AARCH64_AUTH_RELATIVE 30475
+# PIE-NEXT: 00000000000304a8 0000000100000244 R_AARCH64_AUTH_ABS64 0000000000000000 zed2 + 1111
+# PIE-NEXT: 00000000000304b0 0000000200000244 R_AARCH64_AUTH_ABS64 0000000000000000 bar2 + 0
# HEX: Hex dump of section '.test':
-# HEX-NEXT: 0x00030440 01000000 2a000020 42040300 2b000000
-## ^^^^^^^^ Implicit val = 1 = __ehdr_start + 1
+# HEX-NEXT: 0x00030470 00000000 2a000020 00000000 2b000000
## ^^^^ Discr = 42
## ^^ Key (bits 5..6) = DA
-## ^^^^^^^^ Implicit val = 0x30442 = 0x30440 + 2 = .test + 2
## ^^^^ Discr = 43
## ^^ Key (bits 5..6) = IA
-# HEX-NEXT: 0x00030450 fdffffff 2c000080 78563412 2d000020
-## ^^^^^^^^ Implicit val = -3 = __ehdr_start - 3
+# HEX-NEXT: 0x00030480 00000000 2c000080 00000000 2d000020
## ^^^^ Discr = 44
## ^^ Key (bits 5..6) = IA
## ^^ Addr diversity (bit 7) = true
-## ^^^^^^^^ Implicit val = 0x12345678 = __ehdr_start + 0x12345678
## ^^^^ Discr = 45
## ^^ Key (bits 5..6) = DA
-# HEX-NEXT: 0x00030460 00000000 2e000020 00000000 2f000020
-## ^^^^^^^^ No implicit val (rela reloc due val wider than 32 bits)
+# HEX-NEXT: 0x00030490 00000000 2e000020 00000000 2f000020
## ^^^^ Discr = 46
## ^^ Key (bits 5..6) = DA
-## ^^^^^^^^ No implicit val (rela reloc due to val wider than 32 bits)
## ^^^^ Discr = 47
## ^^ Key (bits 5..6) = DA
-# HEX-NEXT: 0x00030470 00000000 30000020 00000000 31000020
-## ^^^^^^^^ No implicit val (rela reloc due val wider than 32 bits)
+# HEX-NEXT: 0x000304a0 00000000 30000020 00000000 31000020
## ^^^^ Discr = 48
## ^^ Key (bits 5..6) = DA
-## ^^^^^^^^ No implicit val (rela reloc due to a preemptible symbol)
## ^^^^ Discr = 49
## ^^ Key (bits 5..6) = DA
-# HEX-NEXT: 0x00030480 00000000 32000000 77000000 00330000
-## ^^^^^^^^ No implicit val (rela reloc due to a preemptible symbol)
+# HEX-NEXT: 0x000304b0 00000000 32000000 77000000 00330000
## ^^^^ Discr = 50
## ^^ Key (bits 5..6) = IA
-## ^^^^^^ ^^ No implicit val (rela reloc due to odd offset)
## ^^^^ Discr = 51
-# HEX-NEXT: 0x00030490 20774504 03003400 0020{{\ }}
+# HEX-NEXT: 0x000304c0 20770000 00003400 0020{{\ }}
## ^^ Key (bits 5..6) = DA
-## ^^^^ ^^^^ Implicit val = 0x30445 = 0x30440 + 5 = .test + 5
## ^^^^ Discr = 52
## ^^ Key (bits 5..6) = DA
-#--- main.s
-
.section .test, "aw"
.p2align 3
.quad (__ehdr_start + 1)@AUTH(da,42)
.quad (.test + 2)@AUTH(ia,43)
.quad (__ehdr_start - 3)@AUTH(ia,44,addr)
.quad (__ehdr_start + 0x12345678)@AUTH(da,45)
-## Addend wider than 32 bits, not enough room for storing implicitly, would go to rela
.quad (__ehdr_start + 0x123456789A)@AUTH(da,46)
-## Negative addend wider than 32 bits, not enough room for storing implicitly, would go to rela
.quad (__ehdr_start - 0x123456789A)@AUTH(da,47)
-## INT32_MAX plus non-zero .test is wider than 32 bits, not enough room for storing implicitly, would go to rela
.quad (.test + 0x7FFFFFFF)@AUTH(da,48)
.quad (zed2 + 0x1111)@AUTH(da,49)
.quad bar2@AUTH(ia,50)
@@ -136,71 +107,3 @@
.quad (__ehdr_start + 4)@AUTH(da,51)
.byte 0x77
.quad (.test + 5)@AUTH(da,52)
-
-#--- empty-relr.s
-
-## .relr.auth.dyn relocations that do not fit 32 bits are moved to .rela.dyn.
-## In this case .relr.auth.dyn will be made empty, but
-## removeUnusedSyntheticSections fails to remove the section.
-
-# RUN: llvm-mc -filetype=obj -triple=aarch64 empty-relr.s -o empty-relr.o
-# RUN: ld.lld -pie -z pack-relative-relocs empty-relr.o -o empty-relr
-# RUN: llvm-readelf -S -d -r empty-relr | FileCheck --check-prefixes=EMPTY-RELR %s
-
-# EMPTY-RELR: Section Headers:
-# EMPTY-RELR-NEXT: Name Type Address Off Size ES Flg Lk Inf Al
-# EMPTY-RELR: .rela.dyn RELA {{0*}}[[ADDR1:.+]] {{0*}}[[ADDR1]] 000018 18 A 0 0 8
-# EMPTY-RELR: .relr.auth.dyn AARCH64_AUTH_RELR {{0*}}[[ADDR2:.+]] {{0*}}[[ADDR2]] 000000 08 A 0 0 8
-
-# EMPTY-RELR: Dynamic section at offset {{.+}} contains 12 entries
-# EMPTY-RELR-NOT: (AARCH64_AUTH_RELR)
-# EMPTY-RELR-NOT: (AARCH64_AUTH_RELRSZ)
-# EMPTY-RELR-NOT: (AARCH64_AUTH_RELRENT)
-# EMPTY-RELR: 0x0000000000000007 (RELA) 0x[[ADDR1]]
-# EMPTY-RELR-NEXT: 0x0000000000000008 (RELASZ) 24 (bytes)
-# EMPTY-RELR-NEXT: 0x0000000000000009 (RELAENT) 24 (bytes)
-
-# EMPTY-RELR: Relocation section '.rela.dyn' at offset {{.+}} contains 1 entries:
-# EMPTY-RELR-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend
-# EMPTY-RELR-NEXT: 0000000000030320 0000000000000411 R_AARCH64_AUTH_RELATIVE 8003031f
-# EMPTY-RELR-EMPTY:
-# EMPTY-RELR-NEXT: Relocation section '.relr.auth.dyn' at offset {{.+}} contains 0 entries:
-# EMPTY-RELR-NEXT: Index: Entry Address Symbolic Address
-
-.section .test, "aw"
-.p2align 3
-.quad (.test + 0x7FFFFFFF)@AUTH(da,42)
-
-#--- empty-rela.s
-
-## .relr.auth.dyn relocations that do not fit 32 bits are moved to .rela.dyn.
-## If this scenario does not happen, .rela.dyn will remain empty,
-## but removeUnusedSyntheticSections fails to remove the section.
-
-# RUN: llvm-mc -filetype=obj -triple=aarch64 empty-rela.s -o empty-rela.o
-# RUN: ld.lld -pie -z pack-relative-relocs empty-rela.o -o empty-rela
-# RUN: llvm-readelf -S -d -r empty-rela | FileCheck --check-prefixes=EMPTY-RELA %s
-
-# EMPTY-RELA: Section Headers:
-# EMPTY-RELA-NEXT: Name Type Address Off Size ES Flg Lk Inf Al
-# EMPTY-RELA: .rela.dyn RELA {{0*}}[[ADDR1:.+]] {{0*}}[[ADDR1]] 000000 18 A 0 0 8
-# EMPTY-RELA: .relr.auth.dyn AARCH64_AUTH_RELR {{0*}}[[ADDR2:.+]] {{0*}}[[ADDR2]] 000008 08 A 0 0 8
-
-# EMPTY-RELA: Dynamic section at offset {{.+}} contains 12 entries
-# EMPTY-RELA-NOT: (RELR)
-# EMPTY-RELA-NOT: (RELRSZ)
-# EMPTY-RELA-NOT: (RELRENT)
-# EMPTY-RELA: 0x0000000070000012 (AARCH64_AUTH_RELR) 0x[[ADDR2]]
-# EMPTY-RELA-NEXT: 0x0000000070000011 (AARCH64_AUTH_RELRSZ) 8 (bytes)
-# EMPTY-RELA-NEXT: 0x0000000070000013 (AARCH64_AUTH_RELRENT) 8 (bytes)
-
-# EMPTY-RELA: Relocation section '.rela.dyn' at offset {{.+}} contains 0 entries:
-# EMPTY-RELA-NEXT: Offset Info Type Symbol's Value Symbol's Name
-# EMPTY-RELA-EMPTY:
-# EMPTY-RELA-NEXT: Relocation section '.relr.auth.dyn' at offset {{.+}} contains 1 entries:
-# EMPTY-RELA-NEXT: Index: Entry Address Symbolic Address
-# EMPTY-RELA-NEXT: 0000: 0000000000030310 0000000000030310 $d.0
-
-.section .test, "aw"
-.p2align 3
-.quad (.test + 0x12345678)@AUTH(da,42)
diff --git a/lld/test/ELF/build-id.s b/lld/test/ELF/build-id.s
index 8448809..581f600 100644
--- a/lld/test/ELF/build-id.s
+++ b/lld/test/ELF/build-id.s
@@ -6,11 +6,12 @@
# RUN: llvm-readobj -S %t2 | FileCheck -check-prefix=ALIGN %s
# RUN: ld.lld --build-id %t -o %t2
-# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=DEFAULT %s
-# RUN: ld.lld --build-id=fast %t -o %t2
-# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=SHA1 %s
# RUN: ld.lld --build-id %t -o %t2 --threads=1
-# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=SHA1 %s
+
+# RUN: ld.lld --build-id=fast %t -o %t2
+# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=FAST %s
# RUN: ld.lld --build-id=md5 %t -o %t2
# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=MD5 %s
@@ -41,7 +42,7 @@
# RUN: ld.lld --build-id --build-id=none %t -o %t2
# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=NONE %s
# RUN: ld.lld --build-id=none --build-id %t -o %t2
-# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-objdump -s %t2 | FileCheck --check-prefix=SHA1 %s
.globl _start
_start:
@@ -62,10 +63,10 @@ _start:
# ALIGN-NEXT: Info:
# ALIGN-NEXT: AddressAlignment: 4
-# DEFAULT: Contents of section .note.test:
-# DEFAULT: Contents of section .note.gnu.build-id:
-# DEFAULT-NEXT: 04000000 08000000 03000000 474e5500 ............GNU.
-# DEFAULT-NEXT: 630bc2f5 a2584763
+# FAST: Contents of section .note.test:
+# FAST: Contents of section .note.gnu.build-id:
+# FAST-NEXT: 04000000 08000000 03000000 474e5500 ............GNU.
+# FAST-NEXT: 630bc2f5 a2584763
# MD5: Contents of section .note.gnu.build-id:
# MD5-NEXT: 04000000 10000000 03000000 474e5500 ............GNU.
diff --git a/lld/test/ELF/fatlto/fatlto.test b/lld/test/ELF/fatlto/fatlto.test
index ed13708..7ec094d 100644
--- a/lld/test/ELF/fatlto/fatlto.test
+++ b/lld/test/ELF/fatlto/fatlto.test
@@ -8,7 +8,6 @@
; RUN: opt < a-LTO.ll --module-summary -o a-fatLTO.bc
; RUN: llvm-objcopy --add-section=.llvm.lto=a-fatLTO.bc --set-section-flags=.llvm.lto=exclude --set-section-type=.llvm.lto=0x6fff4c0c a-fatLTO.o
-
; RUN: llc main-LTO.ll --filetype=obj -o main-fatLTO.o --relocation-model=pic
; RUN: opt < main-LTO.ll --module-summary -o main-fatLTO.bc
; RUN: llvm-objcopy --add-section=.llvm.lto=main-fatLTO.bc --set-section-flags=.llvm.lto=exclude --set-section-type=.llvm.lto=0x6fff4c0c main-fatLTO.o
@@ -17,11 +16,6 @@
; RUN: llvm-readelf -S main-fatLTO.o | FileCheck --check-prefix=HAS_LLVM_LTO %s
;; Make sure that the section flags are set correctly
-; HA_LLVM_LTO: Name: .llvm.lto
-; HA_LLVM_LTO-NEXT: Type: SHT_LLVM_LTO
-; HA_LLVM_LTO-NEXT: Flags
-; HA_LLVM_LTO-NEXT: SHF_EXCLUDE
-
; HAS_LLVM_LTO: Name Type Address Off Size ES Flg Lk Inf Al
; HAS_LLVM_LTO: .llvm.lto LLVM_LTO {{.*}} 00 WE 0 0 1
@@ -64,16 +58,13 @@
; RUN: ld.lld -o foo-fatLTO.archive a.a main-LTO.bc --fat-lto-objects
; RUN: cmp foo-fatLTO.archive foo-LTO
-;; Test FatLTO works with relocatable links using PIC objects
-;; Currently, with PIC relocatable links, FatLTO sections are treated as
-;; orphan sections and incorrectly concatenated together. This test verifies
-;; the current behavior, but should be fixed to either merge those sections
-;; correctly, or to drop them altogether.
+;; Test FatLTO works with relocatable links using PIC objects, and that
+;; SHT_LLVM_LTO sections are discarded.
; RUN: llvm-ar rcs fatLTO-pic.a a-fatLTO.o main-fatLTO.o
; RUN: llvm-readelf -S fatLTO-pic.a | FileCheck --check-prefix=HAS_LLVM_LTO %s
-; RUN: ld.lld --whole-archive fatLTO-pic.a -r -o fatLTO-pic-reolcatable.o
-; RUN: llvm-readelf -S fatLTO-pic-reolcatable.o | FileCheck --check-prefix=HAS_LLVM_LTO %s
+; RUN: ld.lld --whole-archive fatLTO-pic.a -r -o fatLTO-pic-relocatable.o
+; RUN: llvm-readelf -S fatLTO-pic-relocatable.o | FileCheck --check-prefix=CHECK-NON-LTO-TARGET %s
;--- a-LTO.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/lldb/bindings/interface/SBCommandInterpreterRunOptionsDocstrings.i b/lldb/bindings/interface/SBCommandInterpreterRunOptionsDocstrings.i
index b37da05..a4398d9 100644
--- a/lldb/bindings/interface/SBCommandInterpreterRunOptionsDocstrings.i
+++ b/lldb/bindings/interface/SBCommandInterpreterRunOptionsDocstrings.i
@@ -10,5 +10,8 @@ A default SBCommandInterpreterRunOptions object has:
* PrintResults: true
* PrintErrors: true
* AddToHistory: true
+* AllowRepeats: false
+Interactive debug sessions always allow repeats; the AllowRepeats
+run option only affects non-interactive sessions.
") lldb::SBCommandInterpreterRunOptions;
diff --git a/lldb/include/lldb/API/SBCommandInterpreterRunOptions.h b/lldb/include/lldb/API/SBCommandInterpreterRunOptions.h
index 69b96926..0f248c9 100644
--- a/lldb/include/lldb/API/SBCommandInterpreterRunOptions.h
+++ b/lldb/include/lldb/API/SBCommandInterpreterRunOptions.h
@@ -72,6 +72,14 @@ public:
void SetSpawnThread(bool);
+ bool GetAllowRepeats() const;
+
+ /// By default, RunCommandInterpreter will discard repeats if the
+ /// IOHandler being used is not interactive. Setting AllowRepeats to true
+ /// will override this behavior and always process empty lines in the input
+ /// as a repeat command.
+ void SetAllowRepeats(bool);
+
private:
lldb_private::CommandInterpreterRunOptions *get() const;
diff --git a/lldb/include/lldb/Interpreter/CommandInterpreter.h b/lldb/include/lldb/Interpreter/CommandInterpreter.h
index 8863523..48f6618 100644
--- a/lldb/include/lldb/Interpreter/CommandInterpreter.h
+++ b/lldb/include/lldb/Interpreter/CommandInterpreter.h
@@ -93,15 +93,20 @@ public:
/// \param[in] add_to_history
/// If \b true add the commands to the command history. If \b false, don't
/// add them.
+ /// \param[in] handle_repeats
+ /// If \b true then treat empty lines as repeat commands even if the
+ /// interpreter is non-interactive.
CommandInterpreterRunOptions(LazyBool stop_on_continue,
LazyBool stop_on_error, LazyBool stop_on_crash,
LazyBool echo_commands, LazyBool echo_comments,
LazyBool print_results, LazyBool print_errors,
- LazyBool add_to_history)
+ LazyBool add_to_history,
+ LazyBool handle_repeats)
: m_stop_on_continue(stop_on_continue), m_stop_on_error(stop_on_error),
m_stop_on_crash(stop_on_crash), m_echo_commands(echo_commands),
m_echo_comment_commands(echo_comments), m_print_results(print_results),
- m_print_errors(print_errors), m_add_to_history(add_to_history) {}
+ m_print_errors(print_errors), m_add_to_history(add_to_history),
+ m_allow_repeats(handle_repeats) {}
CommandInterpreterRunOptions() = default;
@@ -183,6 +188,12 @@ public:
m_spawn_thread = spawn_thread ? eLazyBoolYes : eLazyBoolNo;
}
+ bool GetAllowRepeats() const { return DefaultToNo(m_allow_repeats); }
+
+ void SetAllowRepeats(bool allow_repeats) {
+ m_allow_repeats = allow_repeats ? eLazyBoolYes : eLazyBoolNo;
+ }
+
LazyBool m_stop_on_continue = eLazyBoolCalculate;
LazyBool m_stop_on_error = eLazyBoolCalculate;
LazyBool m_stop_on_crash = eLazyBoolCalculate;
@@ -193,6 +204,7 @@ public:
LazyBool m_add_to_history = eLazyBoolCalculate;
LazyBool m_auto_handle_events;
LazyBool m_spawn_thread;
+ LazyBool m_allow_repeats = eLazyBoolCalculate;
private:
static bool DefaultToYes(LazyBool flag) {
diff --git a/lldb/source/API/SBBreakpoint.cpp b/lldb/source/API/SBBreakpoint.cpp
index f1fb6f9..3d90804 100644
--- a/lldb/source/API/SBBreakpoint.cpp
+++ b/lldb/source/API/SBBreakpoint.cpp
@@ -714,7 +714,7 @@ void SBBreakpoint::GetNames(SBStringList &names) {
bkpt_sp->GetTarget().GetAPIMutex());
std::vector<std::string> names_vec;
bkpt_sp->GetNames(names_vec);
- for (std::string name : names_vec) {
+ for (const std::string &name : names_vec) {
names.AppendString(name.c_str());
}
}
diff --git a/lldb/source/API/SBCommandInterpreterRunOptions.cpp b/lldb/source/API/SBCommandInterpreterRunOptions.cpp
index 6c6b2aa..0c7581d 100644
--- a/lldb/source/API/SBCommandInterpreterRunOptions.cpp
+++ b/lldb/source/API/SBCommandInterpreterRunOptions.cpp
@@ -164,6 +164,18 @@ void SBCommandInterpreterRunOptions::SetSpawnThread(bool spawn_thread) {
m_opaque_up->SetSpawnThread(spawn_thread);
}
+bool SBCommandInterpreterRunOptions::GetAllowRepeats() const {
+ LLDB_INSTRUMENT_VA(this);
+
+ return m_opaque_up->GetAllowRepeats();
+}
+
+void SBCommandInterpreterRunOptions::SetAllowRepeats(bool allow_repeats) {
+ LLDB_INSTRUMENT_VA(this, allow_repeats);
+
+ m_opaque_up->SetAllowRepeats(allow_repeats);
+}
+
lldb_private::CommandInterpreterRunOptions *
SBCommandInterpreterRunOptions::get() const {
return m_opaque_up.get();
diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp
index 962ce9b..adb9e64 100644
--- a/lldb/source/API/SBTarget.cpp
+++ b/lldb/source/API/SBTarget.cpp
@@ -1147,7 +1147,7 @@ void SBTarget::GetBreakpointNames(SBStringList &names) {
std::vector<std::string> name_vec;
target_sp->GetBreakpointNames(name_vec);
- for (auto name : name_vec)
+ for (const auto &name : name_vec)
names.AppendString(name.c_str());
}
}
diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp
index ae845e9..dc80d43 100644
--- a/lldb/source/Breakpoint/Breakpoint.cpp
+++ b/lldb/source/Breakpoint/Breakpoint.cpp
@@ -885,7 +885,7 @@ void Breakpoint::GetDescription(Stream *s, lldb::DescriptionLevel level,
s->Printf("Names:");
s->EOL();
s->IndentMore();
- for (std::string name : m_name_list) {
+ for (const std::string &name : m_name_list) {
s->Indent();
s->Printf("%s\n", name.c_str());
}
diff --git a/lldb/source/Breakpoint/BreakpointIDList.cpp b/lldb/source/Breakpoint/BreakpointIDList.cpp
index 97af1d4..5fc9f95 100644
--- a/lldb/source/Breakpoint/BreakpointIDList.cpp
+++ b/lldb/source/Breakpoint/BreakpointIDList.cpp
@@ -259,7 +259,7 @@ llvm::Error BreakpointIDList::FindAndReplaceIDRanges(
if (!names_found.empty()) {
for (BreakpointSP bkpt_sp : target->GetBreakpointList().Breakpoints()) {
- for (std::string name : names_found) {
+ for (const std::string &name : names_found) {
if (bkpt_sp->MatchesName(name.c_str())) {
StreamString canonical_id_str;
BreakpointID::GetCanonicalReference(
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index acd6294..da995de 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -2707,7 +2707,8 @@ enum {
eHandleCommandFlagEchoCommentCommand = (1u << 3),
eHandleCommandFlagPrintResult = (1u << 4),
eHandleCommandFlagPrintErrors = (1u << 5),
- eHandleCommandFlagStopOnCrash = (1u << 6)
+ eHandleCommandFlagStopOnCrash = (1u << 6),
+ eHandleCommandFlagAllowRepeats = (1u << 7)
};
void CommandInterpreter::HandleCommandsFromFile(
@@ -3129,14 +3130,19 @@ void CommandInterpreter::IOHandlerInputComplete(IOHandler &io_handler,
return;
const bool is_interactive = io_handler.GetIsInteractive();
- if (!is_interactive) {
+ const bool allow_repeats =
+ io_handler.GetFlags().Test(eHandleCommandFlagAllowRepeats);
+
+ if (!is_interactive && !allow_repeats) {
// When we are not interactive, don't execute blank lines. This will happen
// sourcing a commands file. We don't want blank lines to repeat the
// previous command and cause any errors to occur (like redefining an
// alias, get an error and stop parsing the commands file).
+ // But obey the AllowRepeats flag if the user has set it.
if (line.empty())
return;
-
+ }
+ if (!is_interactive) {
// When using a non-interactive file handle (like when sourcing commands
// from a file) we need to echo the command out so we don't just see the
// command output and no command...
@@ -3388,6 +3394,8 @@ CommandInterpreter::GetIOHandler(bool force_create,
flags |= eHandleCommandFlagPrintResult;
if (options->m_print_errors != eLazyBoolNo)
flags |= eHandleCommandFlagPrintErrors;
+ if (options->m_allow_repeats == eLazyBoolYes)
+ flags |= eHandleCommandFlagAllowRepeats;
} else {
flags = eHandleCommandFlagEchoCommand | eHandleCommandFlagPrintResult |
eHandleCommandFlagPrintErrors;
diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp
index 5ad2f7a..4668c25 100644
--- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp
+++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp
@@ -506,7 +506,7 @@ uint32_t NativeRegisterContextLinux_arm::SetHardwareWatchpoint(
return LLDB_INVALID_INDEX32;
else if (watch_mask <= 0x02)
size = 2;
- else if (watch_mask <= 0x04)
+ else
size = 4;
addr = addr & (~0x03);
diff --git a/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp b/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp
index 7550432..25cee36 100644
--- a/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp
+++ b/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp
@@ -92,9 +92,7 @@ bool StopInfoMachException::DeterminePtrauthFailure(ExecutionContext &exe_ctx) {
Target &target = *exe_ctx.GetTargetPtr();
Process &process = *exe_ctx.GetProcessPtr();
- ABISP abi_sp = process.GetABI();
const ArchSpec &arch = target.GetArchitecture();
- assert(abi_sp && "Missing ABI info");
// Check for a ptrauth-enabled target.
const bool ptrauth_enabled_target =
@@ -110,6 +108,9 @@ bool StopInfoMachException::DeterminePtrauthFailure(ExecutionContext &exe_ctx) {
strm.Printf("Note: Possible pointer authentication failure detected.\n");
};
+ ABISP abi_sp = process.GetABI();
+ assert(abi_sp && "Missing ABI info");
+
// Check if we have a "brk 0xc47x" trap, where the value that failed to
// authenticate is in x16.
Address current_address = current_frame->GetFrameCodeAddress();
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
index 8a47eed..187370e 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
@@ -539,9 +539,8 @@ bool GDBRemoteCommunication::DecompressPacket() {
else if (m_compression_type == CompressionType::ZlibDeflate)
scratchbuf_size = compression_decode_scratch_buffer_size (COMPRESSION_ZLIB);
else if (m_compression_type == CompressionType::LZMA)
- scratchbuf_size = compression_decode_scratch_buffer_size (COMPRESSION_LZMA);
- else if (m_compression_type == CompressionType::LZFSE)
- scratchbuf_size = compression_decode_scratch_buffer_size (COMPRESSION_LZFSE);
+ scratchbuf_size =
+ compression_decode_scratch_buffer_size(COMPRESSION_LZMA);
if (scratchbuf_size > 0) {
m_decompression_scratch = (void*) malloc (scratchbuf_size);
m_decompression_scratch_type = m_compression_type;
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/Makefile
index 6914024..3e5da0e 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/Makefile
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/Makefile
@@ -1,4 +1,12 @@
CXX_SOURCES := main.cpp
CFLAGS_EXTRAS := -std=c++20
+ifeq "1" "$(USE_LIBSTDCPP)"
+ CFLAGS_EXTRAS += -DUSE_LIBSTDCPP
+endif
+
+ifeq "1" "$(USE_LIBCPP)"
+ CFLAGS_EXTRAS += -DUSE_LIBCPP
+endif
+
include Makefile.rules
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/main.cpp
index 8cb81c3..aaf76e0 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/main.cpp
@@ -1,13 +1,27 @@
+#if defined(USE_LIBSTDCPP)
+#include <bits/c++config.h>
+// glibc++ >= 11 and c++20
+#if defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE >= 11
#include <coroutine>
+#define HAS_CPP_COROUTINES 1
+#endif
+#endif
+
+// libc++ always has 'coroutine' feature.
+#if defined(USE_LIBCPP)
+#include <coroutine>
+#define HAS_CPP_COROUTINES 1
+#endif
bool is_implementation_supported() {
-#ifdef _GLIBCXX_RELEASE
- return _GLIBCXX_RELEASE >= 11;
-#else
+#ifdef HAS_CPP_COROUTINES
return true;
+#else
+ return false;
#endif
}
+#ifdef HAS_CPP_COROUTINES
// `int_generator` is a stripped down, minimal coroutine generator
// type.
struct int_generator {
@@ -39,8 +53,11 @@ int_generator my_generator_func() { co_yield 42; }
// a place to reliably set a breakpoint on.
void empty_function_so_we_can_set_a_breakpoint() {}
+#endif // HAS_CPP_COROUTINES
+
int main() {
bool is_supported = is_implementation_supported();
+#ifdef HAS_CPP_COROUTINES
int_generator gen = my_generator_func();
std::coroutine_handle<> type_erased_hdl = gen.hdl;
std::coroutine_handle<int> incorrectly_typed_hdl =
@@ -48,4 +65,8 @@ int main() {
gen.hdl.resume(); // Break at initial_suspend
gen.hdl.resume(); // Break after co_yield
empty_function_so_we_can_set_a_breakpoint(); // Break at final_suspend
+ return 0;
+#else
+ return 0; // Break at initial_suspend
+#endif // HAS_CPP_COROUTINES
}
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
index 99998b2..6c61d21 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
@@ -1,3 +1,23 @@
-CXX_SOURCES := main.cpp
-
-include Makefile.rules
+CXX_SOURCES := main.cpp
+LD_EXTRAS := -L. -l_d -l_c -l_a -l_b
+
+a.out: lib_b lib_a lib_c lib_d
+
+include Makefile.rules
+
+lib_a: lib_b
+ $(MAKE) -f $(MAKEFILE_RULES) \
+ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \
+ LD_EXTRAS="-L. -l_b"
+
+lib_b:
+ $(MAKE) -f $(MAKEFILE_RULES) \
+ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b
+
+lib_c:
+ $(MAKE) -f $(MAKEFILE_RULES) \
+ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c
+
+lib_d:
+ $(MAKE) -f $(MAKEFILE_RULES) \
+ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py b/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py
index abf761f..47af690 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py
@@ -9,22 +9,47 @@ from lldbsuite.test.lldbtest import *
from lldbsuite.test import lldbutil
+@skipUnlessPlatform(["linux"] + lldbplatformutil.getDarwinOSTriples())
class ModuleLoadedNotifysTestCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
# At least DynamicLoaderDarwin and DynamicLoaderPOSIXDYLD should batch up
# notifications about newly added/removed libraries. Other DynamicLoaders may
# not be written this way.
- @skipUnlessPlatform(["linux"] + lldbplatformutil.getDarwinOSTriples())
def setUp(self):
# Call super's setUp().
TestBase.setUp(self)
# Find the line number to break inside main().
self.line = line_number("main.cpp", "// breakpoint")
+ def setup_test(self, solibs):
+ if lldb.remote_platform:
+ path = lldb.remote_platform.GetWorkingDirectory()
+ for f in solibs:
+ lldbutil.install_to_target(self, self.getBuildArtifact(f))
+ else:
+ path = self.getBuildDir()
+ if self.dylibPath in os.environ:
+ sep = self.platformContext.shlib_path_separator
+ path = os.environ[self.dylibPath] + sep + path
+ self.runCmd(
+ "settings append target.env-vars '{}={}'".format(self.dylibPath, path)
+ )
+ self.default_path = path
+
def test_launch_notifications(self):
"""Test that lldb broadcasts newly loaded libraries in batches."""
+
+ expected_solibs = [
+ "lib_a." + self.platformContext.shlib_extension,
+ "lib_b." + self.platformContext.shlib_extension,
+ "lib_c." + self.platformContext.shlib_extension,
+ "lib_d." + self.platformContext.shlib_extension,
+ ]
+
self.build()
+ self.setup_test(expected_solibs)
+
exe = self.getBuildArtifact("a.out")
self.dbg.SetAsync(False)
@@ -70,6 +95,8 @@ class ModuleLoadedNotifysTestCase(TestBase):
total_modules_added_events = 0
total_modules_removed_events = 0
already_loaded_modules = []
+ max_solibs_per_event = 0
+ max_solib_chunk_per_event = []
while listener.GetNextEvent(event):
if lldb.SBTarget.EventIsTargetEvent(event):
if event.GetType() == lldb.SBTarget.eBroadcastBitModulesLoaded:
@@ -91,12 +118,17 @@ class ModuleLoadedNotifysTestCase(TestBase):
"{} is already loaded".format(module),
)
already_loaded_modules.append(module)
- if self.TraceOn():
- added_files.append(module.GetFileSpec().GetFilename())
+ added_files.append(module.GetFileSpec().GetFilename())
if self.TraceOn():
# print all of the binaries that have been added
print("Loaded files: %s" % (", ".join(added_files)))
+ # We will check the latest biggest chunk of loaded solibs.
+ # We expect all of our solibs in the last chunk of loaded modules.
+ if solib_count >= max_solibs_per_event:
+ max_solib_chunk_per_event = added_files.copy()
+ max_solibs_per_event = solib_count
+
if event.GetType() == lldb.SBTarget.eBroadcastBitModulesUnloaded:
solib_count = lldb.SBTarget.GetNumModulesFromEvent(event)
total_modules_removed_events += 1
@@ -115,9 +147,7 @@ class ModuleLoadedNotifysTestCase(TestBase):
# binaries in batches. Check that we got back more than 1 solib per event.
# In practice on Darwin today, we get back two events for a do-nothing c
# program: a.out and dyld, and then all the rest of the system libraries.
- # On Linux we get events for ld.so, [vdso], the binary and then all libraries.
-
- avg_solibs_added_per_event = round(
- float(total_solibs_added) / float(total_modules_added_events)
- )
- self.assertGreater(avg_solibs_added_per_event, 1)
+ # On Linux we get events for ld.so, [vdso], the binary and then all libraries,
+ # but the different configurations could load a different number of .so modules
+ # per event.
+ self.assertLessEqual(set(expected_solibs), set(max_solib_chunk_per_event))
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp
new file mode 100644
index 0000000..778b46e
--- /dev/null
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp
@@ -0,0 +1,3 @@
+extern "C" int b_function();
+
+extern "C" int a_function() { return b_function(); }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp
new file mode 100644
index 0000000..4f1a403
--- /dev/null
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp
@@ -0,0 +1 @@
+extern "C" int b_function() { return 500; }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp
new file mode 100644
index 0000000..8abd1b1
--- /dev/null
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp
@@ -0,0 +1 @@
+extern "C" int c_function() { return 600; }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp
new file mode 100644
index 0000000..58888a2
--- /dev/null
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp
@@ -0,0 +1 @@
+extern "C" int d_function() { return 700; }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp
index 00130c9..77b38c5 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp
@@ -1,6 +1,16 @@
-#include <stdio.h>
-int main ()
-{
- puts("running"); // breakpoint here
- return 0;
-}
+#include <stdio.h>
+
+extern "C" int a_function();
+extern "C" int c_function();
+extern "C" int b_function();
+extern "C" int d_function();
+
+int main() {
+ a_function();
+ b_function();
+ c_function();
+ d_function();
+
+ puts("running"); // breakpoint here
+ return 0;
+}
diff --git a/lldb/test/API/python_api/interpreter/TestRunCommandInterpreterAPI.py b/lldb/test/API/python_api/interpreter/TestRunCommandInterpreterAPI.py
index af97493..f677b86 100644
--- a/lldb/test/API/python_api/interpreter/TestRunCommandInterpreterAPI.py
+++ b/lldb/test/API/python_api/interpreter/TestRunCommandInterpreterAPI.py
@@ -47,28 +47,66 @@ class CommandRunInterpreterAPICase(TestBase):
TestBase.setUp(self)
self.stdin_path = self.getBuildArtifact("stdin.txt")
+ self.stdout_path = self.getBuildArtifact("stdout.txt")
+
+ def run_commands_string(
+ self, command_string, options=lldb.SBCommandInterpreterRunOptions()
+ ):
+ """Run the commands in command_string through RunCommandInterpreter.
+ Returns (n_errors, quit_requested, has_crashed, result_string)."""
with open(self.stdin_path, "w") as input_handle:
- input_handle.write("nonexistingcommand\nquit")
+ input_handle.write(command_string)
- self.dbg.SetInputFile(open(self.stdin_path, "r"))
+ n_errors = 0
+ quit_requested = False
+ has_crashed = False
- # No need to track the output
- devnull = open(os.devnull, "w")
- self.dbg.SetOutputFile(devnull)
- self.dbg.SetErrorFile(devnull)
+ with open(self.stdin_path, "r") as in_fileH, open(
+ self.stdout_path, "w"
+ ) as out_fileH:
+ self.dbg.SetInputFile(in_fileH)
+
+ self.dbg.SetOutputFile(out_fileH)
+ self.dbg.SetErrorFile(out_fileH)
+
+ n_errors, quit_requested, has_crashed = self.dbg.RunCommandInterpreter(
+ True, False, options, 0, False, False
+ )
+
+ result_string = None
+ with open(self.stdout_path, "r") as out_fileH:
+ result_string = out_fileH.read()
+
+ return (n_errors, quit_requested, has_crashed, result_string)
def test_run_session_with_error_and_quit(self):
"""Run non-existing and quit command returns appropriate values"""
- n_errors, quit_requested, has_crashed = self.dbg.RunCommandInterpreter(
- True, False, lldb.SBCommandInterpreterRunOptions(), 0, False, False
+ n_errors, quit_requested, has_crashed, _ = self.run_commands_string(
+ "nonexistingcommand\nquit\n"
)
-
self.assertGreater(n_errors, 0)
self.assertTrue(quit_requested)
self.assertFalse(has_crashed)
+ def test_allow_repeat(self):
+ """Try auto-repeat of process launch - the command will fail and
+ the auto-repeat will fail because of no auto-repeat."""
+ options = lldb.SBCommandInterpreterRunOptions()
+ options.SetEchoCommands(False)
+ options.SetAllowRepeats(True)
+
+ n_errors, quit_requested, has_crashed, result_str = self.run_commands_string(
+ "process launch\n\n", options
+ )
+ self.assertEqual(n_errors, 2)
+ self.assertFalse(quit_requested)
+ self.assertFalse(has_crashed)
+
+ self.assertIn("invalid target", result_str)
+ self.assertIn("No auto repeat", result_str)
+
class SBCommandInterpreterRunOptionsCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
@@ -86,6 +124,7 @@ class SBCommandInterpreterRunOptionsCase(TestBase):
self.assertTrue(opts.GetPrintResults())
self.assertTrue(opts.GetPrintErrors())
self.assertTrue(opts.GetAddToHistory())
+ self.assertFalse(opts.GetAllowRepeats())
# Invert values
opts.SetStopOnContinue(not opts.GetStopOnContinue())
@@ -95,6 +134,7 @@ class SBCommandInterpreterRunOptionsCase(TestBase):
opts.SetPrintResults(not opts.GetPrintResults())
opts.SetPrintErrors(not opts.GetPrintErrors())
opts.SetAddToHistory(not opts.GetAddToHistory())
+ opts.SetAllowRepeats(not opts.GetAllowRepeats())
# Check the value changed
self.assertTrue(opts.GetStopOnContinue())
@@ -104,3 +144,4 @@ class SBCommandInterpreterRunOptionsCase(TestBase):
self.assertFalse(opts.GetPrintResults())
self.assertFalse(opts.GetPrintErrors())
self.assertFalse(opts.GetAddToHistory())
+ self.assertTrue(opts.GetAllowRepeats())
diff --git a/lldb/tools/debugserver/source/JSON.cpp b/lldb/tools/debugserver/source/JSON.cpp
index 315c52a..5453d85 100644
--- a/lldb/tools/debugserver/source/JSON.cpp
+++ b/lldb/tools/debugserver/source/JSON.cpp
@@ -43,7 +43,7 @@ JSONString::JSONString(const std::string &s)
: JSONValue(JSONValue::Kind::String), m_data(s) {}
void JSONString::Write(std::ostream &s) {
- s << "\"" << json_string_quote_metachars(m_data).c_str() << "\"";
+ s << "\"" << json_string_quote_metachars(m_data) << "\"";
}
uint64_t JSONNumber::GetAsUnsigned() const {
@@ -395,7 +395,7 @@ JSONParser::Token JSONParser::GetToken(std::string &value) {
} else {
error << "error: got exponent character but no exponent digits at "
"offset in float value \""
- << value.c_str() << "\"";
+ << value << "\"";
value = error.str();
return Token::Status;
}
@@ -405,8 +405,7 @@ JSONParser::Token JSONParser::GetToken(std::string &value) {
if (got_frac_digits) {
return Token::Float;
} else {
- error << "error: no digits after decimal point \"" << value.c_str()
- << "\"";
+ error << "error: no digits after decimal point \"" << value << "\"";
value = error.str();
return Token::Status;
}
@@ -417,7 +416,7 @@ JSONParser::Token JSONParser::GetToken(std::string &value) {
// We need at least some integer digits to make an integer
return Token::Integer;
} else {
- error << "error: no digits negate sign \"" << value.c_str() << "\"";
+ error << "error: no digits negate sign \"" << value << "\"";
value = error.str();
return Token::Status;
}
diff --git a/llvm/bindings/ocaml/llvm/llvm.ml b/llvm/bindings/ocaml/llvm/llvm.ml
index 7eeaae4..7bfaf86 100644
--- a/llvm/bindings/ocaml/llvm/llvm.ml
+++ b/llvm/bindings/ocaml/llvm/llvm.ml
@@ -1135,6 +1135,9 @@ external delete_instruction : llvalue -> unit = "llvm_delete_instruction"
external builder : llcontext -> llbuilder = "llvm_builder"
external position_builder : (llbasicblock, llvalue) llpos -> llbuilder -> unit
= "llvm_position_builder"
+external position_builder_before_dbg_records : (llbasicblock, llvalue) llpos ->
+ llbuilder -> unit
+ = "llvm_position_builder_before_dbg_records"
external insertion_block : llbuilder -> llbasicblock = "llvm_insertion_block"
external insert_into_builder : llvalue -> string -> llbuilder -> unit
= "llvm_insert_into_builder"
@@ -1148,6 +1151,8 @@ let builder_before context i = builder_at context (Before i)
let builder_at_end context bb = builder_at context (At_end bb)
let position_before i = position_builder (Before i)
+let position_before_dbg_records i =
+ position_builder_before_dbg_records (Before i)
let position_at_end bb = position_builder (At_end bb)
diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli
index 36cc095..89b894b 100644
--- a/llvm/bindings/ocaml/llvm/llvm.mli
+++ b/llvm/bindings/ocaml/llvm/llvm.mli
@@ -1874,10 +1874,22 @@ val builder_at_end : llcontext -> llbasicblock -> llbuilder
See the constructor for [llvm::LLVMBuilder]. *)
val position_builder : (llbasicblock, llvalue) llpos -> llbuilder -> unit
+(** [position_builder_before_dbg_records ip bb before_dbg_records] moves the
+ instruction builder [bb] to the position [ip], before any debug records
+ there.
+ See the constructor for [llvm::LLVMBuilder]. *)
+val position_builder_before_dbg_records : (llbasicblock, llvalue) llpos ->
+ llbuilder -> unit
+
(** [position_before ins b] moves the instruction builder [b] to before the
instruction [isn]. See the method [llvm::LLVMBuilder::SetInsertPoint]. *)
val position_before : llvalue -> llbuilder -> unit
+(** [position_before_dbg_records ins b] moves the instruction builder [b]
+    to before the instruction [ins] and any debug records attached to it.
+ See the method [llvm::LLVMBuilder::SetInsertPoint]. *)
+val position_before_dbg_records : llvalue -> llbuilder -> unit
+
(** [position_at_end bb b] moves the instruction builder [b] to the end of the
basic block [bb]. See the method [llvm::LLVMBuilder::SetInsertPoint]. *)
val position_at_end : llbasicblock -> llbuilder -> unit
diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
index 26a3ac2..3976a96 100644
--- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
@@ -2005,6 +2005,18 @@ value llvm_builder(value C) {
}
/* (llbasicblock, llvalue) llpos -> llbuilder -> unit */
+value llvm_position_builder_before_dbg_records(value Pos, value B) {
+ if (Tag_val(Pos) == 0) {
+ LLVMBasicBlockRef BB = BasicBlock_val(Field(Pos, 0));
+ LLVMPositionBuilderAtEnd(Builder_val(B), BB);
+ } else {
+ LLVMValueRef I = Value_val(Field(Pos, 0));
+ LLVMPositionBuilderBeforeInstrAndDbgRecords(Builder_val(B), I);
+ }
+ return Val_unit;
+}
+
+/* (llbasicblock, llvalue) llpos -> llbuilder -> unit */
value llvm_position_builder(value Pos, value B) {
if (Tag_val(Pos) == 0) {
LLVMBasicBlockRef BB = BasicBlock_val(Field(Pos, 0));
diff --git a/llvm/cmake/config.guess b/llvm/cmake/config.guess
index f489623..2444ed7 100644
--- a/llvm/cmake/config.guess
+++ b/llvm/cmake/config.guess
@@ -4,7 +4,7 @@
# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
# 2011 Free Software Foundation, Inc.
-timestamp='2011-08-20'
+timestamp='2024-06-07'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
@@ -1028,7 +1028,11 @@ EOF
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
exit ;;
s390:Linux:*:* | s390x:Linux:*:*)
- echo ${UNAME_MACHINE}-ibm-linux
+ if [ "$(grep -Ei 'debian|ubuntu' /etc/lsb-release)" ]; then
+ echo ${UNAME_MACHINE}-linux-gnu
+ else
+ echo ${UNAME_MACHINE}-ibm-linux
+ fi
exit ;;
sh64*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-gnu
diff --git a/llvm/docs/BranchWeightMetadata.rst b/llvm/docs/BranchWeightMetadata.rst
index 522f37c..6220475 100644
--- a/llvm/docs/BranchWeightMetadata.rst
+++ b/llvm/docs/BranchWeightMetadata.rst
@@ -28,11 +28,14 @@ Supported Instructions
Metadata is only assigned to the conditional branches. There are two extra
operands for the true and the false branch.
+We optionally track whether the metadata was added by ``__builtin_expect`` or
+``__builtin_expect_with_probability`` with an optional field ``!"expected"``.
.. code-block:: none
!0 = !{
!"branch_weights",
+ [ !"expected", ]
i32 <TRUE_BRANCH_WEIGHT>,
i32 <FALSE_BRANCH_WEIGHT>
}
@@ -47,6 +50,7 @@ is always case #0).
!0 = !{
!"branch_weights",
+ [ !"expected", ]
i32 <DEFAULT_BRANCH_WEIGHT>
[ , i32 <CASE_BRANCH_WEIGHT> ... ]
}
@@ -60,6 +64,7 @@ Branch weights are assigned to every destination.
!0 = !{
!"branch_weights",
+ [ !"expected", ]
i32 <LABEL_BRANCH_WEIGHT>
[ , i32 <LABEL_BRANCH_WEIGHT> ... ]
}
@@ -75,6 +80,7 @@ block and entry counts which may not be accurate with sampling.
!0 = !{
!"branch_weights",
+ [ !"expected", ]
i32 <CALL_BRANCH_WEIGHT>
}
@@ -95,6 +101,7 @@ is used.
!0 = !{
!"branch_weights",
+ [ !"expected", ]
i32 <INVOKE_NORMAL_WEIGHT>
[ , i32 <INVOKE_UNWIND_WEIGHT> ]
}
diff --git a/llvm/docs/CompileCudaWithLLVM.rst b/llvm/docs/CompileCudaWithLLVM.rst
index 631691e..0371d7a 100644
--- a/llvm/docs/CompileCudaWithLLVM.rst
+++ b/llvm/docs/CompileCudaWithLLVM.rst
@@ -418,6 +418,17 @@ the compiler chooses to inline ``host_only``.
Member functions, including constructors, may be overloaded using H and D
attributes. However, destructors cannot be overloaded.
+Clang Warnings for Host and Device Function Declarations
+--------------------------------------------------------
+
+Clang can emit warnings when it detects that host (H) and device (D) functions are declared or defined with the same signature. These warnings are not enabled by default.
+
+To enable these warnings, use the following compiler flag:
+
+.. code-block:: console
+
+ -Wnvcc-compat
+
Using a Different Class on Host/Device
--------------------------------------
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index e1e1652..8cdb9db 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -214,6 +214,8 @@ Changes to the C API
* ``LLVMConstICmp``
* ``LLVMConstFCmp``
+* Added ``LLVMPositionBuilderBeforeDbgRecords`` and ``LLVMPositionBuilderBeforeInstrAndDbgRecords``. Same as ``LLVMPositionBuilder`` and ``LLVMPositionBuilderBefore`` except the insertion position is set to before the debug records that precede the target instruction. See the `debug info migration guide <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_ for more info. ``LLVMPositionBuilder`` and ``LLVMPositionBuilderBefore`` are unchanged; they insert before the indicated instruction but after any attached debug records.
+
Changes to the CodeGen infrastructure
-------------------------------------
@@ -223,6 +225,13 @@ Changes to the Metadata Info
Changes to the Debug Info
---------------------------------
+* LLVM has switched from using debug intrinsics internally to using debug
+ records by default. This should happen transparently when using the DIBuilder
+ to construct debug variable information, but will require changes for any code
+ that interacts with debug intrinsics directly. Debug intrinsics will only be
+ supported on a best-effort basis from here onwards; for more information, see
+ the `migration docs <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
+
Changes to the LLVM tools
---------------------------------
* llvm-nm and llvm-objdump can now print symbol information from linked
diff --git a/llvm/docs/RemoveDIsDebugInfo.md b/llvm/docs/RemoveDIsDebugInfo.md
index 3cdf63c..56634f7 100644
--- a/llvm/docs/RemoveDIsDebugInfo.md
+++ b/llvm/docs/RemoveDIsDebugInfo.md
@@ -34,9 +34,110 @@ The second matter is that if you transfer sequences of instructions from one pla
For a more in-depth overview of how to update existing code to support debug records, see [the guide below](#how-to-update-existing-code).
+## Textual IR Changes
+
+As we change from using debug intrinsics to debug records, any tools that depend on parsing IR produced by LLVM will need to handle the new format. For the most part, the difference between the printed form of a debug intrinsic call and a debug record is trivial:
+
+1. An extra 2 spaces of indentation are added.
+2. The text `(tail|notail|musttail)? call void @llvm.dbg.<type>` is replaced with `#dbg_<type>`.
+3. The leading `metadata ` is removed from each argument to the intrinsic.
+4. The DILocation changes from being an instruction attachment with the format `!dbg !<Num>`, to being an ordinary argument, i.e. `!<Num>`, that is passed as the final argument to the debug record.
+
+Following these rules, we have this example of a debug intrinsic and the equivalent debug record:
+
+```
+; Debug Intrinsic:
+ call void @llvm.dbg.value(metadata i32 %add, metadata !10, metadata !DIExpression()), !dbg !20
+; Debug Record:
+ #dbg_value(i32 %add, !10, !DIExpression(), !20)
+```
+
+### Test updates
+
+Any tests downstream of the main LLVM repo that test the IR output of LLVM may break as a result of the change to using records. Updating an individual test to expect records instead of intrinsics should be trivial, given the update rules above. Updating many tests may be burdensome however; to update the lit tests in the main repository, the following steps were used:
+
+1. Collect the list of failing lit tests into a single file, `failing-tests.txt`, separated by (and ending with) newlines.
+2. Use the following line to split the failing tests into tests that use update_test_checks and tests that don't:
+ ```
+ $ while IFS= read -r f; do grep -q "Assertions have been autogenerated by" "$f" && echo "$f" >> update-checks-tests.txt || echo "$f" >> manual-tests.txt; done < failing-tests.txt
+ ```
+3. For the tests that use update_test_checks, run the appropriate update_test_checks script - for the main LLVM repo, this was achieved with:
+ ```
+ $ xargs ./llvm/utils/update_test_checks.py --opt-binary ./build/bin/opt < update-checks-tests.txt
+ $ xargs ./llvm/utils/update_cc_test_checks.py --llvm-bin ./build/bin/ < update-checks-tests.txt
+ ```
+4. The remaining tests can be manually updated, although if there is a large number of tests then the following scripts may be useful; firstly, a script used to extract the check-line prefixes from a file:
+ ```
+ $ cat ./get-checks.sh
+ #!/bin/bash
+
+ # Always add CHECK, since it's more effort than it's worth to filter files where
+ # every RUN line uses other check prefixes.
+ # Then detect every instance of "check-prefix(es)=..." and add the
+ # comma-separated arguments as extra checks.
+ for filename in "$@"
+ do
+ echo "$filename,CHECK"
+ allchecks=$(grep -Eo 'check-prefix(es)?[ =][A-Z0-9_,-]+' $filename | sed -E 's/.+[= ]([A-Z0-9_,-]+).*/\1/g; s/,/\n/g')
+ for check in $allchecks; do
+ echo "$filename,$check"
+ done
+ done
+ ```
+ Then a second script to perform the work of actually updating the check-lines in each of the failing tests, with a series of simple substitution patterns:
+ ```
+ $ cat ./substitute-checks.sh
+ #!/bin/bash
+
+ file="$1"
+ check="$2"
+
+ # Any test that explicitly tests debug intrinsic output is not suitable to
+ # update by this script.
+ if grep -q "write-experimental-debuginfo=false" "$file"; then
+ exit 0
+ fi
+
+ sed -i -E -e "
+ /(#|;|\/\/).*$check[A-Z0-9_\-]*:/!b
+ /DIGlobalVariableExpression/b
+ /!llvm.dbg./bpostcall
+ s/((((((no|must)?tail )?call.*)?void )?@)?llvm.)?dbg\.([a-z]+)/#dbg_\7/
+ :postcall
+ /declare #dbg_/d
+ s/metadata //g
+ s/metadata\{/{/g
+ s/DIExpression\(([^)]*)\)\)(,( !dbg)?)?/DIExpression(\1),/
+ /#dbg_/!b
+ s/((\))?(,) )?!dbg (![0-9]+)/\3\4\2/
+ s/((\))?(, ))?!dbg/\3/
+ " "$file"
+ ```
+ Both of these scripts combined can be used on the list in `manual-tests.txt` as follows:
+ ```
+ $ cat manual-tests.txt | xargs ./get-checks.sh | sort | uniq | awk -F ',' '{ system("./substitute-checks.sh " $1 " " $2) }'
+ ```
+ These scripts dealt successfully with the vast majority of checks in `clang/test` and `llvm/test`.
+5. Verify the resulting tests pass, and detect any failing tests:
+ ```
+ $ xargs ./build/bin/llvm-lit -q < failing-tests.txt
+ ********************
+ Failed Tests (5):
+ LLVM :: DebugInfo/Generic/dbg-value-lower-linenos.ll
+ LLVM :: Transforms/HotColdSplit/transfer-debug-info.ll
+ LLVM :: Transforms/ObjCARC/basic.ll
+ LLVM :: Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
+ LLVM :: Transforms/SafeStack/X86/debug-loc2.ll
+
+
+ Total Discovered Tests: 295
+ Failed: 5 (1.69%)
+ ```
+6. Some tests may have failed - the update scripts are simplistic and preserve no context across lines, and so there are cases that they will not handle; the remaining cases must be manually updated (or handled by further scripts).
+
# C-API changes
-All the functions that have been added are temporary and will be deprecated in the future. The intention is that they'll help downstream projects adapt during the transition period.
+Some new functions that have been added are temporary and will be deprecated in the future. The intention is that they'll help downstream projects adapt during the transition period.
```
New functions (all to be deprecated)
@@ -60,8 +161,20 @@ LLVMDIBuilderInsertDeclareBefore # Insert a debug record (new debug info forma
LLVMDIBuilderInsertDeclareAtEnd # Same as above.
LLVMDIBuilderInsertDbgValueBefore # Same as above.
LLVMDIBuilderInsertDbgValueAtEnd # Same as above.
+
+New functions (no plans to deprecate)
+----------------------------------
+LLVMPositionBuilderBeforeDbgRecords # See info below.
+LLVMPositionBuilderBeforeInstrAndDbgRecords # See info below.
```
+`LLVMPositionBuilderBeforeDbgRecords` and `LLVMPositionBuilderBeforeInstrAndDbgRecords` behave the same as `LLVMPositionBuilder` and `LLVMPositionBuilderBefore` except the insertion position is set before the debug records that precede the target instruction. Note that this doesn't mean that debug intrinsics before the chosen instruction are skipped, only debug records (which unlike debug intrinsics are not themselves instructions).
+
+If you don't know which function to call then follow this rule:
+If you are trying to insert at the start of a block, or purposefully skip debug intrinsics to determine the insertion point for any other reason, then call the new functions.
+
+`LLVMPositionBuilder` and `LLVMPositionBuilderBefore` are unchanged. They insert before the indicated instruction but after any attached debug records.
+
# The new "Debug Record" model
Below is a brief overview of the new representation that replaces debug intrinsics; for an instructive guide on updating old code, see [here](#how-to-update-existing-code).
diff --git a/llvm/docs/TableGen/BackGuide.rst b/llvm/docs/TableGen/BackGuide.rst
index e1413c1..60677a6 100644
--- a/llvm/docs/TableGen/BackGuide.rst
+++ b/llvm/docs/TableGen/BackGuide.rst
@@ -761,7 +761,7 @@ over time. The output looks like this.
-------------------- Global Variables (5) --------------------
- AMDGPUBufferIntrinsics = [int_amdgcn_buffer_load_format, ...
+ AMDGPUBufferIntrinsics = [int_amdgcn_s_buffer_load, ...
AMDGPUImageDimAtomicIntrinsics = [int_amdgcn_image_atomic_swap_1d, ...
...
-------------------- Classes (758) --------------------
diff --git a/llvm/docs/TestSuiteGuide.md b/llvm/docs/TestSuiteGuide.md
index 85623da..9d43e1a 100644
--- a/llvm/docs/TestSuiteGuide.md
+++ b/llvm/docs/TestSuiteGuide.md
@@ -71,6 +71,9 @@ MicroBenchmarks/XRay microbenchmarks, you need to add `compiler-rt` to your
PASS: test-suite :: MultiSource/Applications/ALAC/encode/alacconvert-encode.test (2 of 474)
...
```
+**NOTE!** Even if you only want to get the compile-time results (code size, LLVM stats, etc.),
+you need to run the test with the above `llvm-lit` command. In that case, the *results.json* file will
+contain compile-time metrics.
6. Show and compare result files (optional):
diff --git a/llvm/examples/BrainF/BrainF.cpp b/llvm/examples/BrainF/BrainF.cpp
index 1c7cacb..ac01961 100644
--- a/llvm/examples/BrainF/BrainF.cpp
+++ b/llvm/examples/BrainF/BrainF.cpp
@@ -37,7 +37,6 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
-#include "llvm/Support/Casting.h"
#include <cstdlib>
#include <iostream>
diff --git a/llvm/examples/BrainF/BrainFDriver.cpp b/llvm/examples/BrainF/BrainFDriver.cpp
index 6448347..98fa735 100644
--- a/llvm/examples/BrainF/BrainFDriver.cpp
+++ b/llvm/examples/BrainF/BrainFDriver.cpp
@@ -28,7 +28,6 @@
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/ExecutionEngine/MCJIT.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -38,13 +37,11 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 7d6b8c7..d4a10e9 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -3951,9 +3951,28 @@ const unsigned *LLVMGetIndices(LLVMValueRef Inst);
LLVMBuilderRef LLVMCreateBuilderInContext(LLVMContextRef C);
LLVMBuilderRef LLVMCreateBuilder(void);
+/**
+ * Set the builder position before Instr but after any attached debug records,
+ * or if Instr is null set the position to the end of Block.
+ */
void LLVMPositionBuilder(LLVMBuilderRef Builder, LLVMBasicBlockRef Block,
LLVMValueRef Instr);
+/**
+ * Set the builder position before Instr and any attached debug records,
+ * or if Instr is null set the position to the end of Block.
+ */
+void LLVMPositionBuilderBeforeDbgRecords(LLVMBuilderRef Builder,
+ LLVMBasicBlockRef Block,
+ LLVMValueRef Inst);
+/**
+ * Set the builder position before Instr but after any attached debug records.
+ */
void LLVMPositionBuilderBefore(LLVMBuilderRef Builder, LLVMValueRef Instr);
+/**
+ * Set the builder position before Instr and any attached debug records.
+ */
+void LLVMPositionBuilderBeforeInstrAndDbgRecords(LLVMBuilderRef Builder,
+ LLVMValueRef Instr);
void LLVMPositionBuilderAtEnd(LLVMBuilderRef Builder, LLVMBasicBlockRef Block);
LLVMBasicBlockRef LLVMGetInsertBlock(LLVMBuilderRef Builder);
void LLVMClearInsertionPosition(LLVMBuilderRef Builder);
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 44a301e..78faadb 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -964,6 +964,13 @@ public:
return Val;
}
+ /// Factory for Positive and Negative One.
+ ///
+ /// \param Negative True iff the number should be negative.
+ static APFloat getOne(const fltSemantics &Sem, bool Negative = false) {
+ return APFloat(Sem, Negative ? -1 : 1);
+ }
+
/// Factory for Positive and Negative Infinity.
///
/// \param Negative True iff the number should be negative.
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index b2dcdfa..e687254 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -337,7 +337,6 @@ namespace llvm {
// Top-Level Entities
bool parseTopLevelEntities();
- bool finalizeDebugInfoFormat(Module *M);
void dropUnknownMetadataReferences();
bool validateEndOfModule(bool UpgradeDebugInfo);
bool validateEndOfIndex();
diff --git a/llvm/include/llvm/Frontend/Directive/DirectiveBase.td b/llvm/include/llvm/Frontend/Directive/DirectiveBase.td
index ce532e0..707239c 100644
--- a/llvm/include/llvm/Frontend/Directive/DirectiveBase.td
+++ b/llvm/include/llvm/Frontend/Directive/DirectiveBase.td
@@ -156,6 +156,18 @@ def AS_FromLeaves : Association<"FromLeaves"> {} // See below
// The name "AS_FromLeaves" is recognized by TableGen, and there is no enum
// generated for it.
+// Kinds of directive categories.
+class Category<string n> {
+ string name = n; // Name of the enum value in enum class Category.
+}
+
+def CA_Declarative: Category<"Declarative"> {}
+def CA_Executable: Category<"Executable"> {}
+def CA_Informational: Category<"Informational"> {}
+def CA_Meta: Category<"Meta"> {}
+def CA_Subsidiary: Category<"Subsidiary"> {}
+def CA_Utility: Category<"Utility"> {}
+
// Information about a specific directive.
class Directive<string d> {
// Name of the directive. Can be composite directive sepearted by whitespace.
@@ -190,4 +202,7 @@ class Directive<string d> {
// What the directive is associated with.
Association association = AS_FromLeaves;
+
+ // The category of the directive.
+ Category category = ?;
}
diff --git a/llvm/include/llvm/Frontend/OpenACC/ACC.td b/llvm/include/llvm/Frontend/OpenACC/ACC.td
index dfa6a22..cda1d96 100644
--- a/llvm/include/llvm/Frontend/OpenACC/ACC.td
+++ b/llvm/include/llvm/Frontend/OpenACC/ACC.td
@@ -268,6 +268,7 @@ def ACCC_Unknown : Clause<"unknown"> {
// 2.12
def ACC_Atomic : Directive<"atomic"> {
let association = AS_Block;
+ let category = CA_Executable;
}
// 2.6.5
@@ -293,6 +294,7 @@ def ACC_Data : Directive<"data"> {
VersionedClause<ACCC_Present>
];
let association = AS_Block;
+ let category = CA_Executable;
}
// 2.13
@@ -308,6 +310,7 @@ def ACC_Declare : Directive<"declare"> {
VersionedClause<ACCC_Link>
];
let association = AS_None;
+ let category = CA_Declarative;
}
// 2.5.3
@@ -334,6 +337,7 @@ def ACC_Kernels : Directive<"kernels"> {
VersionedClause<ACCC_VectorLength>
];
let association = AS_Block;
+ let category = CA_Executable;
}
// 2.5.1
@@ -363,6 +367,7 @@ def ACC_Parallel : Directive<"parallel"> {
VersionedClause<ACCC_Self>
];
let association = AS_Block;
+ let category = CA_Executable;
}
// 2.5.2
@@ -391,6 +396,7 @@ def ACC_Serial : Directive<"serial"> {
VersionedClause<ACCC_Self>
];
let association = AS_Block;
+ let category = CA_Executable;
}
// 2.9
@@ -411,11 +417,13 @@ def ACC_Loop : Directive<"loop"> {
VersionedClause<ACCC_Seq>
];
let association = AS_Loop;
+ let category = CA_Executable;
}
// 2.10
def ACC_Cache : Directive<"cache"> {
let association = AS_None;
+ let category = CA_Executable;
}
// 2.14.1
@@ -426,6 +434,7 @@ def ACC_Init : Directive<"init"> {
VersionedClause<ACCC_If>
];
let association = AS_None;
+ let category = CA_Executable;
}
// 2.15.1
@@ -442,6 +451,7 @@ def ACC_Routine : Directive<"routine"> {
VersionedClause<ACCC_NoHost>
];
let association = AS_Declaration;
+ let category = CA_Declarative;
}
// 2.14.3
@@ -461,6 +471,7 @@ def ACC_Set : Directive<"set"> {
VersionedClause<ACCC_DeviceType>
];
let association = AS_None;
+ let category = CA_Executable;
}
// 2.14.2
@@ -471,6 +482,7 @@ def ACC_Shutdown : Directive<"shutdown"> {
VersionedClause<ACCC_If>
];
let association = AS_None;
+ let category = CA_Executable;
}
// 2.14.4
@@ -490,6 +502,7 @@ def ACC_Update : Directive<"update"> {
VersionedClause<ACCC_Self>
];
let association = AS_None;
+ let category = CA_Executable;
}
// 2.16.3
@@ -499,6 +512,7 @@ def ACC_Wait : Directive<"wait"> {
VersionedClause<ACCC_If>
];
let association = AS_None;
+ let category = CA_Executable;
}
// 2.14.6
@@ -516,6 +530,7 @@ def ACC_EnterData : Directive<"enter data"> {
VersionedClause<ACCC_Copyin>
];
let association = AS_None;
+ let category = CA_Executable;
}
// 2.14.7
@@ -534,6 +549,7 @@ def ACC_ExitData : Directive<"exit data"> {
VersionedClause<ACCC_Detach>
];
let association = AS_None;
+ let category = CA_Executable;
}
// 2.8
@@ -546,6 +562,7 @@ def ACC_HostData : Directive<"host_data"> {
VersionedClause<ACCC_UseDevice>
];
let association = AS_Block;
+ let category = CA_Executable;
}
// 2.11
@@ -584,6 +601,7 @@ def ACC_KernelsLoop : Directive<"kernels loop"> {
VersionedClause<ACCC_Seq>
];
let leafConstructs = [ACC_Kernels, ACC_Loop];
+ let category = CA_Executable;
}
// 2.11
@@ -623,6 +641,7 @@ def ACC_ParallelLoop : Directive<"parallel loop"> {
VersionedClause<ACCC_Seq>
];
let leafConstructs = [ACC_Parallel, ACC_Loop];
+ let category = CA_Executable;
}
// 2.11
@@ -659,9 +678,11 @@ def ACC_SerialLoop : Directive<"serial loop"> {
VersionedClause<ACCC_Seq>
];
let leafConstructs = [ACC_Serial, ACC_Loop];
+ let category = CA_Executable;
}
def ACC_Unknown : Directive<"unknown"> {
let isDefault = true;
let association = AS_None;
+ let category = CA_Utility;
}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index afb8d5f..12a944e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -493,18 +493,22 @@ def OMP_Allocate : Directive<"allocate"> {
VersionedClause<OMPC_Allocator>,
];
let association = AS_None;
+ let category = CA_Declarative;
}
def OMP_Allocators : Directive<"allocators"> {
let allowedClauses = [
VersionedClause<OMPC_Allocate>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_Assumes : Directive<"assumes"> {
let association = AS_None;
+ let category = CA_Informational;
}
def OMP_EndAssumes : Directive<"end assumes"> {
let association = AS_Delimited;
+ let category = OMP_Assumes.category;
}
def OMP_Atomic : Directive<"atomic"> {
let allowedClauses = [
@@ -525,12 +529,15 @@ def OMP_Atomic : Directive<"atomic"> {
VersionedClause<OMPC_Weak, 51>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_Barrier : Directive<"barrier"> {
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_BeginAssumes : Directive<"begin assumes"> {
let association = AS_Delimited;
+ let category = CA_Informational;
}
def OMP_BeginDeclareTarget : Directive<"begin declare target"> {
let allowedClauses = [
@@ -540,33 +547,40 @@ def OMP_BeginDeclareTarget : Directive<"begin declare target"> {
VersionedClause<OMPC_To>,
];
let association = AS_Delimited;
+ let category = CA_Declarative;
}
def OMP_BeginDeclareVariant : Directive<"begin declare variant"> {
let association = AS_Delimited;
+ let category = CA_Declarative;
}
def OMP_Cancel : Directive<"cancel"> {
let allowedOnceClauses = [
VersionedClause<OMPC_If>,
];
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_CancellationPoint : Directive<"cancellation point"> {
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_Critical : Directive<"critical"> {
let allowedClauses = [
VersionedClause<OMPC_Hint>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_DeclareMapper : Directive<"declare mapper"> {
let allowedClauses = [
VersionedClause<OMPC_Map>,
];
let association = AS_None;
+ let category = CA_Declarative;
}
def OMP_DeclareReduction : Directive<"declare reduction"> {
let association = AS_None;
+ let category = CA_Declarative;
}
def OMP_DeclareSimd : Directive<"declare simd"> {
let allowedClauses = [
@@ -582,6 +596,7 @@ def OMP_DeclareSimd : Directive<"declare simd"> {
VersionedClause<OMPC_Notinbranch>,
];
let association = AS_Declaration;
+ let category = CA_Declarative;
}
def OMP_DeclareTarget : Directive<"declare target"> {
let allowedClauses = [
@@ -594,9 +609,11 @@ def OMP_DeclareTarget : Directive<"declare target"> {
VersionedClause<OMPC_DeviceType, 50>,
];
let association = AS_None;
+ let category = CA_Declarative;
}
def OMP_EndDeclareTarget : Directive<"end declare target"> {
let association = AS_Delimited;
+ let category = OMP_DeclareTarget.category;
}
def OMP_DeclareVariant : Directive<"declare variant"> {
let allowedClauses = [
@@ -607,9 +624,11 @@ def OMP_DeclareVariant : Directive<"declare variant"> {
VersionedClause<OMPC_AppendArgs, 51>,
];
let association = AS_Declaration;
+ let category = CA_Declarative;
}
def OMP_EndDeclareVariant : Directive<"end declare variant"> {
let association = AS_Delimited;
+ let category = OMP_DeclareVariant.category;
}
def OMP_Depobj : Directive<"depobj"> {
let allowedClauses = [
@@ -621,6 +640,7 @@ def OMP_Depobj : Directive<"depobj"> {
VersionedClause<OMPC_Update, 50>,
];
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_dispatch : Directive<"dispatch"> {
let allowedClauses = [
@@ -633,6 +653,7 @@ def OMP_dispatch : Directive<"dispatch"> {
VersionedClause<OMPC_NoWait>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_Distribute : Directive<"distribute"> {
let allowedClauses = [
@@ -646,6 +667,7 @@ def OMP_Distribute : Directive<"distribute"> {
VersionedClause<OMPC_DistSchedule>,
];
let association = AS_Loop;
+ let category = CA_Executable;
}
def OMP_Do : Directive<"do"> {
let allowedClauses = [
@@ -663,6 +685,7 @@ def OMP_Do : Directive<"do"> {
VersionedClause<OMPC_Schedule>,
];
let association = AS_Loop;
+ let category = CA_Executable;
}
def OMP_EndDo : Directive<"end do"> {
let allowedOnceClauses = [
@@ -671,6 +694,7 @@ def OMP_EndDo : Directive<"end do"> {
// Needed for association computation, since OMP_Do has it "from leafConstructs".
let leafConstructs = OMP_Do.leafConstructs;
let association = OMP_Do.association;
+ let category = OMP_Do.category;
}
def OMP_Error : Directive<"error"> {
let allowedClauses = [
@@ -679,6 +703,7 @@ def OMP_Error : Directive<"error"> {
VersionedClause<OMPC_Severity, 51>,
];
let association = AS_None;
+ let category = CA_Utility;
}
def OMP_Flush : Directive<"flush"> {
let allowedOnceClauses = [
@@ -690,6 +715,7 @@ def OMP_Flush : Directive<"flush"> {
VersionedClause<OMPC_Release, 50>,
];
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_For : Directive<"for"> {
let allowedClauses = [
@@ -706,6 +732,7 @@ def OMP_For : Directive<"for"> {
VersionedClause<OMPC_Schedule>,
];
let association = AS_Loop;
+ let category = CA_Executable;
}
def OMP_interop : Directive<"interop"> {
let allowedClauses = [
@@ -717,6 +744,7 @@ def OMP_interop : Directive<"interop"> {
VersionedClause<OMPC_Use>,
];
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_loop : Directive<"loop"> {
let allowedClauses = [
@@ -730,15 +758,18 @@ def OMP_loop : Directive<"loop"> {
VersionedClause<OMPC_Order, 50>,
];
let association = AS_Loop;
+ let category = CA_Executable;
}
def OMP_masked : Directive<"masked"> {
let allowedOnceClauses = [
VersionedClause<OMPC_Filter>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_Master : Directive<"master"> {
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_Metadirective : Directive<"metadirective"> {
let allowedClauses = [
@@ -748,9 +779,11 @@ def OMP_Metadirective : Directive<"metadirective"> {
VersionedClause<OMPC_Default>,
];
let association = AS_None;
+ let category = CA_Meta;
}
def OMP_Nothing : Directive<"nothing"> {
let association = AS_None;
+ let category = CA_Utility;
}
def OMP_Ordered : Directive<"ordered"> {
let allowedClauses = [
@@ -763,6 +796,7 @@ def OMP_Ordered : Directive<"ordered"> {
];
let association = AS_None;
// There is also a block-associated "ordered" directive.
+ let category = CA_Executable;
}
def OMP_Parallel : Directive<"parallel"> {
let allowedClauses = [
@@ -781,6 +815,7 @@ def OMP_Parallel : Directive<"parallel"> {
VersionedClause<OMPC_ProcBind>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_Requires : Directive<"requires"> {
let allowedOnceClauses = [
@@ -799,6 +834,7 @@ def OMP_Requires : Directive<"requires"> {
VersionedClause<OMPC_ReverseOffload, 99>,
];
let association = AS_None;
+ let category = CA_Informational;
}
def OMP_Scan : Directive<"scan"> {
let allowedClauses = [
@@ -806,6 +842,7 @@ def OMP_Scan : Directive<"scan"> {
VersionedClause<OMPC_Inclusive, 50>,
];
let association = AS_Separating;
+ let category = CA_Subsidiary;
}
def OMP_scope : Directive<"scope"> {
let allowedClauses = [
@@ -816,9 +853,11 @@ def OMP_scope : Directive<"scope"> {
VersionedClause<OMPC_NoWait, 51>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_Section : Directive<"section"> {
let association = AS_Separating;
+ let category = CA_Subsidiary;
}
def OMP_Sections : Directive<"sections"> {
let allowedClauses = [
@@ -830,6 +869,7 @@ def OMP_Sections : Directive<"sections"> {
VersionedClause<OMPC_Reduction>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_EndSections : Directive<"end sections"> {
let allowedOnceClauses = [
@@ -837,6 +877,7 @@ def OMP_EndSections : Directive<"end sections"> {
];
let leafConstructs = OMP_Sections.leafConstructs;
let association = OMP_Sections.association;
+ let category = OMP_Sections.category;
}
def OMP_Simd : Directive<"simd"> {
let allowedClauses = [
@@ -856,6 +897,7 @@ def OMP_Simd : Directive<"simd"> {
VersionedClause<OMPC_SimdLen>,
];
let association = AS_Loop;
+ let category = CA_Executable;
}
def OMP_Single : Directive<"single"> {
let allowedClauses = [
@@ -866,6 +908,7 @@ def OMP_Single : Directive<"single"> {
VersionedClause<OMPC_Private>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_EndSingle : Directive<"end single"> {
let allowedClauses = [
@@ -876,6 +919,7 @@ def OMP_EndSingle : Directive<"end single"> {
];
let leafConstructs = OMP_Single.leafConstructs;
let association = OMP_Single.association;
+ let category = OMP_Single.category;
}
def OMP_Target : Directive<"target"> {
let allowedClauses = [
@@ -899,6 +943,7 @@ def OMP_Target : Directive<"target"> {
VersionedClause<OMPC_ThreadLimit, 51>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_TargetData : Directive<"target data"> {
let allowedOnceClauses = [
@@ -911,6 +956,7 @@ def OMP_TargetData : Directive<"target data"> {
VersionedClause<OMPC_UseDevicePtr>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_TargetEnterData : Directive<"target enter data"> {
let allowedClauses = [
@@ -925,6 +971,7 @@ def OMP_TargetEnterData : Directive<"target enter data"> {
VersionedClause<OMPC_Map>,
];
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_TargetExitData : Directive<"target exit data"> {
let allowedClauses = [
@@ -939,6 +986,7 @@ def OMP_TargetExitData : Directive<"target exit data"> {
VersionedClause<OMPC_Map>,
];
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_TargetUpdate : Directive<"target update"> {
let allowedClauses = [
@@ -952,6 +1000,7 @@ def OMP_TargetUpdate : Directive<"target update"> {
VersionedClause<OMPC_NoWait>,
];
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_Task : Directive<"task"> {
let allowedClauses = [
@@ -973,6 +1022,7 @@ def OMP_Task : Directive<"task"> {
VersionedClause<OMPC_Priority>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_TaskGroup : Directive<"taskgroup"> {
let allowedClauses = [
@@ -980,6 +1030,7 @@ def OMP_TaskGroup : Directive<"taskgroup"> {
VersionedClause<OMPC_TaskReduction, 50>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_TaskLoop : Directive<"taskloop"> {
let allowedClauses = [
@@ -1006,6 +1057,7 @@ def OMP_TaskLoop : Directive<"taskloop"> {
VersionedClause<OMPC_NumTasks>,
];
let association = AS_Loop;
+ let category = CA_Executable;
}
def OMP_TaskWait : Directive<"taskwait"> {
let allowedClauses = [
@@ -1013,9 +1065,11 @@ def OMP_TaskWait : Directive<"taskwait"> {
VersionedClause<OMPC_NoWait, 51>,
];
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_TaskYield : Directive<"taskyield"> {
let association = AS_None;
+ let category = CA_Executable;
}
def OMP_Teams : Directive<"teams"> {
let allowedClauses = [
@@ -1033,19 +1087,23 @@ def OMP_Teams : Directive<"teams"> {
VersionedClause<OMPC_ThreadLimit>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_ThreadPrivate : Directive<"threadprivate"> {
let association = AS_None;
+ let category = CA_Declarative;
}
def OMP_Tile : Directive<"tile"> {
let allowedOnceClauses = [
VersionedClause<OMPC_Sizes, 51>,
];
let association = AS_Loop;
+ let category = CA_Executable;
}
def OMP_Unknown : Directive<"unknown"> {
let isDefault = true;
let association = AS_None;
+ let category = CA_Utility;
}
def OMP_Unroll : Directive<"unroll"> {
let allowedOnceClauses = [
@@ -1053,12 +1111,14 @@ def OMP_Unroll : Directive<"unroll"> {
VersionedClause<OMPC_Partial, 51>,
];
let association = AS_Loop;
+ let category = CA_Executable;
}
def OMP_Workshare : Directive<"workshare"> {
let allowedOnceClauses = [
VersionedClause<OMPC_NoWait>,
];
let association = AS_Block;
+ let category = CA_Executable;
}
def OMP_EndWorkshare : Directive<"end workshare"> {
let allowedClauses = [
@@ -1066,6 +1126,7 @@ def OMP_EndWorkshare : Directive<"end workshare"> {
];
let leafConstructs = OMP_Workshare.leafConstructs;
let association = OMP_Workshare.association;
+ let category = OMP_Workshare.category;
}
//===----------------------------------------------------------------------===//
@@ -1097,6 +1158,7 @@ def OMP_DistributeParallelDo : Directive<"distribute parallel do"> {
VersionedClause<OMPC_Schedule>,
];
let leafConstructs = [OMP_Distribute, OMP_Parallel, OMP_Do];
+ let category = CA_Executable;
}
def OMP_DistributeParallelDoSimd : Directive<"distribute parallel do simd"> {
let allowedClauses = [
@@ -1122,6 +1184,7 @@ def OMP_DistributeParallelDoSimd : Directive<"distribute parallel do simd"> {
VersionedClause<OMPC_SimdLen>,
];
let leafConstructs = [OMP_Distribute, OMP_Parallel, OMP_Do, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_DistributeParallelFor : Directive<"distribute parallel for"> {
let allowedClauses = [
@@ -1143,6 +1206,7 @@ def OMP_DistributeParallelFor : Directive<"distribute parallel for"> {
VersionedClause<OMPC_Shared>,
];
let leafConstructs = [OMP_Distribute, OMP_Parallel, OMP_For];
+ let category = CA_Executable;
}
def OMP_DistributeParallelForSimd : Directive<"distribute parallel for simd"> {
let allowedClauses = [
@@ -1169,6 +1233,7 @@ def OMP_DistributeParallelForSimd : Directive<"distribute parallel for simd"> {
VersionedClause<OMPC_SimdLen>,
];
let leafConstructs = [OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_DistributeSimd : Directive<"distribute simd"> {
let allowedClauses = [
@@ -1196,6 +1261,7 @@ def OMP_DistributeSimd : Directive<"distribute simd"> {
VersionedClause<OMPC_SimdLen>,
];
let leafConstructs = [OMP_Distribute, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_DoSimd : Directive<"do simd"> {
let allowedClauses = [
@@ -1217,6 +1283,7 @@ def OMP_DoSimd : Directive<"do simd"> {
VersionedClause<OMPC_SimdLen>,
];
let leafConstructs = [OMP_Do, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_EndDoSimd : Directive<"end do simd"> {
let allowedOnceClauses = [
@@ -1224,6 +1291,7 @@ def OMP_EndDoSimd : Directive<"end do simd"> {
];
let leafConstructs = OMP_DoSimd.leafConstructs;
let association = OMP_DoSimd.association;
+ let category = OMP_DoSimd.category;
}
def OMP_ForSimd : Directive<"for simd"> {
let allowedClauses = [
@@ -1245,6 +1313,7 @@ def OMP_ForSimd : Directive<"for simd"> {
VersionedClause<OMPC_SimdLen>,
];
let leafConstructs = [OMP_For, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_MaskedTaskloop : Directive<"masked taskloop"> {
let allowedClauses = [
@@ -1268,6 +1337,7 @@ def OMP_MaskedTaskloop : Directive<"masked taskloop"> {
VersionedClause<OMPC_Untied>,
];
let leafConstructs = [OMP_masked, OMP_TaskLoop];
+ let category = CA_Executable;
}
def OMP_MaskedTaskloopSimd : Directive<"masked taskloop simd"> {
let allowedClauses = [
@@ -1297,6 +1367,7 @@ def OMP_MaskedTaskloopSimd : Directive<"masked taskloop simd"> {
VersionedClause<OMPC_Untied>,
];
let leafConstructs = [OMP_masked, OMP_TaskLoop, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_MasterTaskloop : Directive<"master taskloop"> {
let allowedClauses = [
@@ -1319,6 +1390,7 @@ def OMP_MasterTaskloop : Directive<"master taskloop"> {
VersionedClause<OMPC_Untied>,
];
let leafConstructs = [OMP_Master, OMP_TaskLoop];
+ let category = CA_Executable;
}
def OMP_MasterTaskloopSimd : Directive<"master taskloop simd"> {
let allowedClauses = [
@@ -1347,6 +1419,7 @@ def OMP_MasterTaskloopSimd : Directive<"master taskloop simd"> {
VersionedClause<OMPC_Untied>,
];
let leafConstructs = [OMP_Master, OMP_TaskLoop, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_ParallelDo : Directive<"parallel do"> {
let allowedClauses = [
@@ -1369,6 +1442,7 @@ def OMP_ParallelDo : Directive<"parallel do"> {
VersionedClause<OMPC_Schedule>,
];
let leafConstructs = [OMP_Parallel, OMP_Do];
+ let category = CA_Executable;
}
def OMP_ParallelDoSimd : Directive<"parallel do simd"> {
let allowedClauses = [
@@ -1396,6 +1470,7 @@ def OMP_ParallelDoSimd : Directive<"parallel do simd"> {
VersionedClause<OMPC_SimdLen>,
];
let leafConstructs = [OMP_Parallel, OMP_Do, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_ParallelFor : Directive<"parallel for"> {
let allowedClauses = [
@@ -1418,6 +1493,7 @@ def OMP_ParallelFor : Directive<"parallel for"> {
VersionedClause<OMPC_Shared>,
];
let leafConstructs = [OMP_Parallel, OMP_For];
+ let category = CA_Executable;
}
def OMP_ParallelForSimd : Directive<"parallel for simd"> {
let allowedClauses = [
@@ -1444,6 +1520,7 @@ def OMP_ParallelForSimd : Directive<"parallel for simd"> {
VersionedClause<OMPC_SimdLen>,
];
let leafConstructs = [OMP_Parallel, OMP_For, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_parallel_loop : Directive<"parallel loop"> {
let allowedClauses = [
@@ -1466,6 +1543,7 @@ def OMP_parallel_loop : Directive<"parallel loop"> {
VersionedClause<OMPC_ProcBind>,
];
let leafConstructs = [OMP_Parallel, OMP_loop];
+ let category = CA_Executable;
}
def OMP_ParallelMasked : Directive<"parallel masked"> {
let allowedClauses = [
@@ -1483,6 +1561,7 @@ def OMP_ParallelMasked : Directive<"parallel masked"> {
VersionedClause<OMPC_Shared>,
];
let leafConstructs = [OMP_Parallel, OMP_masked];
+ let category = CA_Executable;
}
def OMP_ParallelMaskedTaskloop :
Directive<"parallel masked taskloop"> {
@@ -1510,6 +1589,7 @@ def OMP_ParallelMaskedTaskloop :
VersionedClause<OMPC_Untied>,
];
let leafConstructs = [OMP_Parallel, OMP_masked, OMP_TaskLoop];
+ let category = CA_Executable;
}
def OMP_ParallelMaskedTaskloopSimd :
Directive<"parallel masked taskloop simd"> {
@@ -1543,6 +1623,7 @@ def OMP_ParallelMaskedTaskloopSimd :
VersionedClause<OMPC_Untied>,
];
let leafConstructs = [OMP_Parallel, OMP_masked, OMP_TaskLoop, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_ParallelMaster : Directive<"parallel master"> {
let allowedClauses = [
@@ -1559,6 +1640,7 @@ def OMP_ParallelMaster : Directive<"parallel master"> {
VersionedClause<OMPC_Shared>,
];
let leafConstructs = [OMP_Parallel, OMP_Master];
+ let category = CA_Executable;
}
def OMP_ParallelMasterTaskloop :
Directive<"parallel master taskloop"> {
@@ -1585,6 +1667,7 @@ def OMP_ParallelMasterTaskloop :
VersionedClause<OMPC_Untied>,
];
let leafConstructs = [OMP_Parallel, OMP_Master, OMP_TaskLoop];
+ let category = CA_Executable;
}
def OMP_ParallelMasterTaskloopSimd :
Directive<"parallel master taskloop simd"> {
@@ -1617,6 +1700,7 @@ def OMP_ParallelMasterTaskloopSimd :
VersionedClause<OMPC_Untied>,
];
let leafConstructs = [OMP_Parallel, OMP_Master, OMP_TaskLoop, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_ParallelSections : Directive<"parallel sections"> {
let allowedClauses = [
@@ -1636,6 +1720,7 @@ def OMP_ParallelSections : Directive<"parallel sections"> {
VersionedClause<OMPC_NumThreads>,
];
let leafConstructs = [OMP_Parallel, OMP_Sections];
+ let category = CA_Executable;
}
def OMP_ParallelWorkshare : Directive<"parallel workshare"> {
let allowedClauses = [
@@ -1653,6 +1738,7 @@ def OMP_ParallelWorkshare : Directive<"parallel workshare"> {
VersionedClause<OMPC_ProcBind>,
];
let leafConstructs = [OMP_Parallel, OMP_Workshare];
+ let category = CA_Executable;
}
def OMP_TargetParallel : Directive<"target parallel"> {
let allowedClauses = [
@@ -1680,6 +1766,7 @@ def OMP_TargetParallel : Directive<"target parallel"> {
VersionedClause<OMPC_ThreadLimit, 51>,
];
let leafConstructs = [OMP_Target, OMP_Parallel];
+ let category = CA_Executable;
}
def OMP_TargetParallelDo : Directive<"target parallel do"> {
let allowedClauses = [
@@ -1711,6 +1798,7 @@ def OMP_TargetParallelDo : Directive<"target parallel do"> {
VersionedClause<OMPC_Schedule>,
];
let leafConstructs = [OMP_Target, OMP_Parallel, OMP_Do];
+ let category = CA_Executable;
}
def OMP_TargetParallelDoSimd : Directive<"target parallel do simd"> {
let allowedClauses = [
@@ -1743,6 +1831,7 @@ def OMP_TargetParallelDoSimd : Directive<"target parallel do simd"> {
VersionedClause<OMPC_UsesAllocators>,
];
let leafConstructs = [OMP_Target, OMP_Parallel, OMP_Do, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_TargetParallelFor : Directive<"target parallel for"> {
let allowedClauses = [
@@ -1776,6 +1865,7 @@ def OMP_TargetParallelFor : Directive<"target parallel for"> {
VersionedClause<OMPC_ThreadLimit, 51>,
];
let leafConstructs = [OMP_Target, OMP_Parallel, OMP_For];
+ let category = CA_Executable;
}
def OMP_TargetParallelForSimd : Directive<"target parallel for simd"> {
let allowedClauses = [
@@ -1813,6 +1903,7 @@ def OMP_TargetParallelForSimd : Directive<"target parallel for simd"> {
VersionedClause<OMPC_ThreadLimit, 51>,
];
let leafConstructs = [OMP_Target, OMP_Parallel, OMP_For, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_target_parallel_loop : Directive<"target parallel loop"> {
let allowedClauses = [
@@ -1845,6 +1936,7 @@ def OMP_target_parallel_loop : Directive<"target parallel loop"> {
VersionedClause<OMPC_ThreadLimit, 51>,
];
let leafConstructs = [OMP_Target, OMP_Parallel, OMP_loop];
+ let category = CA_Executable;
}
def OMP_TargetSimd : Directive<"target simd"> {
let allowedClauses = [
@@ -1880,6 +1972,7 @@ def OMP_TargetSimd : Directive<"target simd"> {
VersionedClause<OMPC_ThreadLimit, 51>,
];
let leafConstructs = [OMP_Target, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_TargetTeams : Directive<"target teams"> {
let allowedClauses = [
@@ -1907,6 +2000,7 @@ def OMP_TargetTeams : Directive<"target teams"> {
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Target, OMP_Teams];
+ let category = CA_Executable;
}
def OMP_TargetTeamsDistribute : Directive<"target teams distribute"> {
let allowedClauses = [
@@ -1936,6 +2030,7 @@ def OMP_TargetTeamsDistribute : Directive<"target teams distribute"> {
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute];
+ let category = CA_Executable;
}
def OMP_TargetTeamsDistributeParallelDo :
Directive<"target teams distribute parallel do"> {
@@ -1971,6 +2066,7 @@ def OMP_TargetTeamsDistributeParallelDo :
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do];
+ let category = CA_Executable;
}
def OMP_TargetTeamsDistributeParallelDoSimd :
Directive<"target teams distribute parallel do simd"> {
@@ -2010,6 +2106,7 @@ def OMP_TargetTeamsDistributeParallelDoSimd :
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_TargetTeamsDistributeParallelFor :
Directive<"target teams distribute parallel for"> {
@@ -2044,6 +2141,7 @@ def OMP_TargetTeamsDistributeParallelFor :
VersionedClause<OMPC_OMPX_DynCGroupMem>,
];
let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For];
+ let category = CA_Executable;
}
def OMP_TargetTeamsDistributeParallelForSimd :
Directive<"target teams distribute parallel for simd"> {
@@ -2083,6 +2181,7 @@ def OMP_TargetTeamsDistributeParallelForSimd :
VersionedClause<OMPC_OMPX_DynCGroupMem>,
];
let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_TargetTeamsDistributeSimd :
Directive<"target teams distribute simd"> {
@@ -2118,6 +2217,7 @@ def OMP_TargetTeamsDistributeSimd :
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_target_teams_loop : Directive<"target teams loop"> {
let allowedClauses = [
@@ -2148,6 +2248,7 @@ def OMP_target_teams_loop : Directive<"target teams loop"> {
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Target, OMP_Teams, OMP_loop];
+ let category = CA_Executable;
}
def OMP_TaskLoopSimd : Directive<"taskloop simd"> {
let allowedClauses = [
@@ -2180,6 +2281,7 @@ def OMP_TaskLoopSimd : Directive<"taskloop simd"> {
VersionedClause<OMPC_NumTasks>,
];
let leafConstructs = [OMP_TaskLoop, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_TeamsDistribute : Directive<"teams distribute"> {
let allowedClauses = [
@@ -2200,6 +2302,7 @@ def OMP_TeamsDistribute : Directive<"teams distribute"> {
VersionedClause<OMPC_If>,
];
let leafConstructs = [OMP_Teams, OMP_Distribute];
+ let category = CA_Executable;
}
def OMP_TeamsDistributeParallelDo :
Directive<"teams distribute parallel do"> {
@@ -2227,6 +2330,7 @@ def OMP_TeamsDistributeParallelDo :
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do];
+ let category = CA_Executable;
}
def OMP_TeamsDistributeParallelDoSimd :
Directive<"teams distribute parallel do simd"> {
@@ -2256,6 +2360,7 @@ def OMP_TeamsDistributeParallelDoSimd :
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_TeamsDistributeParallelFor :
Directive<"teams distribute parallel for"> {
@@ -2280,6 +2385,7 @@ def OMP_TeamsDistributeParallelFor :
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For];
+ let category = CA_Executable;
}
def OMP_TeamsDistributeParallelForSimd :
Directive<"teams distribute parallel for simd"> {
@@ -2308,6 +2414,7 @@ def OMP_TeamsDistributeParallelForSimd :
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_TeamsDistributeSimd : Directive<"teams distribute simd"> {
let allowedClauses = [
@@ -2334,6 +2441,7 @@ def OMP_TeamsDistributeSimd : Directive<"teams distribute simd"> {
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Simd];
+ let category = CA_Executable;
}
def OMP_teams_loop : Directive<"teams loop"> {
let allowedClauses = [
@@ -2354,4 +2462,5 @@ def OMP_teams_loop : Directive<"teams loop"> {
VersionedClause<OMPC_ThreadLimit>,
];
let leafConstructs = [OMP_Teams, OMP_loop];
+ let category = CA_Executable;
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bfb966d..4db98e1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1083,18 +1083,6 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {
-class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
- [data_ty],
- [llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty], // slc(imm)
- [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<0>;
-def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
-def int_amdgcn_buffer_load : AMDGPUBufferLoad;
-
// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
@@ -1111,25 +1099,12 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
[IntrNoMem, ImmArg<ArgIndex<2>>]>,
AMDGPURsrcIntrinsic<0>;
-class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
- [],
- [data_ty, // vdata(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty], // slc(imm)
- [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1>;
-def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
-def int_amdgcn_buffer_store : AMDGPUBufferStore;
-
-// New buffer intrinsics with separate raw and struct variants. The raw
+// Buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different to the
// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
// and swizzling changes depending on whether idxen is set in the instruction.
-// These new instrinsics also keep the offset and soffset arguments separate as
+// These intrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.
// The versions of these intrinsics that take <4 x i32> arguments are deprecated
@@ -1489,41 +1464,7 @@ def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloa
def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
-// Obsolescent tbuffer intrinsics.
-def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic <
- [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
- [llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // voffset(VGPR)
- llvm_i32_ty, // soffset(SGPR)
- llvm_i32_ty, // offset(imm)
- llvm_i32_ty, // dfmt(imm)
- llvm_i32_ty, // nfmt(imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty], // slc(imm)
- [IntrReadMem,
- ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
- ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<0>;
-
-def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic <
- [],
- [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // voffset(VGPR)
- llvm_i32_ty, // soffset(SGPR)
- llvm_i32_ty, // offset(imm)
- llvm_i32_ty, // dfmt(imm)
- llvm_i32_ty, // nfmt(imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty], // slc(imm)
- [IntrWriteMem, ImmArg<ArgIndex<5>>,
- ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
- ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1>;
-
-// New tbuffer intrinsics, with:
+// tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
@@ -1670,51 +1611,6 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic <
ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1>;
-class AMDGPUBufferAtomic : Intrinsic <
- [llvm_anyint_ty],
- [LLVMMatchType<0>, // vdata(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty], // slc(imm)
- [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1, 0>;
-def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
- [llvm_i32_ty],
- [llvm_i32_ty, // src(VGPR)
- llvm_i32_ty, // cmp(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty], // slc(imm)
- [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<2, 0>;
-
-def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;
-
-class AMDGPUBufferAtomicFP : Intrinsic <
- [llvm_anyfloat_ty],
- [LLVMMatchType<0>, // vdata(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty], // slc(imm)
- [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1, 0>;
-
-// Legacy form of the intrinsic. raw and struct forms should be preferred.
-def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;
-
class AMDGPURawBufferLoadLDS : Intrinsic <
[],
[llvm_v4i32_ty, // rsrc(SGPR)
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 4c4e735..2da154c 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -144,14 +144,14 @@ class RISCVVIntrinsic {
let TargetPrefix = "riscv" in {
// We use anyint here but we only support XLen.
- def int_riscv_vsetvli : Intrinsic<[llvm_anyint_ty],
+ def int_riscv_vsetvli : DefaultAttrsIntrinsic<[llvm_anyint_ty],
/* AVL */ [LLVMMatchType<0>,
/* VSEW */ LLVMMatchType<0>,
/* VLMUL */ LLVMMatchType<0>],
[IntrNoMem,
ImmArg<ArgIndex<1>>,
ImmArg<ArgIndex<2>>]>;
- def int_riscv_vsetvlimax : Intrinsic<[llvm_anyint_ty],
+ def int_riscv_vsetvlimax : DefaultAttrsIntrinsic<[llvm_anyint_ty],
/* VSEW */ [LLVMMatchType<0>,
/* VLMUL */ LLVMMatchType<0>],
[IntrNoMem,
@@ -669,7 +669,7 @@ let TargetPrefix = "riscv" in {
// The destination vector type is the same as first source vector.
// Input: (passthru, vector_in, vector_in/scalar_in, vxrm, vl)
class RISCVSaturatingBinaryAAXUnMaskedRoundingMode
- : Intrinsic<[llvm_anyvector_ty],
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
llvm_anyint_ty, LLVMMatchType<2>],
[ImmArg<ArgIndex<3>>, IntrNoMem]>, RISCVVIntrinsic {
@@ -692,7 +692,7 @@ let TargetPrefix = "riscv" in {
// The destination vector type is the same as first source vector.
// Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vxrm, vl, policy)
class RISCVSaturatingBinaryAAXMaskedRoundingMode
- : Intrinsic<[llvm_anyvector_ty],
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
LLVMMatchType<2>, LLVMMatchType<2>],
diff --git a/llvm/include/llvm/IR/MDBuilder.h b/llvm/include/llvm/IR/MDBuilder.h
index 3265589..e02ec8f 100644
--- a/llvm/include/llvm/IR/MDBuilder.h
+++ b/llvm/include/llvm/IR/MDBuilder.h
@@ -59,7 +59,11 @@ public:
//===------------------------------------------------------------------===//
/// Return metadata containing two branch weights.
- MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight);
+ /// @param TrueWeight the weight of the true branch
+ /// @param FalseWeight the weight of the false branch
+ /// @param IsExpected true if these weights come from __builtin_expect*
+ MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight,
+ bool IsExpected = false);
/// Return metadata containing two branch weights, with significant bias
/// towards `true` destination.
@@ -70,7 +74,10 @@ public:
MDNode *createUnlikelyBranchWeights();
/// Return metadata containing a number of branch weights.
- MDNode *createBranchWeights(ArrayRef<uint32_t> Weights);
+ /// @param Weights the weights of all the branches
+ /// @param IsExpected true if these weights come from __builtin_expect*
+ MDNode *createBranchWeights(ArrayRef<uint32_t> Weights,
+ bool IsExpected = false);
/// Return metadata specifying that a branch or switch is unpredictable.
MDNode *createUnpredictable();
diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h
index 88fbad4..1d7c97d 100644
--- a/llvm/include/llvm/IR/ProfDataUtils.h
+++ b/llvm/include/llvm/IR/ProfDataUtils.h
@@ -55,6 +55,17 @@ MDNode *getBranchWeightMDNode(const Instruction &I);
/// Nullptr otherwise.
MDNode *getValidBranchWeightMDNode(const Instruction &I);
+/// Check if Branch Weight Metadata has an "expected" field from an llvm.expect*
+/// intrinsic
+bool hasBranchWeightOrigin(const Instruction &I);
+
+/// Check if Branch Weight Metadata has an "expected" field from an llvm.expect*
+/// intrinsic
+bool hasBranchWeightOrigin(const MDNode *ProfileData);
+
+/// Return the offset to the first branch weight data
+unsigned getBranchWeightOffset(const MDNode *ProfileData);
+
/// Extract branch weights from MD_prof metadata
///
/// \param ProfileData A pointer to an MDNode.
@@ -111,7 +122,11 @@ bool extractProfTotalWeight(const Instruction &I, uint64_t &TotalWeights);
/// Create a new `branch_weights` metadata node and add or overwrite
/// a `prof` metadata reference to instruction `I`.
-void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights);
+/// \param I the Instruction to set branch weights on.
+/// \param Weights an array of weights to set on instruction I.
+/// \param IsExpected were these weights added from an llvm.expect* intrinsic.
+void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights,
+ bool IsExpected);
/// Scaling the profile data attached to 'I' using the ratio of S/T.
void scaleProfData(Instruction &I, uint64_t S, uint64_t T);
diff --git a/llvm/include/llvm/MC/MCAsmLayout.h b/llvm/include/llvm/MC/MCAsmLayout.h
index 94cfb76..6fbfce7 100644
--- a/llvm/include/llvm/MC/MCAsmLayout.h
+++ b/llvm/include/llvm/MC/MCAsmLayout.h
@@ -31,37 +31,21 @@ class MCAsmLayout {
/// List of sections in layout order.
llvm::SmallVector<MCSection *, 16> SectionOrder;
- /// The last fragment which was laid out, or 0 if nothing has been laid
- /// out. Fragments are always laid out in order, so all fragments with a
- /// lower ordinal will be valid.
- mutable DenseMap<const MCSection *, MCFragment *> LastValidFragment;
-
- /// Make sure that the layout for the given fragment is valid, lazily
- /// computing it if necessary.
+ /// Compute the layout for the section if necessary.
void ensureValid(const MCFragment *F) const;
- /// Is the layout for this fragment valid?
- bool isFragmentValid(const MCFragment *F) const;
-
public:
MCAsmLayout(MCAssembler &Assembler);
/// Get the assembler object this is a layout for.
MCAssembler &getAssembler() const { return Assembler; }
- /// \returns whether the offset of fragment \p F can be obtained via
- /// getFragmentOffset.
- bool canGetFragmentOffset(const MCFragment *F) const;
-
/// Invalidate the fragments starting with F because it has been
/// resized. The fragment's size should have already been updated, but
/// its bundle padding will be recomputed.
void invalidateFragmentsFrom(MCFragment *F);
- /// Perform layout for a single fragment, assuming that the previous
- /// fragment has already been laid out correctly, and the parent section has
- /// been initialized.
- void layoutFragment(MCFragment *Fragment);
+ void layoutBundle(MCFragment *F);
/// \name Section Access (in layout order)
/// @{
diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h
index 3fa5f2f..914c750 100644
--- a/llvm/include/llvm/MC/MCAssembler.h
+++ b/llvm/include/llvm/MC/MCAssembler.h
@@ -203,10 +203,6 @@ private:
/// were adjusted.
bool layoutOnce(MCAsmLayout &Layout);
- /// Perform one layout iteration of the given section and return true
- /// if any offsets were adjusted.
- bool layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec);
-
/// Perform relaxation on a single fragment - returns true if the fragment
/// changes as a result of relaxation.
bool relaxFragment(MCAsmLayout &Layout, MCFragment &F);
diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h
index a9b19dc..67e10c3 100644
--- a/llvm/include/llvm/MC/MCFragment.h
+++ b/llvm/include/llvm/MC/MCFragment.h
@@ -70,9 +70,6 @@ private:
FragmentType Kind;
- /// Whether fragment is being laid out.
- bool IsBeingLaidOut;
-
protected:
bool HasInstructions;
bool LinkerRelaxable = false;
diff --git a/llvm/include/llvm/MC/MCInst.h b/llvm/include/llvm/MC/MCInst.h
index 2bc3108..578b732 100644
--- a/llvm/include/llvm/MC/MCInst.h
+++ b/llvm/include/llvm/MC/MCInst.h
@@ -189,7 +189,7 @@ class MCInst {
unsigned Flags = 0;
SMLoc Loc;
- SmallVector<MCOperand, 10> Operands;
+ SmallVector<MCOperand, 6> Operands;
public:
MCInst() = default;
diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h
index 90bc48e..90effde 100644
--- a/llvm/include/llvm/MC/MCSection.h
+++ b/llvm/include/llvm/MC/MCSection.h
@@ -89,6 +89,8 @@ private:
/// Whether this section has had instructions emitted into it.
bool HasInstructions : 1;
+ bool HasLayout : 1;
+
bool IsRegistered : 1;
MCDummyFragment DummyFragment;
@@ -166,6 +168,9 @@ public:
bool hasInstructions() const { return HasInstructions; }
void setHasInstructions(bool Value) { HasInstructions = Value; }
+ bool hasLayout() const { return HasLayout; }
+ void setHasLayout(bool Value) { HasLayout = Value; }
+
bool isRegistered() const { return IsRegistered; }
void setIsRegistered(bool Value) { IsRegistered = Value; }
@@ -188,6 +193,8 @@ public:
iterator end() { return Fragments.end(); }
const_iterator end() const { return Fragments.end(); }
+ void addFragment(MCFragment &F) { Fragments.push_back(&F); }
+
MCSection::iterator getSubsectionInsertionPoint(unsigned Subsection);
void dump() const;
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 15b9eb6..d6831ee 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -729,23 +729,20 @@ Function* InstrProfSymtab::getFunction(uint64_t FuncMD5Hash) {
}
GlobalVariable *InstrProfSymtab::getGlobalVariable(uint64_t MD5Hash) {
- if (auto Iter = MD5VTableMap.find(MD5Hash); Iter != MD5VTableMap.end())
- return Iter->second;
- return nullptr;
+ return MD5VTableMap.lookup(MD5Hash);
}
// To store the sums of profile count values, or the percentage of
// the sums of the total count values.
struct CountSumOrPercent {
- uint64_t NumEntries;
- double CountSum;
- double ValueCounts[IPVK_Last - IPVK_First + 1];
- CountSumOrPercent() : NumEntries(0), CountSum(0.0f), ValueCounts() {}
+ uint64_t NumEntries = 0;
+ double CountSum = 0.0f;
+ std::array<double, IPVK_Last - IPVK_First + 1> ValueCounts = {};
+ CountSumOrPercent() = default;
void reset() {
NumEntries = 0;
CountSum = 0.0f;
- for (double &VC : ValueCounts)
- VC = 0.0f;
+ ValueCounts.fill(0.0f);
}
};
@@ -761,15 +758,13 @@ struct OverlapStats {
CountSumOrPercent Mismatch;
CountSumOrPercent Unique;
OverlapStatsLevel Level;
- const std::string *BaseFilename;
- const std::string *TestFilename;
+ const std::string *BaseFilename = nullptr;
+ const std::string *TestFilename = nullptr;
StringRef FuncName;
- uint64_t FuncHash;
- bool Valid;
+ uint64_t FuncHash = 0;
+ bool Valid = false;
- OverlapStats(OverlapStatsLevel L = ProgramLevel)
- : Level(L), BaseFilename(nullptr), TestFilename(nullptr), FuncHash(0),
- Valid(false) {}
+ OverlapStats(OverlapStatsLevel L = ProgramLevel) : Level(L) {}
void dump(raw_fd_ostream &OS) const;
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 34dba87..3b307d0 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -693,22 +693,15 @@ private:
/// Context sensitive profile summary data.
std::unique_ptr<ProfileSummary> CS_Summary;
IndexedMemProfReader MemProfReader;
- /// VTableNamePtr points to the beginning of compressed vtable names.
- /// When a symtab is constructed from profiles by llvm-profdata, the list of
- /// names could be decompressed based on `VTableNamePtr` and
- /// `CompressedVTableNamesLen`.
+ /// The compressed vtable names, to be used for symtab construction.
/// A compiler that reads indexed profiles could construct symtab from module
/// IR so it doesn't need the decompressed names.
- const char *VTableNamePtr = nullptr;
- /// The length of compressed vtable names.
- uint64_t CompressedVTableNamesLen = 0;
- /// Total size of binary ids.
- uint64_t BinaryIdsSize{0};
- /// Start address of binary id length and data pairs.
- const uint8_t *BinaryIdsStart = nullptr;
+ StringRef VTableName;
+ /// A memory buffer holding binary ids.
+ ArrayRef<uint8_t> BinaryIdsBuffer;
// Index to the current record in the record array.
- unsigned RecordIndex;
+ unsigned RecordIndex = 0;
// Read the profile summary. Return a pointer pointing to one byte past the
// end of the summary data if it exists or the input \c Cur.
@@ -721,7 +714,7 @@ public:
std::unique_ptr<MemoryBuffer> DataBuffer,
std::unique_ptr<MemoryBuffer> RemappingBuffer = nullptr)
: DataBuffer(std::move(DataBuffer)),
- RemappingBuffer(std::move(RemappingBuffer)), RecordIndex(0) {}
+ RemappingBuffer(std::move(RemappingBuffer)) {}
IndexedInstrProfReader(const IndexedInstrProfReader &) = delete;
IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete;
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 0e4bb9c..53ddfd1 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -28,7 +28,8 @@ enum IndexedVersion : uint64_t {
Version1 = 1,
// Version 2: Added a call stack table.
Version2 = 2,
- // Version 3: Under development.
+ // Version 3: Added a radix tree for call stacks. Switched to linear IDs for
+ // frames and call stacks.
Version3 = 3,
};
@@ -439,7 +440,7 @@ struct IndexedMemProfRecord {
// on the schema provided in \p Schema.
void serialize(const MemProfSchema &Schema, raw_ostream &OS,
IndexedVersion Version,
- llvm::DenseMap<memprof::CallStackId, LinearCallStackId>
+ llvm::DenseMap<CallStackId, LinearCallStackId>
*MemProfCallStackIndexes = nullptr) const;
// Deserializes memprof records from the Buffer.
@@ -579,15 +580,14 @@ private:
IndexedVersion Version;
// Mappings from CallStackId to the indexes into the call stack array.
- llvm::DenseMap<memprof::CallStackId, LinearCallStackId>
- *MemProfCallStackIndexes;
+ llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes;
public:
// We do not support the default constructor, which does not set Version.
RecordWriterTrait() = delete;
- RecordWriterTrait(const MemProfSchema *Schema, IndexedVersion V,
- llvm::DenseMap<memprof::CallStackId, LinearCallStackId>
- *MemProfCallStackIndexes)
+ RecordWriterTrait(
+ const MemProfSchema *Schema, IndexedVersion V,
+ llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes)
: Schema(Schema), Version(V),
MemProfCallStackIndexes(MemProfCallStackIndexes) {}
@@ -932,6 +932,18 @@ struct IndexedMemProfData {
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> CallStackData;
};
+struct FrameStat {
+ // The number of occurrences of a given FrameId.
+ uint64_t Count = 0;
+ // The sum of indexes where a given FrameId shows up.
+ uint64_t PositionSum = 0;
+};
+
+// Compute a histogram of Frames in call stacks.
+llvm::DenseMap<FrameId, FrameStat>
+computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+ &MemProfCallStackData);
+
// Construct a radix tree of call stacks.
//
// A set of call stacks might look like:
@@ -1029,7 +1041,8 @@ public:
// Build a radix tree array.
void build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
&&MemProfCallStackData,
- const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes);
+ const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
+ llvm::DenseMap<FrameId, FrameStat> &FrameHistogram);
const std::vector<LinearFrameId> &getRadixArray() const { return RadixArray; }
diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h
index b42e4f5..fbba6483 100644
--- a/llvm/include/llvm/ProfileData/MemProfReader.h
+++ b/llvm/include/llvm/ProfileData/MemProfReader.h
@@ -76,8 +76,8 @@ public:
Callback =
std::bind(&MemProfReader::idToFrame, this, std::placeholders::_1);
- memprof::CallStackIdConverter<decltype(CSIdToCallStack)> CSIdConv(
- CSIdToCallStack, Callback);
+ CallStackIdConverter<decltype(CSIdToCallStack)> CSIdConv(CSIdToCallStack,
+ Callback);
const IndexedMemProfRecord &IndexedRecord = Iter->second;
GuidRecord = {
@@ -137,7 +137,7 @@ class RawMemProfReader final : public MemProfReader {
public:
RawMemProfReader(const RawMemProfReader &) = delete;
RawMemProfReader &operator=(const RawMemProfReader &) = delete;
- virtual ~RawMemProfReader() override = default;
+ ~RawMemProfReader() override = default;
// Prints the contents of the profile in YAML format.
void printYAML(raw_ostream &OS);
@@ -161,7 +161,7 @@ public:
// Returns a list of build ids recorded in the segment information.
static std::vector<std::string> peekBuildIds(MemoryBuffer *DataBuffer);
- virtual Error
+ Error
readNextRecord(GuidMemProfRecordPair &GuidRecord,
std::function<const Frame(const FrameId)> Callback) override;
diff --git a/llvm/include/llvm/ProfileData/SampleProfWriter.h b/llvm/include/llvm/ProfileData/SampleProfWriter.h
index 963a4d4..5398a44 100644
--- a/llvm/include/llvm/ProfileData/SampleProfWriter.h
+++ b/llvm/include/llvm/ProfileData/SampleProfWriter.h
@@ -169,7 +169,7 @@ public:
protected:
SampleProfileWriterText(std::unique_ptr<raw_ostream> &OS)
- : SampleProfileWriter(OS), Indent(0) {}
+ : SampleProfileWriter(OS) {}
std::error_code writeHeader(const SampleProfileMap &ProfileMap) override {
LineCount = 0;
@@ -180,7 +180,7 @@ private:
/// Indent level to use when writing.
///
/// This is used when printing inlined callees.
- unsigned Indent;
+ unsigned Indent = 0;
friend ErrorOr<std::unique_ptr<SampleProfileWriter>>
SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS,
diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
index 1e0d0ee..d190227 100644
--- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h
+++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
@@ -208,7 +208,7 @@ BlockT *LoopBase<BlockT, LoopT>::getLoopPreheader() const {
return nullptr;
// Make sure there is only one exit out of the preheader.
- if (llvm::size(llvm::children<BlockT *>(Out)) != 1)
+ if (!llvm::hasSingleElement(llvm::children<BlockT *>(Out)))
return nullptr; // Multiple exits from the block, must not be a preheader.
// The predecessor has exactly one successor, so it is a preheader.
diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h
index bd536d4..1121459 100644
--- a/llvm/include/llvm/TableGen/DirectiveEmitter.h
+++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h
@@ -55,6 +55,10 @@ public:
return Records.getAllDerivedDefinitions("Association");
}
+ std::vector<Record *> getCategories() const {
+ return Records.getAllDerivedDefinitions("Category");
+ }
+
std::vector<Record *> getDirectives() const {
return Records.getAllDerivedDefinitions("Directive");
}
@@ -131,6 +135,8 @@ public:
}
Record *getAssociation() const { return Def->getValueAsDef("association"); }
+
+ Record *getCategory() const { return Def->getValueAsDef("category"); }
};
// Wrapper class that contains Clause's information defined in DirectiveBase.td
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index afe6789..dcb00f6 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -132,48 +132,6 @@ struct ExtensionInfo {
#define EMIT_EXTENSIONS
#include "llvm/TargetParser/AArch64TargetParserDef.inc"
-struct ExtensionSet {
- // Set of extensions which are currently enabled.
- ExtensionBitset Enabled;
- // Set of extensions which have been enabled or disabled at any point. Used
- // to avoid cluttering the cc1 command-line with lots of unneeded features.
- ExtensionBitset Touched;
- // Base architecture version, which we need to know because some feature
- // dependencies change depending on this.
- const ArchInfo *BaseArch;
-
- ExtensionSet() : Enabled(), Touched(), BaseArch(nullptr) {}
-
- // Enable the given architecture extension, and any other extensions it
- // depends on. Does not change the base architecture, or follow dependencies
- // between features which are only related by required arcitecture versions.
- void enable(ArchExtKind E);
-
- // Disable the given architecture extension, and any other extensions which
- // depend on it. Does not change the base architecture, or follow
- // dependencies between features which are only related by required
- // arcitecture versions.
- void disable(ArchExtKind E);
-
- // Add default extensions for the given CPU. Records the base architecture,
- // to later resolve dependencies which depend on it.
- void addCPUDefaults(const CpuInfo &CPU);
-
- // Add default extensions for the given architecture version. Records the
- // base architecture, to later resolve dependencies which depend on it.
- void addArchDefaults(const ArchInfo &Arch);
-
- // Add or remove a feature based on a modifier string. The string must be of
- // the form "<name>" to enable a feature or "no<name>" to disable it. This
- // will also enable or disable any features as required by the dependencies
- // between them.
- bool parseModifier(StringRef Modifier);
-
- // Convert the set of enabled extension to an LLVM feature list, appending
- // them to Features.
- void toLLVMFeatureList(std::vector<StringRef> &Features) const;
-};
-
// Represents a dependency between two architecture extensions. Later is the
// feature which was added to the architecture after Earlier, and expands the
// functionality provided by it. If Later is enabled, then Earlier will also be
@@ -480,7 +438,7 @@ inline constexpr CpuInfo CpuInfos[] = {
AArch64::ExtensionBitset({AArch64::AEK_AES, AArch64::AEK_SHA2,
AArch64::AEK_SHA3, AArch64::AEK_FP16,
AArch64::AEK_FP16FML})},
- {"apple-a14", ARMV8_5A,
+ {"apple-a14", ARMV8_4A,
AArch64::ExtensionBitset({AArch64::AEK_AES, AArch64::AEK_SHA2,
AArch64::AEK_SHA3, AArch64::AEK_FP16,
AArch64::AEK_FP16FML})},
@@ -497,7 +455,7 @@ inline constexpr CpuInfo CpuInfos[] = {
AArch64::AEK_SHA3, AArch64::AEK_FP16,
AArch64::AEK_FP16FML})},
- {"apple-m1", ARMV8_5A,
+ {"apple-m1", ARMV8_4A,
AArch64::ExtensionBitset({AArch64::AEK_AES, AArch64::AEK_SHA2,
AArch64::AEK_SHA3, AArch64::AEK_FP16,
AArch64::AEK_FP16FML})},
@@ -584,6 +542,65 @@ inline constexpr CpuInfo CpuInfos[] = {
AArch64::AEK_PROFILE}))},
};
+struct ExtensionSet {
+ // Set of extensions which are currently enabled.
+ ExtensionBitset Enabled;
+ // Set of extensions which have been enabled or disabled at any point. Used
+ // to avoid cluttering the cc1 command-line with lots of unneeded features.
+ ExtensionBitset Touched;
+ // Base architecture version, which we need to know because some feature
+ // dependencies change depending on this.
+ const ArchInfo *BaseArch;
+
+ ExtensionSet() : Enabled(), Touched(), BaseArch(nullptr) {}
+
+ // Enable the given architecture extension, and any other extensions it
+ // depends on. Does not change the base architecture, or follow dependencies
+ // between features which are only related by required architecture versions.
+ void enable(ArchExtKind E);
+
+ // Disable the given architecture extension, and any other extensions which
+ // depend on it. Does not change the base architecture, or follow
+ // dependencies between features which are only related by required
+ // architecture versions.
+ void disable(ArchExtKind E);
+
+ // Add default extensions for the given CPU. Records the base architecture,
+ // to later resolve dependencies which depend on it.
+ void addCPUDefaults(const CpuInfo &CPU);
+
+ // Add default extensions for the given architecture version. Records the
+ // base architecture, to later resolve dependencies which depend on it.
+ void addArchDefaults(const ArchInfo &Arch);
+
+ // Add or remove a feature based on a modifier string. The string must be of
+ // the form "<name>" to enable a feature or "no<name>" to disable it. This
+ // will also enable or disable any features as required by the dependencies
+ // between them.
+ bool parseModifier(StringRef Modifier, const bool AllowNoDashForm = false);
+
+ // Constructs a new ExtensionSet by toggling the corresponding bits for every
+ // feature in the \p Features list without expanding their dependencies. Used
+ // for reconstructing an ExtensionSet from the output of toLLVMFeatures().
+ void reconstructFromParsedFeatures(const std::vector<std::string> &Features);
+
+ // Convert the set of enabled extension to an LLVM feature list, appending
+ // them to Features.
+ template <typename T> void toLLVMFeatureList(std::vector<T> &Features) const {
+ if (BaseArch && !BaseArch->ArchFeature.empty())
+ Features.emplace_back(T(BaseArch->ArchFeature));
+
+ for (const auto &E : Extensions) {
+ if (E.Feature.empty() || !Touched.test(E.ID))
+ continue;
+ if (Enabled.test(E.ID))
+ Features.emplace_back(T(E.Feature));
+ else
+ Features.emplace_back(T(E.NegFeature));
+ }
+ }
+};
+
// Name alias.
struct Alias {
StringRef AltName;
@@ -607,7 +624,13 @@ const ArchInfo *getArchForCpu(StringRef CPU);
// Parser
const ArchInfo *parseArch(StringRef Arch);
+
+// Return the extension which has the given -target-feature name.
+std::optional<ExtensionInfo> targetFeatureToExtension(StringRef TargetFeature);
+
+// Parse a name as defined by the Extension class in tablegen.
std::optional<ExtensionInfo> parseArchExtension(StringRef Extension);
+
// Given the name of a CPU or alias, return the correponding CpuInfo.
std::optional<CpuInfo> parseCpu(StringRef Name);
// Used by target parser tests
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 6cded82..8b6e56c 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -1116,9 +1116,6 @@ LazyValueInfoImpl::getValueFromSimpleICmpCondition(CmpInst::Predicate Pred,
if (!R)
return std::nullopt;
RHSRange = toConstantRange(*R, RHS->getType());
- } else if (Instruction *I = dyn_cast<Instruction>(RHS)) {
- if (auto *Ranges = I->getMetadata(LLVMContext::MD_range))
- RHSRange = getConstantRangeFromMetadata(*Ranges);
}
ConstantRange TrueValues =
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index f0fde9a..eb1e3e4 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -74,23 +74,6 @@ static std::string getTypeString(Type *T) {
return Tmp.str();
}
-// Whatever debug info format we parsed, we should convert to the expected debug
-// info format immediately afterwards.
-bool LLParser::finalizeDebugInfoFormat(Module *M) {
- // We should have already returned an error if we observed both intrinsics and
- // records in this IR.
- assert(!(SeenNewDbgInfoFormat && SeenOldDbgInfoFormat) &&
- "Mixed debug intrinsics/records seen without a parsing error?");
- if (PreserveInputDbgFormat == cl::boolOrDefault::BOU_TRUE) {
- UseNewDbgInfoFormat = SeenNewDbgInfoFormat;
- WriteNewDbgInfoFormatToBitcode = SeenNewDbgInfoFormat;
- WriteNewDbgInfoFormat = SeenNewDbgInfoFormat;
- } else if (M) {
- M->setIsNewDbgInfoFormat(false);
- }
- return false;
-}
-
/// Run: module ::= toplevelentity*
bool LLParser::Run(bool UpgradeDebugInfo,
DataLayoutCallbackTy DataLayoutCallback) {
@@ -108,7 +91,7 @@ bool LLParser::Run(bool UpgradeDebugInfo,
}
return parseTopLevelEntities() || validateEndOfModule(UpgradeDebugInfo) ||
- validateEndOfIndex() || finalizeDebugInfoFormat(M);
+ validateEndOfIndex();
}
bool LLParser::parseStandaloneConstantValue(Constant *&C,
@@ -207,6 +190,18 @@ void LLParser::dropUnknownMetadataReferences() {
bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
if (!M)
return false;
+
+ // We should have already returned an error if we observed both intrinsics and
+ // records in this IR.
+ assert(!(SeenNewDbgInfoFormat && SeenOldDbgInfoFormat) &&
+ "Mixed debug intrinsics/records seen without a parsing error?");
+ if (PreserveInputDbgFormat == cl::boolOrDefault::BOU_TRUE) {
+ UseNewDbgInfoFormat = SeenNewDbgInfoFormat;
+ WriteNewDbgInfoFormatToBitcode = SeenNewDbgInfoFormat;
+ WriteNewDbgInfoFormat = SeenNewDbgInfoFormat;
+ M->setNewDbgInfoFormatFlag(SeenNewDbgInfoFormat);
+ }
+
// Handle any function attribute group forward references.
for (const auto &RAG : ForwardRefAttrGroups) {
Value *V = RAG.first;
@@ -439,6 +434,9 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
UpgradeModuleFlags(*M);
UpgradeSectionAttributes(*M);
+ if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE)
+ M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat);
+
if (!Slots)
return false;
// Initialize the slot mapping.
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 8519796..026595b 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -4355,7 +4355,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE) {
TheModule->IsNewDbgInfoFormat =
UseNewDbgInfoFormat &&
- LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_TRUE;
+ LoadBitcodeIntoNewDbgInfoFormat != cl::boolOrDefault::BOU_FALSE;
}
this->ValueTypeCallback = std::move(Callbacks.ValueType);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 339a1f1..0e01080 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8866,7 +8866,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
scaleWeights(NewTrueWeight, NewFalseWeight);
Br1->setMetadata(LLVMContext::MD_prof,
MDBuilder(Br1->getContext())
- .createBranchWeights(TrueWeight, FalseWeight));
+ .createBranchWeights(TrueWeight, FalseWeight,
+ hasBranchWeightOrigin(*Br1)));
NewTrueWeight = TrueWeight;
NewFalseWeight = 2 * FalseWeight;
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index afe2703..37aa4e0 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -996,7 +996,6 @@ MachineInstrBuilder MachineIRBuilder::buildAtomicRMW(
LLT OldValResTy = OldValRes.getLLTTy(*getMRI());
LLT AddrTy = Addr.getLLTTy(*getMRI());
LLT ValTy = Val.getLLTTy(*getMRI());
- assert(OldValResTy.isScalar() && "invalid operand type");
assert(AddrTy.isPointer() && "invalid operand type");
assert(ValTy.isValid() && "invalid operand type");
assert(OldValResTy == ValTy && "type mismatch");
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index c67cc57..9fbb7b4 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -6,9 +6,8 @@
//
//===----------------------------------------------------------------------===//
//
-// Replaces LLVM IR instructions with vector operands (i.e., the frem
-// instruction or calls to LLVM intrinsics) with matching calls to functions
-// from a vector library (e.g libmvec, SVML) using TargetLibraryInfo interface.
+// Replaces calls to LLVM Intrinsics with matching calls to functions from a
+// vector library (e.g libmvec, SVML) using TargetLibraryInfo interface.
//
//===----------------------------------------------------------------------===//
@@ -25,6 +24,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/VFABIDemangler.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -70,84 +70,68 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy,
return TLIFunc;
}
-/// Replace the instruction \p I with a call to the corresponding function from
-/// the vector library (\p TLIVecFunc).
-static void replaceWithTLIFunction(Instruction &I, VFInfo &Info,
+/// Replace the intrinsic call \p II to \p TLIVecFunc, which is the
+/// corresponding function from the vector library.
+static void replaceWithTLIFunction(IntrinsicInst *II, VFInfo &Info,
Function *TLIVecFunc) {
- IRBuilder<> IRBuilder(&I);
- auto *CI = dyn_cast<CallInst>(&I);
- SmallVector<Value *> Args(CI ? CI->args() : I.operands());
+ IRBuilder<> IRBuilder(II);
+ SmallVector<Value *> Args(II->args());
if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) {
auto *MaskTy =
- VectorType::get(Type::getInt1Ty(I.getContext()), Info.Shape.VF);
+ VectorType::get(Type::getInt1Ty(II->getContext()), Info.Shape.VF);
Args.insert(Args.begin() + OptMaskpos.value(),
Constant::getAllOnesValue(MaskTy));
}
- // If it is a call instruction, preserve the operand bundles.
+ // Preserve the operand bundles.
SmallVector<OperandBundleDef, 1> OpBundles;
- if (CI)
- CI->getOperandBundlesAsDefs(OpBundles);
+ II->getOperandBundlesAsDefs(OpBundles);
auto *Replacement = IRBuilder.CreateCall(TLIVecFunc, Args, OpBundles);
- I.replaceAllUsesWith(Replacement);
+ II->replaceAllUsesWith(Replacement);
// Preserve fast math flags for FP math.
if (isa<FPMathOperator>(Replacement))
- Replacement->copyFastMathFlags(&I);
+ Replacement->copyFastMathFlags(II);
}
-/// Returns true when successfully replaced \p I with a suitable function taking
-/// vector arguments, based on available mappings in the \p TLI. Currently only
-/// works when \p I is a call to vectorized intrinsic or the frem instruction.
+/// Returns true when successfully replaced \p II, which is a call to a
+/// vectorized intrinsic, with a suitable function taking vector arguments,
+/// based on available mappings in the \p TLI.
static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
- Instruction &I) {
+ IntrinsicInst *II) {
+ assert(II != nullptr && "Intrinsic cannot be null");
// At the moment VFABI assumes the return type is always widened unless it is
// a void type.
- auto *VTy = dyn_cast<VectorType>(I.getType());
+ auto *VTy = dyn_cast<VectorType>(II->getType());
ElementCount EC(VTy ? VTy->getElementCount() : ElementCount::getFixed(0));
-
- // Compute the argument types of the corresponding scalar call and the scalar
- // function name. For calls, it additionally finds the function to replace
- // and checks that all vector operands match the previously found EC.
+ // Compute the argument types of the corresponding scalar call and check that
+ // all vector operands match the previously found EC.
SmallVector<Type *, 8> ScalarArgTypes;
- std::string ScalarName;
- Function *FuncToReplace = nullptr;
- auto *CI = dyn_cast<CallInst>(&I);
- if (CI) {
- FuncToReplace = CI->getCalledFunction();
- Intrinsic::ID IID = FuncToReplace->getIntrinsicID();
- assert(IID != Intrinsic::not_intrinsic && "Not an intrinsic");
- for (auto Arg : enumerate(CI->args())) {
- auto *ArgTy = Arg.value()->getType();
- if (isVectorIntrinsicWithScalarOpAtArg(IID, Arg.index())) {
- ScalarArgTypes.push_back(ArgTy);
- } else if (auto *VectorArgTy = dyn_cast<VectorType>(ArgTy)) {
- ScalarArgTypes.push_back(VectorArgTy->getElementType());
- // When return type is void, set EC to the first vector argument, and
- // disallow vector arguments with different ECs.
- if (EC.isZero())
- EC = VectorArgTy->getElementCount();
- else if (EC != VectorArgTy->getElementCount())
- return false;
- } else
- // Exit when it is supposed to be a vector argument but it isn't.
+ Intrinsic::ID IID = II->getIntrinsicID();
+ for (auto Arg : enumerate(II->args())) {
+ auto *ArgTy = Arg.value()->getType();
+ if (isVectorIntrinsicWithScalarOpAtArg(IID, Arg.index())) {
+ ScalarArgTypes.push_back(ArgTy);
+ } else if (auto *VectorArgTy = dyn_cast<VectorType>(ArgTy)) {
+ ScalarArgTypes.push_back(VectorArgTy->getElementType());
+ // When return type is void, set EC to the first vector argument, and
+ // disallow vector arguments with different ECs.
+ if (EC.isZero())
+ EC = VectorArgTy->getElementCount();
+ else if (EC != VectorArgTy->getElementCount())
return false;
- }
- // Try to reconstruct the name for the scalar version of the instruction,
- // using scalar argument types.
- ScalarName = Intrinsic::isOverloaded(IID)
- ? Intrinsic::getName(IID, ScalarArgTypes, I.getModule())
- : Intrinsic::getName(IID).str();
- } else {
- assert(VTy && "Return type must be a vector");
- auto *ScalarTy = VTy->getScalarType();
- LibFunc Func;
- if (!TLI.getLibFunc(I.getOpcode(), ScalarTy, Func))
+ } else
+ // Exit when it is supposed to be a vector argument but it isn't.
return false;
- ScalarName = TLI.getName(Func);
- ScalarArgTypes = {ScalarTy, ScalarTy};
}
+ // Try to reconstruct the name for the scalar version of the instruction,
+ // using scalar argument types.
+ std::string ScalarName =
+ Intrinsic::isOverloaded(IID)
+ ? Intrinsic::getName(IID, ScalarArgTypes, II->getModule())
+ : Intrinsic::getName(IID).str();
+
// Try to find the mapping for the scalar version of this intrinsic and the
// exact vector width of the call operands in the TargetLibraryInfo. First,
// check with a non-masked variant, and if that fails try with a masked one.
@@ -162,7 +146,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
// Replace the call to the intrinsic with a call to the vector library
// function.
- Type *ScalarRetTy = I.getType()->getScalarType();
+ Type *ScalarRetTy = II->getType()->getScalarType();
FunctionType *ScalarFTy =
FunctionType::get(ScalarRetTy, ScalarArgTypes, /*isVarArg*/ false);
const std::string MangledName = VD->getVectorFunctionABIVariantString();
@@ -174,22 +158,19 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
// specification when being created, this is why we need to add extra check to
// make sure that the operands of the vector function obtained via VFABI match
// the operands of the original vector instruction.
- if (CI) {
- for (auto &VFParam : OptInfo->Shape.Parameters) {
- if (VFParam.ParamKind == VFParamKind::GlobalPredicate)
- continue;
+ for (auto &VFParam : OptInfo->Shape.Parameters) {
+ if (VFParam.ParamKind == VFParamKind::GlobalPredicate)
+ continue;
- // tryDemangleForVFABI must return valid ParamPos, otherwise it could be
- // a bug in the VFABI parser.
- assert(VFParam.ParamPos < CI->arg_size() &&
- "ParamPos has invalid range.");
- Type *OrigTy = CI->getArgOperand(VFParam.ParamPos)->getType();
- if (OrigTy->isVectorTy() != (VFParam.ParamKind == VFParamKind::Vector)) {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Will not replace: " << ScalarName
- << ". Wrong type at index " << VFParam.ParamPos
- << ": " << *OrigTy << "\n");
- return false;
- }
+ // tryDemangleForVFABI must return valid ParamPos, otherwise it could be
+ // a bug in the VFABI parser.
+ assert(VFParam.ParamPos < II->arg_size() && "ParamPos has invalid range");
+ Type *OrigTy = II->getArgOperand(VFParam.ParamPos)->getType();
+ if (OrigTy->isVectorTy() != (VFParam.ParamKind == VFParamKind::Vector)) {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Will not replace: " << ScalarName
+ << ". Wrong type at index " << VFParam.ParamPos << ": "
+ << *OrigTy << "\n");
+ return false;
}
}
@@ -197,45 +178,32 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
if (!VectorFTy)
return false;
- Function *TLIFunc = getTLIFunction(I.getModule(), VectorFTy,
- VD->getVectorFnName(), FuncToReplace);
-
- replaceWithTLIFunction(I, *OptInfo, TLIFunc);
+ Function *TLIFunc =
+ getTLIFunction(II->getModule(), VectorFTy, VD->getVectorFnName(),
+ II->getCalledFunction());
+ replaceWithTLIFunction(II, *OptInfo, TLIFunc);
LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" << ScalarName
<< "` with call to `" << TLIFunc->getName() << "`.\n");
++NumCallsReplaced;
return true;
}
-/// Supported instruction \p I must be a vectorized frem or a call to an
-/// intrinsic that returns either void or a vector.
-static bool isSupportedInstruction(Instruction *I) {
- Type *Ty = I->getType();
- if (auto *CI = dyn_cast<CallInst>(I))
- return (Ty->isVectorTy() || Ty->isVoidTy()) && CI->getCalledFunction() &&
- CI->getCalledFunction()->getIntrinsicID() !=
- Intrinsic::not_intrinsic;
- if (I->getOpcode() == Instruction::FRem && Ty->isVectorTy())
- return true;
- return false;
-}
-
static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
- bool Changed = false;
SmallVector<Instruction *> ReplacedCalls;
for (auto &I : instructions(F)) {
- if (!isSupportedInstruction(&I))
- continue;
- if (replaceWithCallToVeclib(TLI, I)) {
- ReplacedCalls.push_back(&I);
- Changed = true;
+ // Process only intrinsic calls that return void or a vector.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (!II->getType()->isVectorTy() && !II->getType()->isVoidTy())
+ continue;
+
+ if (replaceWithCallToVeclib(TLI, II))
+ ReplacedCalls.push_back(&I);
}
}
- // Erase the calls to the intrinsics that have been replaced
- // with calls to the vector library.
- for (auto *CI : ReplacedCalls)
- CI->eraseFromParent();
- return Changed;
+ // Erase any intrinsic calls that were replaced with vector library calls.
+ for (auto *I : ReplacedCalls)
+ I->eraseFromParent();
+ return !ReplacedCalls.empty();
}
////////////////////////////////////////////////////////////////////////////////
@@ -246,7 +214,7 @@ PreservedAnalyses ReplaceWithVeclib::run(Function &F,
const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto Changed = runImpl(TLI, F);
if (Changed) {
- LLVM_DEBUG(dbgs() << "Instructions replaced with vector libraries: "
+ LLVM_DEBUG(dbgs() << "Intrinsic calls replaced with vector libraries: "
<< NumCallsReplaced << "\n");
PreservedAnalyses PA;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2d5968b..4fcbe08 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10112,7 +10112,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// fold (shl X, cttz(Y)) -> (mul (Y & -Y), X) if cttz is unsupported on the
// target.
if (((N1.getOpcode() == ISD::CTTZ &&
- VT.getScalarSizeInBits() >= ShiftVT.getScalarSizeInBits()) ||
+ VT.getScalarSizeInBits() <= ShiftVT.getScalarSizeInBits()) ||
N1.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
N1.hasOneUse() && !TLI.isOperationLegalOrCustom(ISD::CTTZ, ShiftVT) &&
TLI.isOperationLegalOrCustom(ISD::MUL, VT)) {
@@ -17262,26 +17262,29 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (SDValue V = combineRepeatedFPDivisors(N))
return V;
- if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
- // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
- if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) {
- // Compute the reciprocal 1.0 / c2.
- const APFloat &N1APF = N1CFP->getValueAPF();
- APFloat Recip(N1APF.getSemantics(), 1); // 1.0
- APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
- // Only do the transform if the reciprocal is a legal fp immediate that
- // isn't too nasty (eg NaN, denormal, ...).
- if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
- (!LegalOperations ||
- // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
- // backend)... we should handle this gracefully after Legalize.
- // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
- TLI.isOperationLegal(ISD::ConstantFP, VT) ||
- TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
- return DAG.getNode(ISD::FMUL, DL, VT, N0,
- DAG.getConstantFP(Recip, DL, VT));
- }
+ // fold (fdiv X, c2) -> (fmul X, 1/c2) if there is no loss in precision, or
+ // the loss is acceptable with AllowReciprocal.
+ if (auto *N1CFP = isConstOrConstSplatFP(N1, true)) {
+ // Compute the reciprocal 1.0 / c2.
+ const APFloat &N1APF = N1CFP->getValueAPF();
+ APFloat Recip = APFloat::getOne(N1APF.getSemantics());
+ APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
+ // Only do the transform if the reciprocal is a legal fp immediate that
+ // isn't too nasty (eg NaN, denormal, ...).
+ if (((st == APFloat::opOK && !Recip.isDenormal()) ||
+ (st == APFloat::opInexact &&
+ (Options.UnsafeFPMath || Flags.hasAllowReciprocal()))) &&
+ (!LegalOperations ||
+ // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
+ // backend)... we should handle this gracefully after Legalize.
+ // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
+ TLI.isOperationLegal(ISD::ConstantFP, VT) ||
+ TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
+ return DAG.getNode(ISD::FMUL, DL, VT, N0,
+ DAG.getConstantFP(Recip, DL, VT));
+ }
+ if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
@@ -26586,7 +26589,12 @@ SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
}
}
- return SDValue();
+ // Sometimes constants manage to survive very late in the pipeline, e.g.,
+ // because they are wrapped inside the <1 x f16> type. Try one last time to
+ // get rid of them.
+ SDValue Folded = DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N),
+ N->getValueType(0), {N0});
+ return Folded;
}
SDValue DAGCombiner::visitFP_TO_BF16(SDNode *N) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index ddb8a0e..523d3ae 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6551,17 +6551,17 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
ElementCount NumElts = VT.getVectorElementCount();
- // See if we can fold through bitcasted integer ops.
+ // See if we can fold through any bitcasted integer ops.
if (NumOps == 2 && VT.isFixedLengthVector() && VT.isInteger() &&
Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
- Ops[0].getOpcode() == ISD::BITCAST &&
- Ops[1].getOpcode() == ISD::BITCAST) {
+ (Ops[0].getOpcode() == ISD::BITCAST ||
+ Ops[1].getOpcode() == ISD::BITCAST)) {
SDValue N1 = peekThroughBitcasts(Ops[0]);
SDValue N2 = peekThroughBitcasts(Ops[1]);
auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(N2);
- EVT BVVT = N1.getValueType();
- if (BV1 && BV2 && BVVT.isInteger() && BVVT == N2.getValueType()) {
+ if (BV1 && BV2 && N1.getValueType().isInteger() &&
+ N2.getValueType().isInteger()) {
bool IsLE = getDataLayout().isLittleEndian();
unsigned EltBits = VT.getScalarSizeInBits();
SmallVector<APInt> RawBits1, RawBits2;
@@ -6577,15 +6577,22 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
RawBits.push_back(*Fold);
}
if (RawBits.size() == NumElts.getFixedValue()) {
- // We have constant folded, but we need to cast this again back to
- // the original (possibly legalized) type.
+ // We have constant folded, but we might need to cast this again back
+ // to the original (possibly legalized) type.
+ EVT BVVT, BVEltVT;
+ if (N1.getValueType() == VT) {
+ BVVT = N1.getValueType();
+ BVEltVT = BV1->getOperand(0).getValueType();
+ } else {
+ BVVT = N2.getValueType();
+ BVEltVT = BV2->getOperand(0).getValueType();
+ }
+ unsigned BVEltBits = BVEltVT.getSizeInBits();
SmallVector<APInt> DstBits;
BitVector DstUndefs;
BuildVectorSDNode::recastRawBits(IsLE, BVVT.getScalarSizeInBits(),
DstBits, RawBits, DstUndefs,
BitVector(RawBits.size(), false));
- EVT BVEltVT = BV1->getOperand(0).getValueType();
- unsigned BVEltBits = BVEltVT.getSizeInBits();
SmallVector<SDValue> Ops(DstBits.size(), getUNDEF(BVEltVT));
for (unsigned I = 0, E = DstBits.size(); I != E; ++I) {
if (DstUndefs[I])
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 29f2cbf..aea9425 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -181,7 +181,7 @@ template class llvm::SymbolTableListTraits<Instruction,
BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
BasicBlock *InsertBefore)
: Value(Type::getLabelTy(C), Value::BasicBlockVal),
- IsNewDbgInfoFormat(false), Parent(nullptr) {
+ IsNewDbgInfoFormat(UseNewDbgInfoFormat), Parent(nullptr) {
if (NewParent)
insertInto(NewParent, InsertBefore);
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 7b8b9a7..77a8336 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1416,64 +1416,16 @@ static Constant *foldGEPOfGEP(GEPOperator *GEP, Type *PointeeTy, bool InBounds,
if (GEP->getInRange())
return nullptr;
+ // Only handle simple case with leading zero index. We cannot perform an
+ // actual addition as we don't know the correct index type size to use.
Constant *Idx0 = cast<Constant>(Idxs[0]);
- if (Idx0->isNullValue()) {
- // Handle the simple case of a zero index.
- SmallVector<Value*, 16> NewIndices;
- NewIndices.reserve(Idxs.size() + GEP->getNumIndices());
- NewIndices.append(GEP->idx_begin(), GEP->idx_end());
- NewIndices.append(Idxs.begin() + 1, Idxs.end());
- return ConstantExpr::getGetElementPtr(
- GEP->getSourceElementType(), cast<Constant>(GEP->getPointerOperand()),
- NewIndices, InBounds && GEP->isInBounds());
- }
-
- gep_type_iterator LastI = gep_type_end(GEP);
- for (gep_type_iterator I = gep_type_begin(GEP), E = gep_type_end(GEP);
- I != E; ++I)
- LastI = I;
-
- // We can't combine GEPs if the last index is a struct type.
- if (!LastI.isSequential())
- return nullptr;
- // We could perform the transform with non-constant index, but prefer leaving
- // it as GEP of GEP rather than GEP of add for now.
- ConstantInt *CI = dyn_cast<ConstantInt>(Idx0);
- if (!CI)
- return nullptr;
-
- // TODO: This code may be extended to handle vectors as well.
- auto *LastIdx = cast<Constant>(GEP->getOperand(GEP->getNumOperands()-1));
- Type *LastIdxTy = LastIdx->getType();
- if (LastIdxTy->isVectorTy())
+ if (!Idx0->isNullValue())
return nullptr;
SmallVector<Value*, 16> NewIndices;
NewIndices.reserve(Idxs.size() + GEP->getNumIndices());
- NewIndices.append(GEP->idx_begin(), GEP->idx_end() - 1);
-
- // Add the last index of the source with the first index of the new GEP.
- // Make sure to handle the case when they are actually different types.
- if (LastIdxTy != Idx0->getType()) {
- unsigned CommonExtendedWidth =
- std::max(LastIdxTy->getIntegerBitWidth(),
- Idx0->getType()->getIntegerBitWidth());
- CommonExtendedWidth = std::max(CommonExtendedWidth, 64U);
-
- Type *CommonTy =
- Type::getIntNTy(LastIdxTy->getContext(), CommonExtendedWidth);
- if (Idx0->getType() != CommonTy)
- Idx0 = ConstantFoldCastInstruction(Instruction::SExt, Idx0, CommonTy);
- if (LastIdx->getType() != CommonTy)
- LastIdx =
- ConstantFoldCastInstruction(Instruction::SExt, LastIdx, CommonTy);
- if (!Idx0 || !LastIdx)
- return nullptr;
- }
-
- NewIndices.push_back(ConstantExpr::get(Instruction::Add, Idx0, LastIdx));
+ NewIndices.append(GEP->idx_begin(), GEP->idx_end());
NewIndices.append(Idxs.begin() + 1, Idxs.end());
-
return ConstantExpr::getGetElementPtr(
GEP->getSourceElementType(), cast<Constant>(GEP->getPointerOperand()),
NewIndices, InBounds && GEP->isInBounds());
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 4a508a0..3a91a4e 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -3137,16 +3137,35 @@ LLVMBuilderRef LLVMCreateBuilder(void) {
return LLVMCreateBuilderInContext(LLVMGetGlobalContext());
}
+static void LLVMPositionBuilderImpl(IRBuilder<> *Builder, BasicBlock *Block,
+ Instruction *Instr, bool BeforeDbgRecords) {
+ BasicBlock::iterator I = Instr ? Instr->getIterator() : Block->end();
+ I.setHeadBit(BeforeDbgRecords);
+ Builder->SetInsertPoint(Block, I);
+}
+
void LLVMPositionBuilder(LLVMBuilderRef Builder, LLVMBasicBlockRef Block,
LLVMValueRef Instr) {
- BasicBlock *BB = unwrap(Block);
- auto I = Instr ? unwrap<Instruction>(Instr)->getIterator() : BB->end();
- unwrap(Builder)->SetInsertPoint(BB, I);
+ return LLVMPositionBuilderImpl(unwrap(Builder), unwrap(Block),
+ unwrap<Instruction>(Instr), false);
+}
+
+void LLVMPositionBuilderBeforeDbgRecords(LLVMBuilderRef Builder,
+ LLVMBasicBlockRef Block,
+ LLVMValueRef Instr) {
+ return LLVMPositionBuilderImpl(unwrap(Builder), unwrap(Block),
+ unwrap<Instruction>(Instr), true);
}
void LLVMPositionBuilderBefore(LLVMBuilderRef Builder, LLVMValueRef Instr) {
Instruction *I = unwrap<Instruction>(Instr);
- unwrap(Builder)->SetInsertPoint(I->getParent(), I->getIterator());
+ return LLVMPositionBuilderImpl(unwrap(Builder), I->getParent(), I, false);
+}
+
+void LLVMPositionBuilderBeforeInstrAndDbgRecords(LLVMBuilderRef Builder,
+ LLVMValueRef Instr) {
+ Instruction *I = unwrap<Instruction>(Instr);
+ return LLVMPositionBuilderImpl(unwrap(Builder), I->getParent(), I, true);
}
void LLVMPositionBuilderAtEnd(LLVMBuilderRef Builder, LLVMBasicBlockRef Block) {
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index fbca7cd..9a4926c 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -366,8 +366,8 @@ void DbgVariableRecord::setKillLocation() {
}
bool DbgVariableRecord::isKillLocation() const {
- return (getNumVariableLocationOps() == 0 &&
- !getExpression()->isComplex()) ||
+ return (!hasArgList() && isa<MDNode>(getRawLocation())) ||
+ (getNumVariableLocationOps() == 0 && !getExpression()->isComplex()) ||
any_of(location_ops(), [](Value *V) { return isa<UndefValue>(V); });
}
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 3f73502..9360e6d 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -83,6 +83,8 @@ static cl::opt<int> NonGlobalValueMaxNameSize(
"non-global-value-max-name-size", cl::Hidden, cl::init(1024),
cl::desc("Maximum size for the name of non-global values."));
+extern cl::opt<bool> UseNewDbgInfoFormat;
+
void Function::convertToNewDbgValues() {
IsNewDbgInfoFormat = true;
for (auto &BB : *this) {
@@ -441,7 +443,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
: GlobalObject(Ty, Value::FunctionVal,
OperandTraits<Function>::op_begin(this), 0, Linkage, name,
computeAddrSpace(AddrSpace, ParentModule)),
- NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(false) {
+ NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
"invalid return type");
setGlobalObjectSubClassData(0);
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 29272e6..aec927a 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1268,12 +1268,23 @@ Instruction *Instruction::cloneImpl() const {
void Instruction::swapProfMetadata() {
MDNode *ProfileData = getBranchWeightMDNode(*this);
- if (!ProfileData || ProfileData->getNumOperands() != 3)
+ if (!ProfileData)
+ return;
+ unsigned FirstIdx = getBranchWeightOffset(ProfileData);
+ if (ProfileData->getNumOperands() != 2 + FirstIdx)
return;
- // The first operand is the name. Fetch them backwards and build a new one.
- Metadata *Ops[] = {ProfileData->getOperand(0), ProfileData->getOperand(2),
- ProfileData->getOperand(1)};
+ unsigned SecondIdx = FirstIdx + 1;
+ SmallVector<Metadata *, 4> Ops;
+ // If there are more weights past the second, we can't swap them
+ if (ProfileData->getNumOperands() > SecondIdx + 1)
+ return;
+ for (unsigned Idx = 0; Idx < FirstIdx; ++Idx) {
+ Ops.push_back(ProfileData->getOperand(Idx));
+ }
+ // Switch the order of the weights
+ Ops.push_back(ProfileData->getOperand(SecondIdx));
+ Ops.push_back(ProfileData->getOperand(FirstIdx));
setMetadata(LLVMContext::MD_prof,
MDNode::get(ProfileData->getContext(), Ops));
}
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 1213f07..de369bd 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -5199,7 +5199,11 @@ void SwitchInstProfUpdateWrapper::init() {
if (!ProfileData)
return;
- if (ProfileData->getNumOperands() != SI.getNumSuccessors() + 1) {
+ // FIXME: This check belongs in ProfDataUtils. Its almost equivalent to
+ // getValidBranchWeightMDNode(), but the need to use llvm_unreachable
+ // makes them slightly different.
+ if (ProfileData->getNumOperands() !=
+ SI.getNumSuccessors() + getBranchWeightOffset(ProfileData)) {
llvm_unreachable("number of prof branch_weights metadata operands does "
"not correspond to number of succesors");
}
diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp
index bd68db3..0000277 100644
--- a/llvm/lib/IR/MDBuilder.cpp
+++ b/llvm/lib/IR/MDBuilder.cpp
@@ -35,8 +35,8 @@ MDNode *MDBuilder::createFPMath(float Accuracy) {
}
MDNode *MDBuilder::createBranchWeights(uint32_t TrueWeight,
- uint32_t FalseWeight) {
- return createBranchWeights({TrueWeight, FalseWeight});
+ uint32_t FalseWeight, bool IsExpected) {
+ return createBranchWeights({TrueWeight, FalseWeight}, IsExpected);
}
MDNode *MDBuilder::createLikelyBranchWeights() {
@@ -49,15 +49,19 @@ MDNode *MDBuilder::createUnlikelyBranchWeights() {
return createBranchWeights(1, (1U << 20) - 1);
}
-MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights) {
+MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights,
+ bool IsExpected) {
assert(Weights.size() >= 1 && "Need at least one branch weights!");
- SmallVector<Metadata *, 4> Vals(Weights.size() + 1);
+ unsigned int Offset = IsExpected ? 2 : 1;
+ SmallVector<Metadata *, 4> Vals(Weights.size() + Offset);
Vals[0] = createString("branch_weights");
+ if (IsExpected)
+ Vals[1] = createString("expected");
Type *Int32Ty = Type::getInt32Ty(Context);
for (unsigned i = 0, e = Weights.size(); i != e; ++i)
- Vals[i + 1] = createConstant(ConstantInt::get(Int32Ty, Weights[i]));
+ Vals[i + Offset] = createConstant(ConstantInt::get(Int32Ty, Weights[i]));
return MDNode::get(Context, Vals);
}
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index b6c9324..5f42ce2 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1196,10 +1196,10 @@ MDNode *MDNode::mergeDirectCallProfMetadata(MDNode *A, MDNode *B,
StringRef AProfName = AMDS->getString();
StringRef BProfName = BMDS->getString();
if (AProfName == "branch_weights" && BProfName == "branch_weights") {
- ConstantInt *AInstrWeight =
- mdconst::dyn_extract<ConstantInt>(A->getOperand(1));
- ConstantInt *BInstrWeight =
- mdconst::dyn_extract<ConstantInt>(B->getOperand(1));
+ ConstantInt *AInstrWeight = mdconst::dyn_extract<ConstantInt>(
+ A->getOperand(getBranchWeightOffset(A)));
+ ConstantInt *BInstrWeight = mdconst::dyn_extract<ConstantInt>(
+ B->getOperand(getBranchWeightOffset(B)));
assert(AInstrWeight && BInstrWeight && "verified by LLVM verifier");
return MDNode::get(Ctx,
{MDHelper.createString("branch_weights"),
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index f97dd18..55c282c 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -54,6 +54,8 @@
using namespace llvm;
+extern cl::opt<bool> UseNewDbgInfoFormat;
+
//===----------------------------------------------------------------------===//
// Methods to implement the globals and functions lists.
//
@@ -72,7 +74,7 @@ template class llvm::SymbolTableListTraits<GlobalIFunc>;
Module::Module(StringRef MID, LLVMContext &C)
: Context(C), ValSymTab(std::make_unique<ValueSymbolTable>(-1)),
ModuleID(std::string(MID)), SourceFileName(std::string(MID)), DL(""),
- IsNewDbgInfoFormat(false) {
+ IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
Context.addModule(this);
}
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index 51e78dc..c4b1ed5 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -40,9 +40,6 @@ namespace {
// We maintain some constants here to ensure that we access the branch weights
// correctly, and can change the behavior in the future if the layout changes
-// The index at which the weights vector starts
-constexpr unsigned WeightsIdx = 1;
-
// the minimum number of operands for MD_prof nodes with branch weights
constexpr unsigned MinBWOps = 3;
@@ -75,6 +72,7 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData,
assert(isBranchWeightMD(ProfileData) && "wrong metadata");
unsigned NOps = ProfileData->getNumOperands();
+ unsigned WeightsIdx = getBranchWeightOffset(ProfileData);
assert(WeightsIdx < NOps && "Weights Index must be less than NOps.");
Weights.resize(NOps - WeightsIdx);
@@ -82,8 +80,8 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData,
ConstantInt *Weight =
mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(Idx));
assert(Weight && "Malformed branch_weight in MD_prof node");
- assert(Weight->getValue().getActiveBits() <= 32 &&
- "Too many bits for uint32_t");
+ assert(Weight->getValue().getActiveBits() <= (sizeof(T) * 8) &&
+ "Too many bits for MD_prof branch_weight");
Weights[Idx - WeightsIdx] = Weight->getZExtValue();
}
}
@@ -123,6 +121,26 @@ bool hasValidBranchWeightMD(const Instruction &I) {
return getValidBranchWeightMDNode(I);
}
+bool hasBranchWeightOrigin(const Instruction &I) {
+ auto *ProfileData = I.getMetadata(LLVMContext::MD_prof);
+ return hasBranchWeightOrigin(ProfileData);
+}
+
+bool hasBranchWeightOrigin(const MDNode *ProfileData) {
+ if (!isBranchWeightMD(ProfileData))
+ return false;
+ auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(1));
+ // NOTE: if we ever have more types of branch weight provenance,
+ // we need to check the string value is "expected". For now, we
+ // supply a more generic API, and avoid the spurious comparisons.
+ assert(ProfDataName == nullptr || ProfDataName->getString() == "expected");
+ return ProfDataName != nullptr;
+}
+
+unsigned getBranchWeightOffset(const MDNode *ProfileData) {
+ return hasBranchWeightOrigin(ProfileData) ? 2 : 1;
+}
+
MDNode *getBranchWeightMDNode(const Instruction &I) {
auto *ProfileData = I.getMetadata(LLVMContext::MD_prof);
if (!isBranchWeightMD(ProfileData))
@@ -132,7 +150,9 @@ MDNode *getBranchWeightMDNode(const Instruction &I) {
MDNode *getValidBranchWeightMDNode(const Instruction &I) {
auto *ProfileData = getBranchWeightMDNode(I);
- if (ProfileData && ProfileData->getNumOperands() == 1 + I.getNumSuccessors())
+ auto Offset = getBranchWeightOffset(ProfileData);
+ if (ProfileData &&
+ ProfileData->getNumOperands() == Offset + I.getNumSuccessors())
return ProfileData;
return nullptr;
}
@@ -191,7 +211,8 @@ bool extractProfTotalWeight(const MDNode *ProfileData, uint64_t &TotalVal) {
return false;
if (ProfDataName->getString() == "branch_weights") {
- for (unsigned Idx = 1; Idx < ProfileData->getNumOperands(); Idx++) {
+ unsigned Offset = getBranchWeightOffset(ProfileData);
+ for (unsigned Idx = Offset; Idx < ProfileData->getNumOperands(); ++Idx) {
auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(Idx));
assert(V && "Malformed branch_weight in MD_prof node");
TotalVal += V->getValue().getZExtValue();
@@ -212,9 +233,10 @@ bool extractProfTotalWeight(const Instruction &I, uint64_t &TotalVal) {
return extractProfTotalWeight(I.getMetadata(LLVMContext::MD_prof), TotalVal);
}
-void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights) {
+void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights,
+ bool IsExpected) {
MDBuilder MDB(I.getContext());
- MDNode *BranchWeights = MDB.createBranchWeights(Weights);
+ MDNode *BranchWeights = MDB.createBranchWeights(Weights, IsExpected);
I.setMetadata(LLVMContext::MD_prof, BranchWeights);
}
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index e592720..fe2253d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -104,6 +104,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
@@ -4808,8 +4809,10 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) {
// Check consistency of !prof branch_weights metadata.
if (ProfName == "branch_weights") {
+ unsigned int Offset = getBranchWeightOffset(MD);
if (isa<InvokeInst>(&I)) {
- Check(MD->getNumOperands() == 2 || MD->getNumOperands() == 3,
+ Check(MD->getNumOperands() == (1 + Offset) ||
+ MD->getNumOperands() == (2 + Offset),
"Wrong number of InvokeInst branch_weights operands", MD);
} else {
unsigned ExpectedNumOperands = 0;
@@ -4829,10 +4832,10 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) {
CheckFailed("!prof branch_weights are not allowed for this instruction",
MD);
- Check(MD->getNumOperands() == 1 + ExpectedNumOperands,
+ Check(MD->getNumOperands() == Offset + ExpectedNumOperands,
"Wrong number of operands", MD);
}
- for (unsigned i = 1; i < MD->getNumOperands(); ++i) {
+ for (unsigned i = Offset; i < MD->getNumOperands(); ++i) {
auto &MDO = MD->getOperand(i);
Check(MDO, "second operand should not be null", MD);
Check(mdconst::dyn_extract<ConstantInt>(MDO),
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index ad30b5c..a7157e7 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -66,7 +66,6 @@ STATISTIC(EmittedFillFragments,
STATISTIC(EmittedNopsFragments, "Number of emitted assembler fragments - nops");
STATISTIC(EmittedOrgFragments, "Number of emitted assembler fragments - org");
STATISTIC(evaluateFixup, "Number of evaluated fixups");
-STATISTIC(FragmentLayouts, "Number of fragment layouts");
STATISTIC(ObjectBytes, "Number of emitted object file bytes");
STATISTIC(RelaxationSteps, "Number of assembler layout and relaxation steps");
STATISTIC(RelaxedInstructions, "Number of relaxed instructions");
@@ -404,29 +403,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
llvm_unreachable("invalid fragment kind");
}
-void MCAsmLayout::layoutFragment(MCFragment *F) {
- MCFragment *Prev = F->getPrevNode();
-
- // We should never try to recompute something which is valid.
- assert(!isFragmentValid(F) && "Attempt to recompute a valid fragment!");
- // We should never try to compute the fragment layout if its predecessor
- // isn't valid.
- assert((!Prev || isFragmentValid(Prev)) &&
- "Attempt to compute fragment before its predecessor!");
-
- assert(!F->IsBeingLaidOut && "Already being laid out!");
- F->IsBeingLaidOut = true;
-
- ++stats::FragmentLayouts;
-
- // Compute fragment offset and size.
- if (Prev)
- F->Offset = Prev->Offset + getAssembler().computeFragmentSize(*this, *Prev);
- else
- F->Offset = 0;
- F->IsBeingLaidOut = false;
- LastValidFragment[F->getParent()] = F;
-
+void MCAsmLayout::layoutBundle(MCFragment *F) {
// If bundling is enabled and this fragment has instructions in it, it has to
// obey the bundling restrictions. With padding, we'll have:
//
@@ -454,21 +431,40 @@ void MCAsmLayout::layoutFragment(MCFragment *F) {
// within-fragment padding (which would produce less padding when N is less
// than the bundle size), but for now we don't.
//
- if (Assembler.isBundlingEnabled() && F->hasInstructions()) {
- assert(isa<MCEncodedFragment>(F) &&
- "Only MCEncodedFragment implementations have instructions");
- MCEncodedFragment *EF = cast<MCEncodedFragment>(F);
- uint64_t FSize = Assembler.computeFragmentSize(*this, *EF);
-
- if (!Assembler.getRelaxAll() && FSize > Assembler.getBundleAlignSize())
- report_fatal_error("Fragment can't be larger than a bundle size");
-
- uint64_t RequiredBundlePadding =
- computeBundlePadding(Assembler, EF, EF->Offset, FSize);
- if (RequiredBundlePadding > UINT8_MAX)
- report_fatal_error("Padding cannot exceed 255 bytes");
- EF->setBundlePadding(static_cast<uint8_t>(RequiredBundlePadding));
- EF->Offset += RequiredBundlePadding;
+ assert(isa<MCEncodedFragment>(F) &&
+ "Only MCEncodedFragment implementations have instructions");
+ MCEncodedFragment *EF = cast<MCEncodedFragment>(F);
+ uint64_t FSize = Assembler.computeFragmentSize(*this, *EF);
+
+ if (!Assembler.getRelaxAll() && FSize > Assembler.getBundleAlignSize())
+ report_fatal_error("Fragment can't be larger than a bundle size");
+
+ uint64_t RequiredBundlePadding =
+ computeBundlePadding(Assembler, EF, EF->Offset, FSize);
+ if (RequiredBundlePadding > UINT8_MAX)
+ report_fatal_error("Padding cannot exceed 255 bytes");
+ EF->setBundlePadding(static_cast<uint8_t>(RequiredBundlePadding));
+ EF->Offset += RequiredBundlePadding;
+}
+
+uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const {
+ ensureValid(F);
+ return F->Offset;
+}
+
+void MCAsmLayout::ensureValid(const MCFragment *Frag) const {
+ MCSection &Sec = *Frag->getParent();
+ if (Sec.hasLayout())
+ return;
+ Sec.setHasLayout(true);
+ uint64_t Offset = 0;
+ for (MCFragment &F : Sec) {
+ F.Offset = Offset;
+ if (Assembler.isBundlingEnabled() && F.hasInstructions()) {
+ const_cast<MCAsmLayout *>(this)->layoutBundle(&F);
+ Offset = F.Offset;
+ }
+ Offset += getAssembler().computeFragmentSize(*this, F);
}
}
@@ -848,7 +844,7 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
// another. If any fragment has changed size, we have to re-layout (and
// as a result possibly further relax) all.
for (MCSection &Sec : *this)
- Layout.invalidateFragmentsFrom(&*Sec.begin());
+ Sec.setHasLayout(false);
}
DEBUG_WITH_TYPE("mc-dump", {
@@ -1098,9 +1094,11 @@ bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout,
uint64_t AlignedOffset = Layout.getFragmentOffset(&BF);
uint64_t AlignedSize = 0;
- for (const MCFragment *F = BF.getLastFragment(); F != &BF;
- F = F->getPrevNode())
+ for (const MCFragment *F = BF.getNextNode();; F = F->getNextNode()) {
AlignedSize += computeFragmentSize(Layout, *F);
+ if (F == BF.getLastFragment())
+ break;
+ }
Align BoundaryAlignment = BF.getAlignment();
uint64_t NewSize = needPadding(AlignedOffset, AlignedSize, BoundaryAlignment)
@@ -1109,7 +1107,6 @@ bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout,
if (NewSize == BF.getSize())
return false;
BF.setSize(NewSize);
- Layout.invalidateFragmentsFrom(&BF);
return true;
}
@@ -1219,47 +1216,19 @@ bool MCAssembler::relaxFragment(MCAsmLayout &Layout, MCFragment &F) {
}
}
-bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec) {
- // Holds the first fragment which needed relaxing during this layout. It will
- // remain NULL if none were relaxed.
- // When a fragment is relaxed, all the fragments following it should get
- // invalidated because their offset is going to change.
- MCFragment *FirstRelaxedFragment = nullptr;
-
- // Attempt to relax all the fragments in the section.
- for (MCFragment &Frag : Sec) {
- // Check if this is a fragment that needs relaxation.
- bool RelaxedFrag = relaxFragment(Layout, Frag);
- if (RelaxedFrag && !FirstRelaxedFragment)
- FirstRelaxedFragment = &Frag;
- }
- if (FirstRelaxedFragment) {
- Layout.invalidateFragmentsFrom(FirstRelaxedFragment);
- return true;
- }
- return false;
-}
-
bool MCAssembler::layoutOnce(MCAsmLayout &Layout) {
++stats::RelaxationSteps;
- bool WasRelaxed = false;
- for (MCSection &Sec : *this) {
- while (layoutSectionOnce(Layout, Sec))
- WasRelaxed = true;
- }
-
- return WasRelaxed;
+ bool Changed = false;
+ for (MCSection &Sec : *this)
+ for (MCFragment &Frag : Sec)
+ if (relaxFragment(Layout, Frag))
+ Changed = true;
+ return Changed;
}
void MCAssembler::finishLayout(MCAsmLayout &Layout) {
assert(getBackendPtr() && "Expected assembler backend");
- // The layout is done. Mark every fragment as valid.
- for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
- MCSection &Section = *Layout.getSectionOrder()[i];
- Layout.getFragmentOffset(&*Section.getFragmentList().rbegin());
- computeFragmentSize(Layout, *Section.getFragmentList().rbegin());
- }
getBackend().finishLayout(*this, Layout);
}
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index f027eb6..771ca9c 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -498,7 +498,7 @@ MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
R, LinkedToSym);
auto *F = new MCDataFragment();
- Ret->getFragmentList().insert(Ret->begin(), F);
+ Ret->addFragment(*F);
F->setParent(Ret);
R->setFragment(F);
@@ -772,7 +772,7 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind,
Entry.second = Result;
auto *F = new MCDataFragment();
- Result->getFragmentList().insert(Result->begin(), F);
+ Result->addFragment(*F);
F->setParent(Result);
Begin->setFragment(F);
@@ -838,7 +838,7 @@ MCSectionXCOFF *MCContext::getXCOFFSection(
Entry.second = Result;
auto *F = new MCDataFragment();
- Result->getFragmentList().insert(Result->begin(), F);
+ Result->addFragment(*F);
F->setParent(Result);
if (Begin)
@@ -861,7 +861,7 @@ MCSectionSPIRV *MCContext::getSPIRVSection() {
MCSectionSPIRV(SectionKind::getText(), Begin);
auto *F = new MCDataFragment();
- Result->getFragmentList().insert(Result->begin(), F);
+ Result->addFragment(*F);
F->setParent(Result);
return Result;
@@ -884,7 +884,7 @@ MCSectionDXContainer *MCContext::getDXContainerSection(StringRef Section,
// The first fragment will store the header
auto *F = new MCDataFragment();
- MapIt->second->getFragmentList().insert(MapIt->second->begin(), F);
+ MapIt->second->addFragment(*F);
F->setParent(MapIt->second);
return MapIt->second;
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index b065d03..b70ac86 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -645,10 +645,6 @@ static void AttemptToFoldSymbolOffsetDifference(
Addend += SA.getOffset() - SB.getOffset();
return FinalizeFolding();
}
- // One of the symbol involved is part of a fragment being laid out. Quit now
- // to avoid a self loop.
- if (!Layout->canGetFragmentOffset(FA) || !Layout->canGetFragmentOffset(FB))
- return;
// Eagerly evaluate when layout is finalized.
Addend += Layout->getSymbolOffset(A->getSymbol()) -
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index a8da46d..84a5871 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -39,64 +39,8 @@ MCAsmLayout::MCAsmLayout(MCAssembler &Asm) : Assembler(Asm) {
SectionOrder.push_back(&Sec);
}
-bool MCAsmLayout::isFragmentValid(const MCFragment *F) const {
- const MCSection *Sec = F->getParent();
- const MCFragment *LastValid = LastValidFragment.lookup(Sec);
- if (!LastValid)
- return false;
- assert(LastValid->getParent() == Sec);
- return F->getLayoutOrder() <= LastValid->getLayoutOrder();
-}
-
-bool MCAsmLayout::canGetFragmentOffset(const MCFragment *F) const {
- MCSection *Sec = F->getParent();
- MCSection::iterator I;
- if (MCFragment *LastValid = LastValidFragment[Sec]) {
- // Fragment already valid, offset is available.
- if (F->getLayoutOrder() <= LastValid->getLayoutOrder())
- return true;
- I = ++MCSection::iterator(LastValid);
- } else
- I = Sec->begin();
-
- // A fragment ordered before F is currently being laid out.
- const MCFragment *FirstInvalidFragment = &*I;
- if (FirstInvalidFragment->IsBeingLaidOut)
- return false;
-
- return true;
-}
-
void MCAsmLayout::invalidateFragmentsFrom(MCFragment *F) {
- // If this fragment wasn't already valid, we don't need to do anything.
- if (!isFragmentValid(F))
- return;
-
- // Otherwise, reset the last valid fragment to the previous fragment
- // (if this is the first fragment, it will be NULL).
- LastValidFragment[F->getParent()] = F->getPrevNode();
-}
-
-void MCAsmLayout::ensureValid(const MCFragment *F) const {
- MCSection *Sec = F->getParent();
- MCSection::iterator I;
- if (MCFragment *Cur = LastValidFragment[Sec])
- I = ++MCSection::iterator(Cur);
- else
- I = Sec->begin();
-
- // Advance the layout position until the fragment is valid.
- while (!isFragmentValid(F)) {
- assert(I != Sec->end() && "Layout bookkeeping error");
- const_cast<MCAsmLayout *>(this)->layoutFragment(&*I);
- ++I;
- }
-}
-
-uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const {
- ensureValid(F);
- assert(F->Offset != ~UINT64_C(0) && "Address not set!");
- return F->Offset;
+ F->getParent()->setHasLayout(false);
}
// Simple getSymbolOffset helper for the non-variable case.
@@ -258,9 +202,9 @@ void ilist_alloc_traits<MCFragment>::deleteNode(MCFragment *V) { V->destroy(); }
MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
MCSection *Parent)
: Parent(Parent), Atom(nullptr), Offset(~UINT64_C(0)), LayoutOrder(0),
- Kind(Kind), IsBeingLaidOut(false), HasInstructions(HasInstructions) {
+ Kind(Kind), HasInstructions(HasInstructions) {
if (Parent && !isa<MCDummyFragment>(*this))
- Parent->getFragmentList().push_back(this);
+ Parent->addFragment(*this);
}
void MCFragment::destroy() {
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index ea3baf9..12e69f7 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -23,8 +23,8 @@ using namespace llvm;
MCSection::MCSection(SectionVariant V, StringRef Name, SectionKind K,
MCSymbol *Begin)
: Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false),
- IsRegistered(false), DummyFragment(this), Name(Name), Variant(V),
- Kind(K) {}
+ HasLayout(false), IsRegistered(false), DummyFragment(this), Name(Name),
+ Variant(V), Kind(K) {}
MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
if (!End)
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 7758363..e18ce5d3 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -91,12 +91,14 @@ static Error initializeReader(InstrProfReader &Reader) {
/// associated endian format to read the binary ids correctly.
static Error
readBinaryIdsInternal(const MemoryBuffer &DataBuffer,
- const uint64_t BinaryIdsSize,
- const uint8_t *BinaryIdsStart,
+ ArrayRef<uint8_t> BinaryIdsBuffer,
std::vector<llvm::object::BuildID> &BinaryIds,
const llvm::endianness Endian) {
using namespace support;
+ const uint64_t BinaryIdsSize = BinaryIdsBuffer.size();
+ const uint8_t *BinaryIdsStart = BinaryIdsBuffer.data();
+
if (BinaryIdsSize == 0)
return Error::success();
@@ -113,12 +115,7 @@ readBinaryIdsInternal(const MemoryBuffer &DataBuffer,
instrprof_error::malformed,
"not enough data to read binary id length");
- uint64_t BILen = 0;
- if (Endian == llvm::endianness::little)
- BILen = endian::readNext<uint64_t, llvm::endianness::little>(BI);
- else
- BILen = endian::readNext<uint64_t, llvm::endianness::big>(BI);
-
+ uint64_t BILen = endian::readNext<uint64_t>(BI, Endian);
if (BILen == 0)
return make_error<InstrProfError>(instrprof_error::malformed,
"binary id length is 0");
@@ -143,13 +140,12 @@ readBinaryIdsInternal(const MemoryBuffer &DataBuffer,
return Error::success();
}
-static void
-printBinaryIdsInternal(raw_ostream &OS,
- std::vector<llvm::object::BuildID> &BinaryIds) {
+static void printBinaryIdsInternal(raw_ostream &OS,
+ ArrayRef<llvm::object::BuildID> BinaryIds) {
OS << "Binary IDs: \n";
- for (auto BI : BinaryIds) {
- for (uint64_t I = 0; I < BI.size(); I++)
- OS << format("%02x", BI[I]);
+ for (const auto &BI : BinaryIds) {
+ for (auto I : BI)
+ OS << format("%02x", I);
OS << "\n";
}
}
@@ -590,10 +586,10 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
const uint8_t *BufferEnd = (const uint8_t *)DataBuffer->getBufferEnd();
if (BinaryIdSize % sizeof(uint64_t) || BinaryIdEnd > BufferEnd)
return error(instrprof_error::bad_header);
- if (BinaryIdSize != 0) {
- if (Error Err =
- readBinaryIdsInternal(*DataBuffer, BinaryIdSize, BinaryIdStart,
- BinaryIds, getDataEndianness()))
+ ArrayRef<uint8_t> BinaryIdsBuffer(BinaryIdStart, BinaryIdSize);
+ if (!BinaryIdsBuffer.empty()) {
+ if (Error Err = readBinaryIdsInternal(*DataBuffer, BinaryIdsBuffer,
+ BinaryIds, getDataEndianness()))
return Err;
}
@@ -1389,13 +1385,13 @@ Error IndexedInstrProfReader::readHeader() {
if (Header->getIndexedProfileVersion() >= 9) {
const unsigned char *Ptr = Start + Header->BinaryIdOffset;
// Read binary ids size.
- BinaryIdsSize =
+ uint64_t BinaryIdsSize =
support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
if (BinaryIdsSize % sizeof(uint64_t))
return error(instrprof_error::bad_header);
// Set the binary ids start.
- BinaryIdsStart = Ptr;
- if (BinaryIdsStart > (const unsigned char *)DataBuffer->getBufferEnd())
+ BinaryIdsBuffer = ArrayRef<uint8_t>(Ptr, BinaryIdsSize);
+ if (Ptr > (const unsigned char *)DataBuffer->getBufferEnd())
return make_error<InstrProfError>(instrprof_error::malformed,
"corrupted binary ids");
}
@@ -1403,14 +1399,16 @@ Error IndexedInstrProfReader::readHeader() {
if (Header->getIndexedProfileVersion() >= 12) {
const unsigned char *Ptr = Start + Header->VTableNamesOffset;
- CompressedVTableNamesLen =
+ uint64_t CompressedVTableNamesLen =
support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
// Writer first writes the length of compressed string, and then the actual
// content.
- VTableNamePtr = (const char *)Ptr;
+ const char *VTableNamePtr = (const char *)Ptr;
if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd())
return make_error<InstrProfError>(instrprof_error::truncated);
+
+ VTableName = StringRef(VTableNamePtr, CompressedVTableNamesLen);
}
if (Header->getIndexedProfileVersion() >= 10 &&
@@ -1466,8 +1464,7 @@ InstrProfSymtab &IndexedInstrProfReader::getSymtab() {
auto NewSymtab = std::make_unique<InstrProfSymtab>();
- if (Error E = NewSymtab->initVTableNamesFromCompressedStrings(
- StringRef(VTableNamePtr, CompressedVTableNamesLen))) {
+ if (Error E = NewSymtab->initVTableNamesFromCompressedStrings(VTableName)) {
auto [ErrCode, Msg] = InstrProfError::take(std::move(E));
consumeError(error(ErrCode, Msg));
}
@@ -1507,7 +1504,7 @@ Expected<InstrProfRecord> IndexedInstrProfReader::getInstrProfRecord(
// A flag to indicate if the records are from the same type
// of profile (i.e cs vs nocs).
bool CSBitMatch = false;
- auto getFuncSum = [](const std::vector<uint64_t> &Counts) {
+ auto getFuncSum = [](ArrayRef<uint64_t> Counts) {
uint64_t ValueSum = 0;
for (uint64_t CountValue : Counts) {
if (CountValue == (uint64_t)-1)
@@ -1696,8 +1693,8 @@ Error IndexedInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) {
Error IndexedInstrProfReader::readBinaryIds(
std::vector<llvm::object::BuildID> &BinaryIds) {
- return readBinaryIdsInternal(*DataBuffer, BinaryIdsSize, BinaryIdsStart,
- BinaryIds, llvm::endianness::little);
+ return readBinaryIdsInternal(*DataBuffer, BinaryIdsBuffer, BinaryIds,
+ llvm::endianness::little);
}
Error IndexedInstrProfReader::printBinaryIds(raw_ostream &OS) {
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 7d7c980..1a9add1 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -494,17 +494,40 @@ static uint64_t writeMemProfFrames(
static llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
writeMemProfFrameArray(
ProfOStream &OS,
- llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
+ llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData,
+ llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
// Mappings from FrameIds to array indexes.
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes;
- // Sort the FrameIDs for stability.
+ // Compute the order in which we serialize Frames. The order does not matter
+ // in terms of correctness, but we still compute it for deserialization
+ // performance. Specifically, if we serialize frequently used Frames one
+ // after another, we have better cache utilization. For two Frames that
+ // appear equally frequently, we break a tie by serializing the one that tends
+ // to appear earlier in call stacks. We implement the tie-breaking mechanism
+ // by computing the sum of indexes within call stacks for each Frame. If we
+ // still have a tie, then we just resort to compare two FrameIds, which is
+ // just for stability of output.
std::vector<std::pair<memprof::FrameId, const memprof::Frame *>> FrameIdOrder;
FrameIdOrder.reserve(MemProfFrameData.size());
for (const auto &[Id, Frame] : MemProfFrameData)
FrameIdOrder.emplace_back(Id, &Frame);
assert(MemProfFrameData.size() == FrameIdOrder.size());
- llvm::sort(FrameIdOrder);
+ llvm::sort(FrameIdOrder,
+ [&](const std::pair<memprof::FrameId, const memprof::Frame *> &L,
+ const std::pair<memprof::FrameId, const memprof::Frame *> &R) {
+ const auto &SL = FrameHistogram[L.first];
+ const auto &SR = FrameHistogram[R.first];
+ // Popular FrameIds should come first.
+ if (SL.Count != SR.Count)
+ return SL.Count > SR.Count;
+ // If they are equally popular, then the one that tends to appear
+ // earlier in call stacks should come first.
+ if (SL.PositionSum != SR.PositionSum)
+ return SL.PositionSum < SR.PositionSum;
+ // Compare their FrameIds for sort stability.
+ return L.first < R.first;
+ });
// Serialize all frames while creating mappings from linear IDs to FrameIds.
uint64_t Index = 0;
@@ -543,12 +566,14 @@ writeMemProfCallStackArray(
llvm::MapVector<memprof::CallStackId, llvm::SmallVector<memprof::FrameId>>
&MemProfCallStackData,
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
- &MemProfFrameIndexes) {
+ &MemProfFrameIndexes,
+ llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
MemProfCallStackIndexes;
memprof::CallStackRadixTreeBuilder Builder;
- Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+ Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+ FrameHistogram);
for (auto I : Builder.getRadixArray())
OS.write32(I);
MemProfCallStackIndexes = Builder.takeCallStackPos();
@@ -704,13 +729,17 @@ static Error writeMemProfV3(ProfOStream &OS,
Schema = memprof::getFullSchema();
writeMemProfSchema(OS, Schema);
+ llvm::DenseMap<memprof::FrameId, memprof::FrameStat> FrameHistogram =
+ memprof::computeFrameHistogram(MemProfData.CallStackData);
+ assert(MemProfData.FrameData.size() == FrameHistogram.size());
+
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes =
- writeMemProfFrameArray(OS, MemProfData.FrameData);
+ writeMemProfFrameArray(OS, MemProfData.FrameData, FrameHistogram);
uint64_t CallStackPayloadOffset = OS.tell();
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
MemProfCallStackIndexes = writeMemProfCallStackArray(
- OS, MemProfData.CallStackData, MemProfFrameIndexes);
+ OS, MemProfData.CallStackData, MemProfFrameIndexes, FrameHistogram);
uint64_t RecordPayloadOffset = OS.tell();
uint64_t RecordTableOffset =
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index 620e2e2..4ca8687 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -343,15 +343,15 @@ MemProfRecord IndexedMemProfRecord::toMemProfRecord(
MemProfRecord Record;
Record.AllocSites.reserve(AllocSites.size());
- for (const memprof::IndexedAllocationInfo &IndexedAI : AllocSites) {
- memprof::AllocationInfo AI;
+ for (const IndexedAllocationInfo &IndexedAI : AllocSites) {
+ AllocationInfo AI;
AI.Info = IndexedAI.Info;
AI.CallStack = Callback(IndexedAI.CSId);
Record.AllocSites.push_back(std::move(AI));
}
Record.CallSites.reserve(CallSiteIds.size());
- for (memprof::CallStackId CSId : CallSiteIds)
+ for (CallStackId CSId : CallSiteIds)
Record.CallSites.push_back(Callback(CSId));
return Record;
@@ -486,7 +486,8 @@ LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
void CallStackRadixTreeBuilder::build(
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
&&MemProfCallStackData,
- const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
+ const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
+ llvm::DenseMap<FrameId, FrameStat> &FrameHistogram) {
// Take the vector portion of MemProfCallStackData. The vector is exactly
// what we need to sort. Also, we no longer need its lookup capability.
llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();
@@ -498,14 +499,56 @@ void CallStackRadixTreeBuilder::build(
return;
}
- // Sort the list of call stacks in the dictionary order to maximize the length
- // of the common prefix between two adjacent call stacks.
+ // Sorting the list of call stacks in the dictionary order is sufficient to
+ // maximize the length of the common prefix between two adjacent call stacks
+ // and thus minimize the length of RadixArray. However, we go one step
+ // further and try to reduce the number of times we follow pointers to parents
+ // during deserilization. Consider a poorly encoded radix tree:
+ //
+ // CallStackId 1: f1 -> f2 -> f3
+ // |
+ // CallStackId 2: +--- f4 -> f5
+ // |
+ // CallStackId 3: +--> f6
+ //
+ // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
+ // Once we encode CallStackId 1 into RadixArray, every other call stack with
+ // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
+ // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
+ // parents twice.
+ //
+ // We try to alleviate the situation by sorting the list of call stacks by
+ // comparing the popularity of frames rather than the integer values of
+ // FrameIds. In the example above, f4 is more popular than f2, so we sort the
+ // call stacks and encode them as:
+ //
+ // CallStackId 2: f1 -- f4 -> f5
+ // | |
+ // CallStackId 3: | +--> f6
+ // |
+ // CallStackId 1: +--> f2 -> f3
+ //
+ // Notice that CallStackId 3 follows a pointer to a parent only once.
+ //
+ // All this is a quick-n-dirty trick to reduce the number of jumps. The
+ // proper way would be to compute the weight of each radix tree node -- how
+ // many call stacks use a given radix tree node, and encode a radix tree from
+ // the heaviest node first. We do not do so because that's a lot of work.
llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
// Call stacks are stored from leaf to root. Perform comparisons from the
// root.
return std::lexicographical_compare(
L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
- [&](FrameId F1, FrameId F2) { return F1 < F2; });
+ [&](FrameId F1, FrameId F2) {
+ uint64_t H1 = FrameHistogram[F1].Count;
+ uint64_t H2 = FrameHistogram[F2].Count;
+ // Popular frames should come later because we encode call stacks from
+ // the last one in the list.
+ if (H1 != H2)
+ return H1 < H2;
+ // For sort stability.
+ return F1 < F2;
+ });
});
// Reserve some reasonable amount of storage.
@@ -569,6 +612,22 @@ void CallStackRadixTreeBuilder::build(
V = RadixArray.size() - 1 - V;
}
+llvm::DenseMap<FrameId, FrameStat>
+computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+ &MemProfCallStackData) {
+ llvm::DenseMap<FrameId, FrameStat> Histogram;
+
+ for (const auto &KV : MemProfCallStackData) {
+ const auto &CS = KV.second;
+ for (unsigned I = 0, E = CS.size(); I != E; ++I) {
+ auto &S = Histogram[CS[I]];
+ ++S.Count;
+ S.PositionSum += I;
+ }
+ }
+ return Histogram;
+}
+
void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
for (const auto &AS : Record.AllocSites) {
assert(AS.CSId == hashCallStack(AS.CallStack));
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index 625e523..1630fef 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -811,8 +811,7 @@ std::error_code SampleProfileWriterBinary::writeSummary() {
encodeULEB128(Summary->getMaxFunctionCount(), OS);
encodeULEB128(Summary->getNumCounts(), OS);
encodeULEB128(Summary->getNumFunctions(), OS);
- const std::vector<ProfileSummaryEntry> &Entries =
- Summary->getDetailedSummary();
+ ArrayRef<ProfileSummaryEntry> Entries = Summary->getDetailedSummary();
encodeULEB128(Entries.size(), OS);
for (auto Entry : Entries) {
encodeULEB128(Entry.Cutoff, OS);
diff --git a/llvm/lib/Support/CodeGenCoverage.cpp b/llvm/lib/Support/CodeGenCoverage.cpp
index 4d41c42..2e35019 100644
--- a/llvm/lib/Support/CodeGenCoverage.cpp
+++ b/llvm/lib/Support/CodeGenCoverage.cpp
@@ -21,8 +21,6 @@
using namespace llvm;
-static sys::SmartMutex<true> OutputMutex;
-
CodeGenCoverage::CodeGenCoverage() = default;
void CodeGenCoverage::setCovered(uint64_t RuleID) {
@@ -79,6 +77,7 @@ bool CodeGenCoverage::parse(MemoryBuffer &Buffer, StringRef BackendName) {
bool CodeGenCoverage::emit(StringRef CoveragePrefix,
StringRef BackendName) const {
if (!CoveragePrefix.empty() && !RuleCoverage.empty()) {
+ static sys::SmartMutex<true> OutputMutex;
sys::SmartScopedLock<true> Lock(OutputMutex);
// We can handle locking within a process easily enough but we don't want to
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 48bf648..87d737d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17720,6 +17720,47 @@ static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
}
+// Transform vector add(zext i8 to i32, zext i8 to i32)
+// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
+// This allows extra uses of saddl/uaddl at the lower vector widths, and less
+// extends.
+static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
+ (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
+ (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
+ N->getOperand(0).getOperand(0).getValueType() !=
+ N->getOperand(1).getOperand(0).getValueType())
+ return SDValue();
+
+ if (N->getOpcode() == ISD::MUL &&
+ N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0).getOperand(0);
+ SDValue N1 = N->getOperand(1).getOperand(0);
+ EVT InVT = N0.getValueType();
+
+ EVT S1 = InVT.getScalarType();
+ EVT S2 = VT.getScalarType();
+ if ((S2 == MVT::i32 && S1 == MVT::i8) ||
+ (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
+ SDLoc DL(N);
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
+ S2.getHalfSizedIntegerVT(*DAG.getContext()),
+ VT.getVectorElementCount());
+ SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
+ SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
+ SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
+ return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
+ : (unsigned)ISD::SIGN_EXTEND,
+ DL, VT, NewOp);
+ }
+ return SDValue();
+}
+
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -17728,6 +17769,8 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
return Ext;
if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
return Ext;
+ if (SDValue Ext = performVectorExtCombine(N, DAG))
+ return Ext;
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -19604,41 +19647,6 @@ static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
}
-// Transform vector add(zext i8 to i32, zext i8 to i32)
-// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
-// This allows extra uses of saddl/uaddl at the lower vector widths, and less
-// extends.
-static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
- if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
- (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
- N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
- (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
- N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
- N->getOperand(0).getOperand(0).getValueType() !=
- N->getOperand(1).getOperand(0).getValueType())
- return SDValue();
-
- SDValue N0 = N->getOperand(0).getOperand(0);
- SDValue N1 = N->getOperand(1).getOperand(0);
- EVT InVT = N0.getValueType();
-
- EVT S1 = InVT.getScalarType();
- EVT S2 = VT.getScalarType();
- if ((S2 == MVT::i32 && S1 == MVT::i8) ||
- (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
- SDLoc DL(N);
- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
- S2.getHalfSizedIntegerVT(*DAG.getContext()),
- VT.getVectorElementCount());
- SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
- SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
- SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
- return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
- }
- return SDValue();
-}
-
static SDValue performBuildVectorCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -20260,7 +20268,7 @@ static SDValue performAddSubCombine(SDNode *N,
return Val;
if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
return Val;
- if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
+ if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
return Val;
if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
return Val;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index a759efc..cc33765 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -741,11 +741,12 @@ def ProcessorFeatures {
FeatureNEON, FeaturePerfMon, FeatureFullFP16,
FeatureFP16FML, FeatureSHA3];
list<SubtargetFeature> AppleA14 = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8,
- FeatureNEON, FeaturePerfMon, FeatureFRInt3264,
- FeatureSpecRestrict, FeatureSSBS, FeatureSB,
- FeaturePredRes, FeatureCacheDeepPersist,
+ FeatureNEON, FeaturePerfMon,
FeatureFullFP16, FeatureFP16FML, FeatureSHA3,
- FeatureAltFPCmp];
+ // ArmV8.5-a extensions, excluding BTI:
+ FeatureAltFPCmp, FeatureFRInt3264,
+ FeatureSpecRestrict, FeatureSSBS, FeatureSB,
+ FeaturePredRes, FeatureCacheDeepPersist];
list<SubtargetFeature> AppleA15 = [HasV8_6aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSHA3,
FeatureFullFP16, FeatureFP16FML];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index cad4a34..609e975 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -400,6 +400,36 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
false);
}
+SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
+ SmallString<128> Str;
+ raw_svector_ostream OSS(Str);
+ int64_t IVal;
+ if (Value->evaluateAsAbsolute(IVal)) {
+ OSS << static_cast<uint64_t>(IVal);
+ } else {
+ Value->print(OSS, MAI);
+ }
+ return Str;
+}
+
+void AMDGPUAsmPrinter::emitCommonFunctionComments(
+ const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
+ const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI) {
+ OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
+ OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
+ OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
+ if (NumAGPR && TotalNumVGPR) {
+ OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
+ OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
+ false);
+ }
+ OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
+ false);
+ OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
+ false);
+}
+
uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -554,13 +584,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" Kernel info:", false);
emitCommonFunctionComments(
- getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx),
- STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)
- : std::optional<uint32_t>(),
- getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx),
- getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx),
- getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx),
- getFunctionCodeSize(MF), MFI);
+ CurrentProgramInfo.NumArchVGPR,
+ STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
+ CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
+ CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -571,43 +598,38 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" bytes/workgroup (compile time only)", false);
OutStreamer->emitRawComment(
- " SGPRBlocks: " +
- Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)),
- false);
+ " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
+
OutStreamer->emitRawComment(
- " VGPRBlocks: " +
- Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)),
- false);
+ " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
OutStreamer->emitRawComment(
" NumSGPRsForWavesPerEU: " +
- Twine(
- getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)),
+ getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
false);
OutStreamer->emitRawComment(
" NumVGPRsForWavesPerEU: " +
- Twine(
- getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)),
+ getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
false);
- if (STM.hasGFX90AInsts())
+ if (STM.hasGFX90AInsts()) {
+ const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
+ CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
+ AdjustedAccum = MCBinaryExpr::createMul(
+ AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
OutStreamer->emitRawComment(
- " AccumOffset: " +
- Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) *
- 4),
- false);
+ " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
+ }
OutStreamer->emitRawComment(
- " Occupancy: " +
- Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)),
- false);
+ " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
- Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)),
+ getMCExprStr(CurrentProgramInfo.ScratchEnable),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(CurrentProgramInfo.UserSGPR),
@@ -628,20 +650,25 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Twine(CurrentProgramInfo.TIdIGCompCount),
false);
+ [[maybe_unused]] int64_t PGMRSrc3;
assert(STM.hasGFX90AInsts() ||
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
+ (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
+ PGMRSrc3) &&
+ static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
- Twine((AMDHSA_BITS_GET(
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+ getMCExprStr(MCKernelDescriptor::bits_get(
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
- Twine((AMDHSA_BITS_GET(
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+ getMCExprStr(MCKernelDescriptor::bits_get(
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
false);
}
}
@@ -1463,28 +1490,26 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
// remarks to simulate newlines. If and when clang does accept newlines, this
// formatting should be aggregated into one remark with newlines to avoid
// printing multiple diagnostic location and diag opts.
- MCContext &MCCtx = MF.getContext();
EmitResourceUsageRemark("FunctionName", "Function Name",
MF.getFunction().getName());
EmitResourceUsageRemark("NumSGPR", "SGPRs",
- getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx));
- EmitResourceUsageRemark(
- "NumVGPR", "VGPRs",
- getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx));
+ getMCExprStr(CurrentProgramInfo.NumSGPR));
+ EmitResourceUsageRemark("NumVGPR", "VGPRs",
+ getMCExprStr(CurrentProgramInfo.NumArchVGPR));
if (hasMAIInsts) {
- EmitResourceUsageRemark(
- "NumAGPR", "AGPRs",
- getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx));
+ EmitResourceUsageRemark("NumAGPR", "AGPRs",
+ getMCExprStr(CurrentProgramInfo.NumAccVGPR));
}
- EmitResourceUsageRemark(
- "ScratchSize", "ScratchSize [bytes/lane]",
- getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx));
+ EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
+ getMCExprStr(CurrentProgramInfo.ScratchSize));
+ int64_t DynStack;
+ bool DynStackEvaluatable =
+ CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
StringRef DynamicStackStr =
- getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True"
- : "False";
+ DynStackEvaluatable && DynStack ? "True" : "False";
EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
- getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx));
+ getMCExprStr(CurrentProgramInfo.Occupancy));
EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
CurrentProgramInfo.SGPRSpill);
EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 87156f2..162cd40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -65,6 +65,11 @@ private:
uint32_t TotalNumVGPR, uint32_t NumSGPR,
uint64_t ScratchSize, uint64_t CodeSize,
const AMDGPUMachineFunction *MFI);
+ void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR,
+ const MCExpr *TotalNumVGPR,
+ const MCExpr *NumSGPR,
+ const MCExpr *ScratchSize, uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI);
void emitResourceUsageRemarks(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo,
bool isModuleEntryFunction, bool hasMAIInsts);
@@ -79,6 +84,7 @@ private:
void initTargetStreamer(Module &M);
static uint64_t getMCExprValue(const MCExpr *Value, MCContext &Ctx);
+ SmallString<128> getMCExprStr(const MCExpr *Value);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 1d64500..38cc5a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -249,63 +249,54 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
default:
return;
- case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
Op = AtomicRMWInst::Add;
break;
- case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
Op = AtomicRMWInst::Sub;
break;
- case Intrinsic::amdgcn_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_and:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
Op = AtomicRMWInst::And;
break;
- case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
Op = AtomicRMWInst::Or;
break;
- case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
Op = AtomicRMWInst::Xor;
break;
- case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
Op = AtomicRMWInst::Min;
break;
- case Intrinsic::amdgcn_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
Op = AtomicRMWInst::UMin;
break;
- case Intrinsic::amdgcn_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
Op = AtomicRMWInst::Max;
break;
- case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 375643b..18193d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -42,8 +42,10 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
if (StoreSize <= 32)
return EVT::getIntegerVT(Ctx, StoreSize);
- assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
- return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+ if (StoreSize % 32 == 0)
+ return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+
+ return VT;
}
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 160a175..93bca44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1158,12 +1158,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
break;
}
- case Intrinsic::amdgcn_buffer_store_format:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_struct_buffer_store_format:
case Intrinsic::amdgcn_raw_tbuffer_store:
case Intrinsic::amdgcn_struct_tbuffer_store:
- case Intrinsic::amdgcn_tbuffer_store:
case Intrinsic::amdgcn_image_store_1d:
case Intrinsic::amdgcn_image_store_1darray:
case Intrinsic::amdgcn_image_store_2d:
@@ -1376,8 +1374,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const {
switch (II.getIntrinsicID()) {
- case Intrinsic::amdgcn_buffer_load:
- case Intrinsic::amdgcn_buffer_load_format:
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
@@ -1391,7 +1387,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
- case Intrinsic::amdgcn_tbuffer_load:
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 410dc83..e84d39a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -256,17 +256,6 @@ def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
@@ -339,7 +328,6 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 94ee4ac..0751c8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -560,10 +560,16 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
else
++MaxSize;
- MDBuilder MDB(I->getContext());
- MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
- APInt(32, MaxSize));
- I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ APInt Lower{32, MinSize};
+ APInt Upper{32, MaxSize};
+ if (auto *CI = dyn_cast<CallBase>(I)) {
+ ConstantRange Range(Lower, Upper);
+ CI->addRangeRetAttr(Range);
+ } else {
+ MDBuilder MDB(I->getContext());
+ MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
+ I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d8667a..77b2af7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1280,19 +1280,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
- Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
- Info.align.reset();
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
-
- const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
- if (!Vol || !Vol->isZero())
- Info.flags |= MachineMemOperand::MOVolatile;
-
- return true;
- }
case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -8609,12 +8596,6 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
M->getMemOperand());
}
-// Return a value to use for the idxen operand by examining the vindex operand.
-static unsigned getIdxEn(SDValue VIndex) {
- // No need to set idxen if vindex is known to be zero.
- return isNullConstant(VIndex) ? 0 : 1;
-}
-
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -8739,43 +8720,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
- case Intrinsic::amdgcn_buffer_load:
- case Intrinsic::amdgcn_buffer_load_format: {
- unsigned Glc = Op.getConstantOperandVal(5);
- unsigned Slc = Op.getConstantOperandVal(6);
- unsigned IdxEn = getIdxEn(Op.getOperand(3));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
-
- unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
- auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
- M->getMemOperand());
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
- }
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
@@ -8825,35 +8769,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
- case Intrinsic::amdgcn_tbuffer_load: {
- MemSDNode *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
- unsigned Dfmt = Op.getConstantOperandVal(7);
- unsigned Nfmt = Op.getConstantOperandVal(8);
- unsigned Glc = Op.getConstantOperandVal(9);
- unsigned Slc = Op.getConstantOperandVal(10);
- unsigned IdxEn = getIdxEn(Op.getOperand(3));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Op.getOperand(4), // voffset
- SOffset, // soffset
- Op.getOperand(6), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
- };
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
- return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
- DAG);
- }
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -8908,82 +8823,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
DAG);
}
- case Intrinsic::amdgcn_buffer_atomic_swap:
- case Intrinsic::amdgcn_buffer_atomic_add:
- case Intrinsic::amdgcn_buffer_atomic_sub:
- case Intrinsic::amdgcn_buffer_atomic_csub:
- case Intrinsic::amdgcn_buffer_atomic_smin:
- case Intrinsic::amdgcn_buffer_atomic_umin:
- case Intrinsic::amdgcn_buffer_atomic_smax:
- case Intrinsic::amdgcn_buffer_atomic_umax:
- case Intrinsic::amdgcn_buffer_atomic_and:
- case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor:
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- unsigned Slc = Op.getConstantOperandVal(6);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
-
- EVT VT = Op.getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- unsigned Opcode = 0;
-
- switch (IntrID) {
- case Intrinsic::amdgcn_buffer_atomic_swap:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
- break;
- case Intrinsic::amdgcn_buffer_atomic_add:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
- break;
- case Intrinsic::amdgcn_buffer_atomic_sub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
- break;
- case Intrinsic::amdgcn_buffer_atomic_csub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
- break;
- case Intrinsic::amdgcn_buffer_atomic_smin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
- break;
- case Intrinsic::amdgcn_buffer_atomic_umin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
- break;
- case Intrinsic::amdgcn_buffer_atomic_smax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
- break;
- case Intrinsic::amdgcn_buffer_atomic_umax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
- break;
- case Intrinsic::amdgcn_buffer_atomic_and:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
- break;
- case Intrinsic::amdgcn_buffer_atomic_or:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
- break;
- case Intrinsic::amdgcn_buffer_atomic_xor:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
- break;
- case Intrinsic::amdgcn_buffer_atomic_fadd:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
- break;
- default:
- llvm_unreachable("unhandled atomic opcode");
- }
-
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
@@ -9092,29 +8931,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
- case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
- unsigned Slc = Op.getConstantOperandVal(7);
- unsigned IdxEn = getIdxEn(Op.getOperand(5));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // src
- Op.getOperand(3), // cmp
- Op.getOperand(4), // rsrc
- Op.getOperand(5), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
-
- EVT VT = Op.getValueType();
- auto *M = cast<MemSDNode>(Op);
-
- return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
- Op->getVTList(), Ops, VT, M->getMemOperand());
- }
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
@@ -9557,34 +9373,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue();
};
- case Intrinsic::amdgcn_tbuffer_store: {
- SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
- if (IsD16)
- VData = handleD16VData(VData, DAG);
- unsigned Dfmt = Op.getConstantOperandVal(8);
- unsigned Nfmt = Op.getConstantOperandVal(9);
- unsigned Glc = Op.getConstantOperandVal(10);
- unsigned Slc = Op.getConstantOperandVal(11);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Chain,
- VData, // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- Op.getOperand(5), // voffset
- Op.getOperand(6), // soffset
- Op.getOperand(7), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
- AMDGPUISD::TBUFFER_STORE_FORMAT;
- MemSDNode *M = cast<MemSDNode>(Op);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
@@ -9642,42 +9430,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
M->getMemoryVT(), M->getMemOperand());
}
- case Intrinsic::amdgcn_buffer_store:
- case Intrinsic::amdgcn_buffer_store_format: {
- SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
- if (IsD16)
- VData = handleD16VData(VData, DAG);
- unsigned Glc = Op.getConstantOperandVal(6);
- unsigned Slc = Op.getConstantOperandVal(7);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Chain,
- VData,
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
-
- unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
- AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
- Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
- MemSDNode *M = cast<MemSDNode>(Op);
-
- // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
- EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
- return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
-
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
-
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
@@ -10083,8 +9835,8 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
return {N0, SDValue(C1, 0)};
}
-// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
-// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
+// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
SelectionDAG &DAG, SDValue *Offsets,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 4c02bb1..1f198a9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -253,9 +253,9 @@ public:
bool shouldExpandVectorDynExt(SDNode *N) const;
private:
- // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
- // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
- // pointed to by Offsets.
+ // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
+ // the three offsets (voffset, soffset and instoffset) into the SDValue[3]
+ // array pointed to by Offsets.
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
SDValue *Offsets, Align Alignment = Align(4)) const;
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
index c18892a..2bfee45 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
@@ -102,9 +102,14 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
- // If we're passing an f32 value into an i64, anyextend before copying.
- if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32)
- ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0);
+ // If we're passing a smaller fp value into a larger integer register,
+ // anyextend before copying.
+ if ((VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) ||
+ ((VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::i64) &&
+ VA.getValVT() == MVT::f16)) {
+ LLT DstTy = LLT::scalar(VA.getLocVT().getSizeInBits());
+ ValVReg = MIRBuilder.buildAnyExt(DstTy, ValVReg).getReg(0);
+ }
Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -340,7 +345,7 @@ static bool isSupportedArgumentType(Type *T, const RISCVSubtarget &Subtarget,
// supported yet.
if (T->isIntegerTy())
return T->getIntegerBitWidth() <= Subtarget.getXLen() * 2;
- if (T->isFloatTy() || T->isDoubleTy())
+ if (T->isHalfTy() || T->isFloatTy() || T->isDoubleTy())
return true;
if (T->isPointerTy())
return true;
@@ -361,7 +366,7 @@ static bool isSupportedReturnType(Type *T, const RISCVSubtarget &Subtarget,
// supported yet.
if (T->isIntegerTy())
return T->getIntegerBitWidth() <= Subtarget.getXLen() * 2;
- if (T->isFloatTy() || T->isDoubleTy())
+ if (T->isHalfTy() || T->isFloatTy() || T->isDoubleTy())
return true;
if (T->isPointerTy())
return true;
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index da8daa5..a091380 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -849,6 +849,8 @@ const TargetRegisterClass *RISCVInstructionSelector::getRegClassForTypeOnBank(
}
if (RB.getID() == RISCV::FPRBRegBankID) {
+ if (Ty.getSizeInBits() == 16)
+ return &RISCV::FPR16RegClass;
if (Ty.getSizeInBits() == 32)
return &RISCV::FPR32RegClass;
if (Ty.getSizeInBits() == 64)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 010e07f..4cdf08a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -943,6 +943,9 @@ def : InstAlias<"ret", (JALR X0, X1, 0), 4>;
def : InstAlias<"jr $rs, $offset", (JALR X0, GPR:$rs, simm12:$offset), 0>;
def : InstAlias<"jalr $rs, $offset", (JALR X1, GPR:$rs, simm12:$offset), 0>;
def : InstAlias<"jalr $rd, $rs, $offset", (JALR GPR:$rd, GPR:$rs, simm12:$offset), 0>;
+def : InstAlias<"jr (${rs})", (JALR X0, GPR:$rs, 0), 0>;
+def : InstAlias<"jalr (${rs})", (JALR X1, GPR:$rs, 0), 0>;
+def : InstAlias<"jalr $rd, (${rs})", (JALR GPR:$rd, GPR:$rs, 0), 0>;
def : InstAlias<"fence", (FENCE 0xF, 0xF)>; // 0xF == iorw
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 603c198..a206974 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -67,11 +67,6 @@
/// that terminology in code frequently refers to these as "TA" which is
/// confusing. We're in the process of migrating away from this
/// representation.
-/// * _TU w/o policy operand -- Has a passthrough operand, and always
-/// represents the tail undisturbed state.
-/// * _TU w/policy operand - Can represent all three policy states. If
-/// passthrough is IMPLICIT_DEF (or NoReg), then represents "undefined".
-/// Otherwise, policy operand and tablegen flags drive the interpretation.
///
//===----------------------------------------------------------------------===//
@@ -1234,11 +1229,11 @@ class VPseudoBinaryNoMask<VReg RetClass,
let HasSEWOp = 1;
}
-class VPseudoBinaryNoMaskTU<VReg RetClass,
- VReg Op1Class,
- DAGOperand Op2Class,
- string Constraint,
- int TargetConstraintType = 1> :
+class VPseudoBinaryNoMaskPolicy<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ string Constraint,
+ int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
(ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl,
ixlenimm:$sew, ixlenimm:$policy), []>,
@@ -1373,23 +1368,6 @@ class VPseudoIStoreMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL,
let HasSEWOp = 1;
}
-class VPseudoBinaryMask<VReg RetClass,
- RegisterClass Op1Class,
- DAGOperand Op2Class,
- string Constraint> :
- Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
- Op1Class:$rs2, Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
- RISCVVPseudo {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
- let HasVLOp = 1;
- let HasSEWOp = 1;
-}
-
class VPseudoBinaryMaskPolicy<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
@@ -1413,8 +1391,7 @@ class VPseudoBinaryMaskPolicy<VReg RetClass,
class VPseudoTernaryMaskPolicy<VReg RetClass,
RegisterClass Op1Class,
- DAGOperand Op2Class,
- string Constraint> :
+ DAGOperand Op2Class> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
Op1Class:$rs2, Op2Class:$rs1,
@@ -1423,7 +1400,7 @@ class VPseudoTernaryMaskPolicy<VReg RetClass,
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = "$rd = $merge";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1431,8 +1408,7 @@ class VPseudoTernaryMaskPolicy<VReg RetClass,
class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
RegisterClass Op1Class,
- DAGOperand Op2Class,
- string Constraint> :
+ DAGOperand Op2Class> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
Op1Class:$rs2, Op2Class:$rs1,
@@ -1443,7 +1419,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = "$rd = $merge";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1451,7 +1427,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
let UsesVXRM = 0;
}
-// Like VPseudoBinaryMask, but output can be V0.
+// Like VPseudoBinaryMaskPolicy, but output can be V0 and there is no policy.
class VPseudoBinaryMOutMask<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
@@ -1472,8 +1448,8 @@ class VPseudoBinaryMOutMask<VReg RetClass,
let UsesMaskPolicy = 1;
}
-// Special version of VPseudoBinaryMask where we pretend the first source is
-// tied to the destination so we can workaround the earlyclobber constraint.
+// Special version of VPseudoBinaryMaskPolicy where we pretend the first source
+// is tied to the destination so we can workaround the earlyclobber constraint.
// This allows maskedoff and rs2 to be the same register.
class VPseudoTiedBinaryMask<VReg RetClass,
DAGOperand Op2Class,
@@ -1521,13 +1497,13 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass,
let UsesVXRM = 0;
}
-class VPseudoBinaryCarryIn<VReg RetClass,
- VReg Op1Class,
- DAGOperand Op2Class,
- LMULInfo MInfo,
- bit CarryIn,
- string Constraint,
- int TargetConstraintType = 1> :
+class VPseudoBinaryCarry<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo MInfo,
+ bit CarryIn,
+ string Constraint,
+ int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
!if(CarryIn,
(ins Op1Class:$rs2, Op2Class:$rs1,
@@ -1549,20 +1525,15 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
VReg Op1Class,
DAGOperand Op2Class,
LMULInfo MInfo,
- bit CarryIn,
- string Constraint,
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
- !if(CarryIn,
- (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1,
- VMV0:$carry, AVL:$vl, ixlenimm:$sew),
- (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1,
- AVL:$vl, ixlenimm:$sew)), []>,
+ (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1,
+ VMV0:$carry, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = "$rd = $merge";
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1589,7 +1560,7 @@ class VPseudoTernaryNoMask<VReg RetClass,
class VPseudoTernaryNoMaskWithPolicy<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
- string Constraint,
+ string Constraint = "",
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
(ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
@@ -1608,7 +1579,7 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass,
class VPseudoTernaryNoMaskWithPolicyRoundingMode<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
- string Constraint,
+ string Constraint = "",
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
(ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
@@ -2143,8 +2114,8 @@ multiclass VPseudoBinary<VReg RetClass,
bit Commutable = 0> {
let VLMul = MInfo.value, SEW=sew, isCommutable = Commutable in {
defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX);
- def suffix : VPseudoBinaryNoMaskTU<RetClass, Op1Class, Op2Class,
- Constraint, TargetConstraintType>;
+ def suffix : VPseudoBinaryNoMaskPolicy<RetClass, Op1Class, Op2Class,
+ Constraint, TargetConstraintType>;
def suffix # "_MASK" : VPseudoBinaryMaskPolicy<RetClass, Op1Class, Op2Class,
Constraint, TargetConstraintType>,
RISCVMaskedPseudo<MaskIdx=3>;
@@ -2199,11 +2170,11 @@ multiclass VPseudoBinaryEmul<VReg RetClass,
LMULInfo lmul,
LMULInfo emul,
string Constraint = "",
- int sew = 0> {
+ int sew> {
let VLMul = lmul.value, SEW=sew in {
defvar suffix = !if(sew, "_" # lmul.MX # "_E" # sew, "_" # lmul.MX);
- def suffix # "_" # emul.MX : VPseudoBinaryNoMaskTU<RetClass, Op1Class, Op2Class,
- Constraint>;
+ def suffix # "_" # emul.MX : VPseudoBinaryNoMaskPolicy<RetClass, Op1Class, Op2Class,
+ Constraint>;
def suffix # "_" # emul.MX # "_MASK" : VPseudoBinaryMaskPolicy<RetClass, Op1Class, Op2Class,
Constraint>,
RISCVMaskedPseudo<MaskIdx=3>;
@@ -2250,18 +2221,13 @@ multiclass VPseudoBinaryV_VV_RM<LMULInfo m, string Constraint = "", bit Commutab
Commutable=Commutable>;
}
-// Similar to VPseudoBinaryV_VV, but uses MxListF.
-multiclass VPseudoBinaryFV_VV<LMULInfo m, string Constraint = "", int sew = 0> {
- defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint, sew>;
-}
-
-multiclass VPseudoBinaryFV_VV_RM<LMULInfo m, string Constraint = "", int sew = 0> {
+multiclass VPseudoBinaryFV_VV_RM<LMULInfo m, int sew> {
defm _VV : VPseudoBinaryRoundingMode<m.vrclass, m.vrclass, m.vrclass, m,
- Constraint, sew,
- UsesVXRM=0>;
+ "", sew, UsesVXRM=0>;
}
-multiclass VPseudoVGTR_EI16_VV<string Constraint = ""> {
+multiclass VPseudoVGTR_EI16_VV {
+ defvar constraint = "@earlyclobber $rd";
foreach m = MxList in {
defvar mx = m.MX;
foreach sew = EEWList in {
@@ -2275,7 +2241,7 @@ multiclass VPseudoVGTR_EI16_VV<string Constraint = ""> {
foreach e = sews in {
defm _VV
: VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul,
- Constraint, e>,
+ constraint, e>,
SchedBinary<"WriteVRGatherEI16VV", "ReadVRGatherEI16VV_data",
"ReadVRGatherEI16VV_index", mx, e, forceMergeOpRead=true>;
}
@@ -2300,14 +2266,14 @@ multiclass VPseudoVSLD1_VX<string Constraint = ""> {
}
}
-multiclass VPseudoBinaryV_VF<LMULInfo m, FPR_Info f, string Constraint = "", int sew = 0> {
+multiclass VPseudoBinaryV_VF<LMULInfo m, FPR_Info f, int sew> {
defm "_V" # f.FX : VPseudoBinary<m.vrclass, m.vrclass,
- f.fprclass, m, Constraint, sew>;
+ f.fprclass, m, "", sew>;
}
-multiclass VPseudoBinaryV_VF_RM<LMULInfo m, FPR_Info f, string Constraint = "", int sew = 0> {
+multiclass VPseudoBinaryV_VF_RM<LMULInfo m, FPR_Info f, int sew> {
defm "_V" # f.FX : VPseudoBinaryRoundingMode<m.vrclass, m.vrclass,
- f.fprclass, m, Constraint, sew,
+ f.fprclass, m, "", sew,
UsesVXRM=0>;
}
@@ -2322,11 +2288,11 @@ multiclass VPseudoVSLD1_VF<string Constraint = ""> {
}
}
-multiclass VPseudoBinaryV_VI<Operand ImmType = simm5, LMULInfo m, string Constraint = ""> {
+multiclass VPseudoBinaryV_VI<Operand ImmType, LMULInfo m, string Constraint = ""> {
defm _VI : VPseudoBinary<m.vrclass, m.vrclass, ImmType, m, Constraint>;
}
-multiclass VPseudoBinaryV_VI_RM<Operand ImmType = simm5, LMULInfo m, string Constraint = ""> {
+multiclass VPseudoBinaryV_VI_RM<Operand ImmType, LMULInfo m, string Constraint = ""> {
defm _VI : VPseudoBinaryRoundingMode<m.vrclass, m.vrclass, ImmType, m, Constraint>;
}
@@ -2353,7 +2319,7 @@ multiclass VPseudoBinaryW_VV<LMULInfo m, bit Commutable = 0> {
Commutable=Commutable>;
}
-multiclass VPseudoBinaryW_VV_RM<LMULInfo m, int sew = 0> {
+multiclass VPseudoBinaryW_VV_RM<LMULInfo m, int sew> {
defm _VV : VPseudoBinaryRoundingMode<m.wvrclass, m.vrclass, m.vrclass, m,
"@earlyclobber $rd", sew, UsesVXRM=0,
TargetConstraintType=3>;
@@ -2369,13 +2335,7 @@ multiclass VPseudoBinaryW_VI<Operand ImmType, LMULInfo m> {
"@earlyclobber $rd", TargetConstraintType=3>;
}
-multiclass VPseudoBinaryW_VF<LMULInfo m, FPR_Info f> {
- defm "_V" # f.FX : VPseudoBinary<m.wvrclass, m.vrclass,
- f.fprclass, m,
- "@earlyclobber $rd">;
-}
-
-multiclass VPseudoBinaryW_VF_RM<LMULInfo m, FPR_Info f, int sew = 0> {
+multiclass VPseudoBinaryW_VF_RM<LMULInfo m, FPR_Info f, int sew> {
defm "_V" # f.FX : VPseudoBinaryRoundingMode<m.wvrclass, m.vrclass,
f.fprclass, m,
"@earlyclobber $rd", sew,
@@ -2390,7 +2350,7 @@ multiclass VPseudoBinaryW_WV<LMULInfo m> {
"@earlyclobber $rd", TargetConstraintType=3>;
}
-multiclass VPseudoBinaryW_WV_RM<LMULInfo m, int sew = 0> {
+multiclass VPseudoBinaryW_WV_RM<LMULInfo m, int sew> {
defm _WV : VPseudoBinaryRoundingMode<m.wvrclass, m.wvrclass, m.vrclass, m,
"@earlyclobber $rd", sew, UsesVXRM = 0,
TargetConstraintType = 3>;
@@ -2403,12 +2363,7 @@ multiclass VPseudoBinaryW_WX<LMULInfo m> {
defm "_WX" : VPseudoBinary<m.wvrclass, m.wvrclass, GPR, m, /*Constraint*/ "", TargetConstraintType=3>;
}
-multiclass VPseudoBinaryW_WF<LMULInfo m, FPR_Info f, int TargetConstraintType = 1> {
- defm "_W" # f.FX : VPseudoBinary<m.wvrclass, m.wvrclass,
- f.fprclass, m, /*Constraint*/ "", TargetConstraintType=TargetConstraintType>;
-}
-
-multiclass VPseudoBinaryW_WF_RM<LMULInfo m, FPR_Info f, int sew = 0> {
+multiclass VPseudoBinaryW_WF_RM<LMULInfo m, FPR_Info f, int sew> {
defm "_W" # f.FX : VPseudoBinaryRoundingMode<m.wvrclass, m.wvrclass,
f.fprclass, m,
Constraint="",
@@ -2470,35 +2425,32 @@ multiclass VPseudoBinaryV_VM<LMULInfo m, bit CarryOut = 0, bit CarryIn = 1,
int TargetConstraintType = 1> {
let isCommutable = Commutable in
def "_VV" # !if(CarryIn, "M", "") # "_" # m.MX :
- VPseudoBinaryCarryIn<!if(CarryOut, VR,
- !if(!and(CarryIn, !not(CarryOut)),
- GetVRegNoV0<m.vrclass>.R, m.vrclass)),
- m.vrclass, m.vrclass, m, CarryIn, Constraint, TargetConstraintType>;
+ VPseudoBinaryCarry<!if(CarryOut, VR,
+ !if(!and(CarryIn, !not(CarryOut)),
+ GetVRegNoV0<m.vrclass>.R, m.vrclass)),
+ m.vrclass, m.vrclass, m, CarryIn, Constraint, TargetConstraintType>;
}
-multiclass VPseudoTiedBinaryV_VM<LMULInfo m, int TargetConstraintType = 1,
- bit Commutable = 0> {
+multiclass VPseudoTiedBinaryV_VM<LMULInfo m, bit Commutable = 0> {
let isCommutable = Commutable in
def "_VVM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, m.vrclass, m, 1, "",
- TargetConstraintType>;
+ m.vrclass, m.vrclass, m>;
}
multiclass VPseudoBinaryV_XM<LMULInfo m, bit CarryOut = 0, bit CarryIn = 1,
string Constraint = "", int TargetConstraintType = 1> {
def "_VX" # !if(CarryIn, "M", "") # "_" # m.MX :
- VPseudoBinaryCarryIn<!if(CarryOut, VR,
- !if(!and(CarryIn, !not(CarryOut)),
- GetVRegNoV0<m.vrclass>.R, m.vrclass)),
- m.vrclass, GPR, m, CarryIn, Constraint, TargetConstraintType>;
+ VPseudoBinaryCarry<!if(CarryOut, VR,
+ !if(!and(CarryIn, !not(CarryOut)),
+ GetVRegNoV0<m.vrclass>.R, m.vrclass)),
+ m.vrclass, GPR, m, CarryIn, Constraint, TargetConstraintType>;
}
-multiclass VPseudoTiedBinaryV_XM<LMULInfo m, int TargetConstraintType = 1> {
+multiclass VPseudoTiedBinaryV_XM<LMULInfo m> {
def "_VXM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, GPR, m, 1, "",
- TargetConstraintType>;
+ m.vrclass, GPR, m>;
}
multiclass VPseudoVMRG_FM {
@@ -2507,8 +2459,7 @@ multiclass VPseudoVMRG_FM {
defvar mx = m.MX;
def "_V" # f.FX # "M_" # mx
: VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, m.vrclass,
- f.fprclass, m, CarryIn=1,
- Constraint = "">,
+ f.fprclass, m>,
SchedBinary<"WriteVFMergeV", "ReadVFMergeV", "ReadVFMergeF", mx,
forceMasked=1, forceMergeOpRead=true>;
}
@@ -2518,16 +2469,16 @@ multiclass VPseudoVMRG_FM {
multiclass VPseudoBinaryV_IM<LMULInfo m, bit CarryOut = 0, bit CarryIn = 1,
string Constraint = "", int TargetConstraintType = 1> {
def "_VI" # !if(CarryIn, "M", "") # "_" # m.MX :
- VPseudoBinaryCarryIn<!if(CarryOut, VR,
- !if(!and(CarryIn, !not(CarryOut)),
- GetVRegNoV0<m.vrclass>.R, m.vrclass)),
- m.vrclass, simm5, m, CarryIn, Constraint, TargetConstraintType>;
+ VPseudoBinaryCarry<!if(CarryOut, VR,
+ !if(!and(CarryIn, !not(CarryOut)),
+ GetVRegNoV0<m.vrclass>.R, m.vrclass)),
+ m.vrclass, simm5, m, CarryIn, Constraint, TargetConstraintType>;
}
multiclass VPseudoTiedBinaryV_IM<LMULInfo m> {
def "_VIM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, simm5, m, 1, "">;
+ m.vrclass, simm5, m>;
}
multiclass VPseudoUnaryVMV_V_X_I {
@@ -2716,80 +2667,79 @@ multiclass VPseudoBinaryM_VI<LMULInfo m, int TargetConstraintType = 1> {
!if(!ge(m.octuple, 16), "@earlyclobber $rd", ""), TargetConstraintType>;
}
-multiclass VPseudoVGTR_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> {
+multiclass VPseudoVGTR_VV_VX_VI {
+ defvar constraint = "@earlyclobber $rd";
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VX<m, Constraint>,
+ defm "" : VPseudoBinaryV_VX<m, constraint>,
SchedBinary<"WriteVRGatherVX", "ReadVRGatherVX_data",
"ReadVRGatherVX_index", mx, forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VI<ImmType, m, Constraint>,
+ defm "" : VPseudoBinaryV_VI<uimm5, m, constraint>,
SchedUnary<"WriteVRGatherVI", "ReadVRGatherVI_data", mx,
forceMergeOpRead=true>;
defvar sews = SchedSEWSet<mx>.val;
foreach e = sews in {
- defm "" : VPseudoBinaryV_VV<m, Constraint, e>,
+ defm "" : VPseudoBinaryV_VV<m, constraint, e>,
SchedBinary<"WriteVRGatherVV", "ReadVRGatherVV_data",
"ReadVRGatherVV_index", mx, e, forceMergeOpRead=true>;
}
}
}
-multiclass VPseudoVSALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = "",
- bit Commutable = 0> {
+multiclass VPseudoVSALU_VV_VX_VI<bit Commutable = 0> {
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VV<m, Constraint, Commutable=Commutable>,
+ defm "" : VPseudoBinaryV_VV<m, Commutable=Commutable>,
SchedBinary<"WriteVSALUV", "ReadVSALUV", "ReadVSALUX", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VX<m, Constraint>,
+ defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVSALUX", "ReadVSALUV", "ReadVSALUX", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VI<ImmType, m, Constraint>,
+ defm "" : VPseudoBinaryV_VI<simm5, m>,
SchedUnary<"WriteVSALUI", "ReadVSALUV", mx, forceMergeOpRead=true>;
}
}
-multiclass VPseudoVSHT_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> {
+multiclass VPseudoVSHT_VV_VX_VI {
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VV<m, Constraint>,
+ defm "" : VPseudoBinaryV_VV<m>,
SchedBinary<"WriteVShiftV", "ReadVShiftV", "ReadVShiftV", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VX<m, Constraint>,
+ defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVShiftX", "ReadVShiftV", "ReadVShiftX", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VI<ImmType, m, Constraint>,
+ defm "" : VPseudoBinaryV_VI<uimm5, m>,
SchedUnary<"WriteVShiftI", "ReadVShiftV", mx, forceMergeOpRead=true>;
}
}
-multiclass VPseudoVSSHT_VV_VX_VI_RM<Operand ImmType = simm5, string Constraint = ""> {
+multiclass VPseudoVSSHT_VV_VX_VI_RM {
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VV_RM<m, Constraint>,
+ defm "" : VPseudoBinaryV_VV_RM<m>,
SchedBinary<"WriteVSShiftV", "ReadVSShiftV", "ReadVSShiftV", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VX_RM<m, Constraint>,
+ defm "" : VPseudoBinaryV_VX_RM<m>,
SchedBinary<"WriteVSShiftX", "ReadVSShiftV", "ReadVSShiftX", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VI_RM<ImmType, m, Constraint>,
+ defm "" : VPseudoBinaryV_VI_RM<uimm5, m>,
SchedUnary<"WriteVSShiftI", "ReadVSShiftV", mx, forceMergeOpRead=true>;
}
}
-multiclass VPseudoVALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = "",
- bit Commutable = 0> {
+multiclass VPseudoVALU_VV_VX_VI<bit Commutable = 0> {
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VV<m, Constraint, Commutable=Commutable>,
+ defm "" : VPseudoBinaryV_VV<m, Commutable=Commutable>,
SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VX<m, Constraint>,
+ defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VI<ImmType, m, Constraint>,
+ defm "" : VPseudoBinaryV_VI<simm5, m>,
SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forceMergeOpRead=true>;
}
}
@@ -2866,14 +2816,14 @@ multiclass VPseudoVDIV_VV_VX {
multiclass VPseudoVFMUL_VV_VF_RM {
foreach m = MxListF in {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
- defm "" : VPseudoBinaryFV_VV_RM<m, "", sew=e>,
+ defm "" : VPseudoBinaryFV_VV_RM<m, e>,
SchedBinary<"WriteVFMulV", "ReadVFMulV", "ReadVFMulV", m.MX, e,
forceMergeOpRead=true>;
}
foreach f = FPList in {
foreach m = f.MxList in {
- defm "" : VPseudoBinaryV_VF_RM<m, f, "", sew=f.SEW>,
+ defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFMulF", "ReadVFMulV", "ReadVFMulF", m.MX,
f.SEW, forceMergeOpRead=true>;
}
@@ -2885,7 +2835,7 @@ multiclass VPseudoVFDIV_VV_VF_RM {
defvar mx = m.MX;
defvar sews = SchedSEWSet<mx, isF=1>.val;
foreach e = sews in {
- defm "" : VPseudoBinaryFV_VV_RM<m, "", e>,
+ defm "" : VPseudoBinaryFV_VV_RM<m, e>,
SchedBinary<"WriteVFDivV", "ReadVFDivV", "ReadVFDivV", mx, e,
forceMergeOpRead=true>;
}
@@ -2893,7 +2843,7 @@ multiclass VPseudoVFDIV_VV_VF_RM {
foreach f = FPList in {
foreach m = f.MxList in {
- defm "" : VPseudoBinaryV_VF_RM<m, f, "", f.SEW>,
+ defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFDivF", "ReadVFDivV", "ReadVFDivF", m.MX, f.SEW,
forceMergeOpRead=true>;
}
@@ -2903,7 +2853,7 @@ multiclass VPseudoVFDIV_VV_VF_RM {
multiclass VPseudoVFRDIV_VF_RM {
foreach f = FPList in {
foreach m = f.MxList in {
- defm "" : VPseudoBinaryV_VF_RM<m, f, "", f.SEW>,
+ defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFDivF", "ReadVFDivV", "ReadVFDivF", m.MX, f.SEW,
forceMergeOpRead=true>;
}
@@ -2924,7 +2874,7 @@ multiclass VPseudoVALU_VV_VX {
multiclass VPseudoVSGNJ_VV_VF {
foreach m = MxListF in {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
- defm "" : VPseudoBinaryFV_VV<m, sew=e>,
+ defm "" : VPseudoBinaryV_VV<m, sew=e>,
SchedBinary<"WriteVFSgnjV", "ReadVFSgnjV", "ReadVFSgnjV", m.MX,
e, forceMergeOpRead=true>;
}
@@ -2941,7 +2891,7 @@ multiclass VPseudoVSGNJ_VV_VF {
multiclass VPseudoVMAX_VV_VF {
foreach m = MxListF in {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
- defm "" : VPseudoBinaryFV_VV<m, sew=e>,
+ defm "" : VPseudoBinaryV_VV<m, sew=e>,
SchedBinary<"WriteVFMinMaxV", "ReadVFMinMaxV", "ReadVFMinMaxV",
m.MX, e, forceMergeOpRead=true>;
}
@@ -2958,14 +2908,14 @@ multiclass VPseudoVMAX_VV_VF {
multiclass VPseudoVALU_VV_VF_RM {
foreach m = MxListF in {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
- defm "" : VPseudoBinaryFV_VV_RM<m, "", sew=e>,
+ defm "" : VPseudoBinaryFV_VV_RM<m, e>,
SchedBinary<"WriteVFALUV", "ReadVFALUV", "ReadVFALUV", m.MX, e,
forceMergeOpRead=true>;
}
foreach f = FPList in {
foreach m = f.MxList in {
- defm "" : VPseudoBinaryV_VF_RM<m, f, "", sew=f.SEW>,
+ defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX,
f.SEW, forceMergeOpRead=true>;
}
@@ -2975,20 +2925,20 @@ multiclass VPseudoVALU_VV_VF_RM {
multiclass VPseudoVALU_VF_RM {
foreach f = FPList in {
foreach m = f.MxList in {
- defm "" : VPseudoBinaryV_VF_RM<m, f, "", sew=f.SEW>,
+ defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX,
f.SEW, forceMergeOpRead=true>;
}
}
}
-multiclass VPseudoVALU_VX_VI<Operand ImmType = simm5> {
+multiclass VPseudoVALU_VX_VI {
foreach m = MxList in {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VI<ImmType, m>,
+ defm "" : VPseudoBinaryV_VI<simm5, m>,
SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forceMergeOpRead=true>;
}
}
@@ -3084,17 +3034,17 @@ multiclass VPseudoVMRG_VM_XM_IM {
defvar mx = m.MX;
def "_VVM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, m.vrclass, m, 1, "">,
+ m.vrclass, m.vrclass, m>,
SchedBinary<"WriteVIMergeV", "ReadVIMergeV", "ReadVIMergeV", mx,
forceMergeOpRead=true>;
def "_VXM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, GPR, m, 1, "">,
+ m.vrclass, GPR, m>,
SchedBinary<"WriteVIMergeX", "ReadVIMergeV", "ReadVIMergeX", mx,
forceMergeOpRead=true>;
def "_VIM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, simm5, m, 1, "">,
+ m.vrclass, simm5, m>,
SchedUnary<"WriteVIMergeI", "ReadVIMergeV", mx,
forceMergeOpRead=true>;
}
@@ -3127,57 +3077,63 @@ multiclass VPseudoVCALU_VM_XM {
}
}
-multiclass VPseudoVCALUM_VM_XM_IM<string Constraint> {
+multiclass VPseudoVCALUM_VM_XM_IM {
+ defvar constraint = "@earlyclobber $rd";
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=1, Constraint=Constraint,
+ defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=1, Constraint=constraint,
Commutable=1, TargetConstraintType=2>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx, forceMasked=1,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=1, Constraint=Constraint, TargetConstraintType=2>,
+ defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=1, Constraint=constraint, TargetConstraintType=2>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx, forceMasked=1,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_IM<m, CarryOut=1, CarryIn=1, Constraint=Constraint, TargetConstraintType=2>,
+ defm "" : VPseudoBinaryV_IM<m, CarryOut=1, CarryIn=1, Constraint=constraint, TargetConstraintType=2>,
SchedUnary<"WriteVICALUI", "ReadVICALUV", mx, forceMasked=1,
forceMergeOpRead=true>;
}
}
-multiclass VPseudoVCALUM_VM_XM<string Constraint> {
+multiclass VPseudoVCALUM_VM_XM {
+ defvar constraint = "@earlyclobber $rd";
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=1, Constraint=Constraint, TargetConstraintType=2>,
+ defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=1, Constraint=constraint,
+ TargetConstraintType=2>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx, forceMasked=1,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=1, Constraint=Constraint, TargetConstraintType=2>,
+ defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=1, Constraint=constraint,
+ TargetConstraintType=2>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx, forceMasked=1,
forceMergeOpRead=true>;
}
}
-multiclass VPseudoVCALUM_V_X_I<string Constraint> {
+multiclass VPseudoVCALUM_V_X_I {
+ defvar constraint = "@earlyclobber $rd";
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=0, Constraint=Constraint,
+ defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=0, Constraint=constraint,
Commutable=1, TargetConstraintType=2>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=0, Constraint=Constraint, TargetConstraintType=2>,
+ defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=0, Constraint=constraint, TargetConstraintType=2>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_IM<m, CarryOut=1, CarryIn=0, Constraint=Constraint>,
+ defm "" : VPseudoBinaryV_IM<m, CarryOut=1, CarryIn=0, Constraint=constraint>,
SchedUnary<"WriteVICALUI", "ReadVICALUV", mx,
forceMergeOpRead=true>;
}
}
-multiclass VPseudoVCALUM_V_X<string Constraint> {
+multiclass VPseudoVCALUM_V_X {
+ defvar constraint = "@earlyclobber $rd";
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=0, Constraint=Constraint, TargetConstraintType=2>,
+ defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=0, Constraint=constraint, TargetConstraintType=2>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=0, Constraint=Constraint, TargetConstraintType=2>,
+ defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=0, Constraint=constraint, TargetConstraintType=2>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
forceMergeOpRead=true>;
}
@@ -3217,34 +3173,28 @@ multiclass VPseudoTernaryWithTailPolicy<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
LMULInfo MInfo,
- int sew,
- string Constraint = "",
- bit Commutable = 0> {
+ int sew> {
let VLMul = MInfo.value, SEW=sew in {
defvar mx = MInfo.MX;
- let isCommutable = Commutable in
- def "_" # mx # "_E" # sew : VPseudoTernaryNoMaskWithPolicy<RetClass, Op1Class, Op2Class, Constraint>;
- def "_" # mx # "_E" # sew # "_MASK" : VPseudoTernaryMaskPolicy<RetClass, Op1Class, Op2Class, Constraint>,
+ def "_" # mx # "_E" # sew : VPseudoTernaryNoMaskWithPolicy<RetClass, Op1Class, Op2Class>;
+ def "_" # mx # "_E" # sew # "_MASK" : VPseudoTernaryMaskPolicy<RetClass, Op1Class, Op2Class>,
RISCVMaskedPseudo<MaskIdx=3, MaskAffectsRes=true>;
}
}
multiclass VPseudoTernaryWithTailPolicyRoundingMode<VReg RetClass,
- RegisterClass Op1Class,
- DAGOperand Op2Class,
- LMULInfo MInfo,
- int sew,
- string Constraint = "",
- bit Commutable = 0> {
+ RegisterClass Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo MInfo,
+ int sew> {
let VLMul = MInfo.value, SEW=sew in {
defvar mx = MInfo.MX;
- let isCommutable = Commutable in
def "_" # mx # "_E" # sew
: VPseudoTernaryNoMaskWithPolicyRoundingMode<RetClass, Op1Class,
- Op2Class, Constraint>;
+ Op2Class>;
def "_" # mx # "_E" # sew # "_MASK"
: VPseudoTernaryMaskPolicyRoundingMode<RetClass, Op1Class,
- Op2Class, Constraint>,
+ Op2Class>,
RISCVMaskedPseudo<MaskIdx=3, MaskAffectsRes=true>;
}
}
@@ -3288,26 +3238,26 @@ multiclass VPseudoTernaryWithPolicyRoundingMode<VReg RetClass,
}
}
-multiclass VPseudoTernaryV_VV_AAXA<LMULInfo m, string Constraint = ""> {
+multiclass VPseudoTernaryV_VV_AAXA<LMULInfo m> {
defm _VV : VPseudoTernaryWithPolicy<m.vrclass, m.vrclass, m.vrclass, m,
- Constraint, Commutable=1>;
+ Commutable=1>;
}
-multiclass VPseudoTernaryV_VV_AAXA_RM<LMULInfo m, string Constraint = "", int sew = 0> {
+multiclass VPseudoTernaryV_VV_AAXA_RM<LMULInfo m, int sew> {
defm _VV : VPseudoTernaryWithPolicyRoundingMode<m.vrclass, m.vrclass, m.vrclass, m,
- Constraint, sew, Commutable=1>;
+ sew=sew, Commutable=1>;
}
-multiclass VPseudoTernaryV_VX_AAXA<LMULInfo m, string Constraint = ""> {
+multiclass VPseudoTernaryV_VX_AAXA<LMULInfo m> {
defm "_VX" : VPseudoTernaryWithPolicy<m.vrclass, GPR, m.vrclass, m,
- Constraint, Commutable=1>;
+ Commutable=1>;
}
multiclass VPseudoTernaryV_VF_AAXA_RM<LMULInfo m, FPR_Info f,
- string Constraint = "", int sew = 0> {
+ int sew> {
defm "_V" # f.FX : VPseudoTernaryWithPolicyRoundingMode<m.vrclass, f.fprclass,
- m.vrclass, m, Constraint,
- sew, Commutable=1>;
+ m.vrclass, m,
+ sew=sew, Commutable=1>;
}
multiclass VPseudoTernaryW_VV<LMULInfo m, bit Commutable = 0> {
@@ -3316,24 +3266,24 @@ multiclass VPseudoTernaryW_VV<LMULInfo m, bit Commutable = 0> {
constraint, Commutable=Commutable, TargetConstraintType=3>;
}
-multiclass VPseudoTernaryW_VV_RM<LMULInfo m, int sew = 0> {
+multiclass VPseudoTernaryW_VV_RM<LMULInfo m, int sew> {
defvar constraint = "@earlyclobber $rd";
defm _VV : VPseudoTernaryWithPolicyRoundingMode<m.wvrclass, m.vrclass, m.vrclass, m,
- constraint, sew, /* Commutable */ 0,
+ constraint, sew,
TargetConstraintType=3>;
}
multiclass VPseudoTernaryW_VX<LMULInfo m> {
defvar constraint = "@earlyclobber $rd";
defm "_VX" : VPseudoTernaryWithPolicy<m.wvrclass, GPR, m.vrclass, m,
- constraint, /*Commutable*/ 0, TargetConstraintType=3>;
+ constraint, TargetConstraintType=3>;
}
-multiclass VPseudoTernaryW_VF_RM<LMULInfo m, FPR_Info f, int sew = 0> {
+multiclass VPseudoTernaryW_VF_RM<LMULInfo m, FPR_Info f, int sew> {
defvar constraint = "@earlyclobber $rd";
defm "_V" # f.FX : VPseudoTernaryWithPolicyRoundingMode<m.wvrclass, f.fprclass,
m.vrclass, m, constraint,
- sew, /* Commutable */ 0,
+ sew=sew,
TargetConstraintType=3>;
}
@@ -3353,48 +3303,47 @@ multiclass VPseudoVSLDV_VX<LMULInfo m, string Constraint = ""> {
defm _VX : VPseudoVSLDVWithPolicy<m.vrclass, m.vrclass, GPR, m, Constraint>;
}
-multiclass VPseudoVSLDV_VI<Operand ImmType = simm5, LMULInfo m, string Constraint = ""> {
- defm _VI : VPseudoVSLDVWithPolicy<m.vrclass, m.vrclass, ImmType, m, Constraint>;
+multiclass VPseudoVSLDV_VI<LMULInfo m, string Constraint = ""> {
+ defm _VI : VPseudoVSLDVWithPolicy<m.vrclass, m.vrclass, uimm5, m, Constraint>;
}
-multiclass VPseudoVMAC_VV_VX_AAXA<string Constraint = ""> {
+multiclass VPseudoVMAC_VV_VX_AAXA {
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoTernaryV_VV_AAXA<m, Constraint>,
+ defm "" : VPseudoTernaryV_VV_AAXA<m>,
SchedTernary<"WriteVIMulAddV", "ReadVIMulAddV", "ReadVIMulAddV",
"ReadVIMulAddV", mx>;
- defm "" : VPseudoTernaryV_VX_AAXA<m, Constraint>,
+ defm "" : VPseudoTernaryV_VX_AAXA<m>,
SchedTernary<"WriteVIMulAddX", "ReadVIMulAddV", "ReadVIMulAddX",
"ReadVIMulAddV", mx>;
}
}
-multiclass VPseudoVMAC_VV_VF_AAXA_RM<string Constraint = ""> {
+multiclass VPseudoVMAC_VV_VF_AAXA_RM {
foreach m = MxListF in {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
- defm "" : VPseudoTernaryV_VV_AAXA_RM<m, Constraint, sew=e>,
+ defm "" : VPseudoTernaryV_VV_AAXA_RM<m, sew=e>,
SchedTernary<"WriteVFMulAddV", "ReadVFMulAddV", "ReadVFMulAddV",
"ReadVFMulAddV", m.MX, e>;
}
foreach f = FPList in {
foreach m = f.MxList in {
- defm "" : VPseudoTernaryV_VF_AAXA_RM<m, f, Constraint, sew=f.SEW>,
+ defm "" : VPseudoTernaryV_VF_AAXA_RM<m, f, sew=f.SEW>,
SchedTernary<"WriteVFMulAddF", "ReadVFMulAddV", "ReadVFMulAddF",
"ReadVFMulAddV", m.MX, f.SEW>;
}
}
}
-multiclass VPseudoVSLD_VX_VI<Operand ImmType = simm5, bit slidesUp = false,
- string Constraint = ""> {
+multiclass VPseudoVSLD_VX_VI<bit slidesUp = false, string Constraint = ""> {
defvar WriteSlideX = !if(slidesUp, "WriteVSlideUpX", "WriteVSlideDownX");
foreach m = MxList in {
defvar mx = m.MX;
defm "" : VPseudoVSLDV_VX<m, Constraint>,
SchedTernary<WriteSlideX, "ReadVISlideV", "ReadVISlideV",
"ReadVISlideX", mx>;
- defm "" : VPseudoVSLDV_VI<ImmType, m, Constraint>,
+ defm "" : VPseudoVSLDV_VI<m, Constraint>,
SchedBinary<"WriteVSlideI", "ReadVISlideV", "ReadVISlideV", mx>;
}
}
@@ -4181,15 +4130,15 @@ class VPatBinaryNoMaskTURoundingMode<string intrinsic_name,
GPR:$vl, sew, TU_MU)>;
-// Same as above but source operands are swapped.
-class VPatBinaryNoMaskSwapped<string intrinsic_name,
- string inst,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- int sew,
- VReg op1_reg_class,
- DAGOperand op2_kind> :
+// Same as VPatBinaryM but source operands are swapped.
+class VPatBinaryMSwapped<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ int sew,
+ VReg op1_reg_class,
+ DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
(op2_type op2_kind:$rs2),
(op1_type op1_reg_class:$rs1),
@@ -4243,16 +4192,16 @@ class VPatBinaryMaskPolicy<string intrinsic_name,
(op2_type op2_kind:$rs2),
(mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
-class VPatBinaryMaskTARoundingMode<string intrinsic_name,
- string inst,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- ValueType mask_type,
- int sew,
- VReg result_reg_class,
- VReg op1_reg_class,
- DAGOperand op2_kind> :
+class VPatBinaryMaskPolicyRoundingMode<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ VReg op1_reg_class,
+ DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
(result_type result_reg_class:$merge),
(op1_type op1_reg_class:$rs1),
@@ -4268,7 +4217,7 @@ class VPatBinaryMaskTARoundingMode<string intrinsic_name,
(XLenVT timm:$round),
GPR:$vl, sew, (XLenVT timm:$policy))>;
-// Same as above but source operands are swapped.
+// Same as VPatBinaryMask but source operands are swapped.
class VPatBinaryMaskSwapped<string intrinsic_name,
string inst,
ValueType result_type,
@@ -4404,28 +4353,6 @@ class VPatTiedBinaryMaskRoundingMode<string intrinsic_name,
(XLenVT timm:$round),
GPR:$vl, sew, (XLenVT timm:$policy))>;
-class VPatTernaryNoMask<string intrinsic,
- string inst,
- string kind,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- int sew,
- LMULInfo vlmul,
- VReg result_reg_class,
- RegisterClass op1_reg_class,
- DAGOperand op2_kind> :
- Pat<(result_type (!cast<Intrinsic>(intrinsic)
- (result_type result_reg_class:$rs3),
- (op1_type op1_reg_class:$rs1),
- (op2_type op2_kind:$rs2),
- VLOpFrag)),
- (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
- result_reg_class:$rs3,
- (op1_type op1_reg_class:$rs1),
- op2_kind:$rs2,
- GPR:$vl, sew)>;
-
class VPatTernaryNoMaskTU<string intrinsic,
string inst,
string kind,
@@ -4521,31 +4448,6 @@ class VPatTernaryNoMaskWithPolicyRoundingMode<string intrinsic,
(XLenVT timm:$round),
GPR:$vl, log2sew, (XLenVT timm:$policy))>;
-class VPatTernaryMask<string intrinsic,
- string inst,
- string kind,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- ValueType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg result_reg_class,
- RegisterClass op1_reg_class,
- DAGOperand op2_kind> :
- Pat<(result_type (!cast<Intrinsic>(intrinsic#"_mask")
- (result_type result_reg_class:$rs3),
- (op1_type op1_reg_class:$rs1),
- (op2_type op2_kind:$rs2),
- (mask_type V0),
- VLOpFrag)),
- (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX # "_MASK")
- result_reg_class:$rs3,
- (op1_type op1_reg_class:$rs1),
- op2_kind:$rs2,
- (mask_type V0),
- GPR:$vl, sew)>;
-
class VPatTernaryMaskPolicy<string intrinsic,
string inst,
string kind,
@@ -4813,23 +4715,23 @@ multiclass VPatBinaryRoundingMode<string intrinsic,
DAGOperand op2_kind> {
def : VPatBinaryNoMaskTURoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
sew, result_reg_class, op1_reg_class, op2_kind>;
- def : VPatBinaryMaskTARoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
- mask_type, sew, result_reg_class, op1_reg_class,
- op2_kind>;
+ def : VPatBinaryMaskPolicyRoundingMode<intrinsic, inst, result_type, op1_type, op2_type,
+ mask_type, sew, result_reg_class, op1_reg_class,
+ op2_kind>;
}
-multiclass VPatBinarySwapped<string intrinsic,
- string inst,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- ValueType mask_type,
- int sew,
- VReg result_reg_class,
- VReg op1_reg_class,
- DAGOperand op2_kind> {
- def : VPatBinaryNoMaskSwapped<intrinsic, inst, result_type, op1_type, op2_type,
- sew, op1_reg_class, op2_kind>;
+multiclass VPatBinaryMSwapped<string intrinsic,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ VReg op1_reg_class,
+ DAGOperand op2_kind> {
+ def : VPatBinaryNoMaskSwapped<intrinsic, inst, result_type, op1_type, op2_type,
+ sew, op1_reg_class, op2_kind>;
def : VPatBinaryMaskSwapped<intrinsic, inst, result_type, op1_type, op2_type,
mask_type, sew, result_reg_class, op1_reg_class,
op2_kind>;
@@ -5187,10 +5089,10 @@ multiclass VPatBinaryW_WV_RM<string intrinsic, string instruction,
Wti.Vector, Vti.Vector, Vti.Mask,
Vti.Log2SEW, Wti.RegClass, Vti.RegClass>;
}
- def : VPatBinaryMaskTARoundingMode<intrinsic, name,
- Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
- Vti.Log2SEW, Wti.RegClass,
- Wti.RegClass, Vti.RegClass>;
+ def : VPatBinaryMaskPolicyRoundingMode<intrinsic, name,
+ Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass,
+ Wti.RegClass, Vti.RegClass>;
}
}
}
@@ -5426,10 +5328,10 @@ multiclass VPatBinarySwappedM_VV<string intrinsic, string instruction,
list<VTypeInfo> vtilist> {
foreach vti = vtilist in
let Predicates = GetVTypePredicates<vti>.Predicates in
- defm : VPatBinarySwapped<intrinsic, instruction # "_VV_" # vti.LMul.MX,
- vti.Mask, vti.Vector, vti.Vector, vti.Mask,
- vti.Log2SEW, VR,
- vti.RegClass, vti.RegClass>;
+ defm : VPatBinaryMSwapped<intrinsic, instruction # "_VV_" # vti.LMul.MX,
+ vti.Mask, vti.Vector, vti.Vector, vti.Mask,
+ vti.Log2SEW, VR,
+ vti.RegClass, vti.RegClass>;
}
multiclass VPatBinaryM_VX<string intrinsic, string instruction,
@@ -5461,7 +5363,7 @@ multiclass VPatBinaryV_VV_VX_VI<string intrinsic, string instruction,
VPatBinaryV_VI<intrinsic, instruction, vtilist, ImmType>;
multiclass VPatBinaryV_VV_VX_VI_RM<string intrinsic, string instruction,
- list<VTypeInfo> vtilist, Operand ImmType = simm5>
+ list<VTypeInfo> vtilist, Operand ImmType>
: VPatBinaryV_VV_RM<intrinsic, instruction, vtilist>,
VPatBinaryV_VX_RM<intrinsic, instruction, vtilist>,
VPatBinaryV_VI_RM<intrinsic, instruction, vtilist, ImmType>;
@@ -5542,46 +5444,6 @@ multiclass VPatBinaryM_V_X<string intrinsic, string instruction>
: VPatBinaryV_V<intrinsic, instruction>,
VPatBinaryV_X<intrinsic, instruction>;
-multiclass VPatTernary<string intrinsic,
- string inst,
- string kind,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- ValueType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg result_reg_class,
- RegisterClass op1_reg_class,
- DAGOperand op2_kind> {
- def : VPatTernaryNoMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
- sew, vlmul, result_reg_class, op1_reg_class,
- op2_kind>;
- def : VPatTernaryMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
- mask_type, sew, vlmul, result_reg_class, op1_reg_class,
- op2_kind>;
-}
-
-multiclass VPatTernaryNoMaskNoPolicy<string intrinsic,
- string inst,
- string kind,
- ValueType result_type,
- ValueType op1_type,
- ValueType op2_type,
- ValueType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg result_reg_class,
- RegisterClass op1_reg_class,
- DAGOperand op2_kind> {
- def : VPatTernaryNoMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
- sew, vlmul, result_reg_class, op1_reg_class,
- op2_kind>;
- def : VPatTernaryMaskPolicy<intrinsic, inst, kind, result_type, op1_type, op2_type,
- mask_type, sew, vlmul, result_reg_class, op1_reg_class,
- op2_kind>;
-}
-
multiclass VPatTernaryWithPolicy<string intrinsic,
string inst,
string kind,
@@ -5797,7 +5659,7 @@ multiclass VPatTernaryV_VV_VX_AAXA_RM<string intrinsic, string instruction,
VPatTernaryV_VX_AAXA_RM<intrinsic, instruction, vtilist, isSEWAware>;
multiclass VPatTernaryV_VX_VI<string intrinsic, string instruction,
- list<VTypeInfo> vtilist, Operand Imm_type = simm5>
+ list<VTypeInfo> vtilist, Operand Imm_type>
: VPatTernaryV_VX<intrinsic, instruction, vtilist>,
VPatTernaryV_VI<intrinsic, instruction, vtilist, Imm_type>;
@@ -5829,7 +5691,7 @@ multiclass VPatBinaryM_VX_VI<string intrinsic, string instruction,
VPatBinaryM_VI<intrinsic, instruction, vtilist>;
multiclass VPatBinaryV_VV_VX_VI_INT<string intrinsic, string instruction,
- list<VTypeInfo> vtilist, Operand ImmType = simm5>
+ list<VTypeInfo> vtilist, Operand ImmType>
: VPatBinaryV_VV_INT<intrinsic#"_vv", instruction, vtilist>,
VPatBinaryV_VX_INT<intrinsic#"_vx", instruction, vtilist>,
VPatBinaryV_VI<intrinsic#"_vx", instruction, vtilist, ImmType>;
@@ -6332,12 +6194,12 @@ defm PseudoVSEXT_VF8 : PseudoVEXT_VF8;
// 11.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
//===----------------------------------------------------------------------===//
defm PseudoVADC : VPseudoVCALU_VM_XM_IM;
-defm PseudoVMADC : VPseudoVCALUM_VM_XM_IM<"@earlyclobber $rd">;
-defm PseudoVMADC : VPseudoVCALUM_V_X_I<"@earlyclobber $rd">;
+defm PseudoVMADC : VPseudoVCALUM_VM_XM_IM;
+defm PseudoVMADC : VPseudoVCALUM_V_X_I;
defm PseudoVSBC : VPseudoVCALU_VM_XM;
-defm PseudoVMSBC : VPseudoVCALUM_VM_XM<"@earlyclobber $rd">;
-defm PseudoVMSBC : VPseudoVCALUM_V_X<"@earlyclobber $rd">;
+defm PseudoVMSBC : VPseudoVCALUM_VM_XM;
+defm PseudoVMSBC : VPseudoVCALUM_V_X;
//===----------------------------------------------------------------------===//
// 11.5. Vector Bitwise Logical Instructions
@@ -6349,9 +6211,9 @@ defm PseudoVXOR : VPseudoVALU_VV_VX_VI<Commutable=1>;
//===----------------------------------------------------------------------===//
// 11.6. Vector Single-Width Bit Shift Instructions
//===----------------------------------------------------------------------===//
-defm PseudoVSLL : VPseudoVSHT_VV_VX_VI<uimm5>;
-defm PseudoVSRL : VPseudoVSHT_VV_VX_VI<uimm5>;
-defm PseudoVSRA : VPseudoVSHT_VV_VX_VI<uimm5>;
+defm PseudoVSLL : VPseudoVSHT_VV_VX_VI;
+defm PseudoVSRL : VPseudoVSHT_VV_VX_VI;
+defm PseudoVSRA : VPseudoVSHT_VV_VX_VI;
//===----------------------------------------------------------------------===//
// 11.7. Vector Narrowing Integer Right Shift Instructions
@@ -6460,8 +6322,8 @@ let Defs = [VXSAT] in {
//===----------------------------------------------------------------------===//
// 12.4. Vector Single-Width Scaling Shift Instructions
//===----------------------------------------------------------------------===//
-defm PseudoVSSRL : VPseudoVSSHT_VV_VX_VI_RM<uimm5>;
-defm PseudoVSSRA : VPseudoVSSHT_VV_VX_VI_RM<uimm5>;
+defm PseudoVSSRL : VPseudoVSSHT_VV_VX_VI_RM;
+defm PseudoVSSRA : VPseudoVSSHT_VV_VX_VI_RM;
//===----------------------------------------------------------------------===//
// 12.5. Vector Narrowing Fixed-Point Clip Instructions
@@ -6846,8 +6708,8 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
// 16.3. Vector Slide Instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasVInstructions] in {
- defm PseudoVSLIDEUP : VPseudoVSLD_VX_VI<uimm5, /*slidesUp=*/true, "@earlyclobber $rd">;
- defm PseudoVSLIDEDOWN : VPseudoVSLD_VX_VI<uimm5, /*slidesUp=*/false>;
+ defm PseudoVSLIDEUP : VPseudoVSLD_VX_VI</*slidesUp=*/true, "@earlyclobber $rd">;
+ defm PseudoVSLIDEDOWN : VPseudoVSLD_VX_VI</*slidesUp=*/false>;
defm PseudoVSLIDE1UP : VPseudoVSLD1_VX<"@earlyclobber $rd">;
defm PseudoVSLIDE1DOWN : VPseudoVSLD1_VX;
} // Predicates = [HasVInstructions]
@@ -6861,8 +6723,8 @@ let Predicates = [HasVInstructionsAnyF] in {
// 16.4. Vector Register Gather Instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasVInstructions] in {
-defm PseudoVRGATHER : VPseudoVGTR_VV_VX_VI<uimm5, "@earlyclobber $rd">;
-defm PseudoVRGATHEREI16 : VPseudoVGTR_EI16_VV<Constraint = "@earlyclobber $rd">;
+defm PseudoVRGATHER : VPseudoVGTR_VV_VX_VI;
+defm PseudoVRGATHEREI16 : VPseudoVGTR_EI16_VV;
//===----------------------------------------------------------------------===//
// 16.5. Vector Compress Instruction
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index d091077..75fcc1e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -247,23 +247,21 @@ class VPseudoTernaryNoMask_Zvk<VReg RetClass,
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
-multiclass VPseudoBinaryNoMaskTU_Zvk<VReg RetClass,
- VReg Op1Class,
- DAGOperand Op2Class,
- LMULInfo MInfo,
- string Constraint = "",
- int sew = 0> {
- let VLMul = MInfo.value, SEW=sew in {
- defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX);
- def suffix : VPseudoBinaryNoMaskTU<RetClass, Op1Class, Op2Class,
- Constraint>;
+multiclass VPseudoBinaryNoMaskPolicy_Zvk<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo MInfo,
+ string Constraint = ""> {
+ let VLMul = MInfo.value in {
+ def "_" # MInfo.MX : VPseudoBinaryNoMaskPolicy<RetClass, Op1Class, Op2Class,
+ Constraint>;
}
}
multiclass VPseudoTernaryNoMask_Zvk<VReg RetClass,
- VReg Op1Class,
- DAGOperand Op2Class,
- LMULInfo MInfo> {
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo MInfo> {
let VLMul = MInfo.value in
def "_" # MInfo.MX : VPseudoTernaryNoMask_Zvk<RetClass, Op1Class, Op2Class>;
}
@@ -349,7 +347,7 @@ multiclass VPseudoVSHA2MS {
multiclass VPseudoVAESKF1 {
foreach m = MxListVF4 in {
defvar mx = m.MX;
- defm _VI : VPseudoBinaryNoMaskTU_Zvk<m.vrclass, m.vrclass, uimm5, m>,
+ defm _VI : VPseudoBinaryNoMaskPolicy_Zvk<m.vrclass, m.vrclass, uimm5, m>,
SchedBinary<"WriteVAESKF1V", "ReadVAESKF1V", "ReadVAESKF1V", mx,
forceMergeOpRead=true>;
}
@@ -384,7 +382,7 @@ multiclass VPseudoVSM3C {
multiclass VPseudoVSM4K {
foreach m = MxListVF4 in {
defvar mx = m.MX;
- defm _VI : VPseudoBinaryNoMaskTU_Zvk<m.vrclass, m.vrclass, uimm5, m>,
+ defm _VI : VPseudoBinaryNoMaskPolicy_Zvk<m.vrclass, m.vrclass, uimm5, m>,
SchedBinary<"WriteVSM4KV", "ReadVSM4KV", "ReadVSM4KV", mx,
forceMergeOpRead=true>;
}
@@ -393,7 +391,7 @@ multiclass VPseudoVSM4K {
multiclass VPseudoVSM3ME {
foreach m = MxListVF4 in {
defvar mx = m.MX;
- defm _VV : VPseudoBinaryNoMaskTU_Zvk<m.vrclass, m.vrclass, m.vrclass, m>,
+ defm _VV : VPseudoBinaryNoMaskPolicy_Zvk<m.vrclass, m.vrclass, m.vrclass, m>,
SchedBinary<"WriteVSM3MEV", "ReadVSM3MEV", "ReadVSM3MEV", mx,
forceMergeOpRead=true>;
}
@@ -452,16 +450,16 @@ multiclass VPseudoVCPOP {
}
}
-multiclass VPseudoVWALU_VV_VX_VI<Operand ImmType> {
+multiclass VPseudoVWSLL {
foreach m = MxListW in {
defvar mx = m.MX;
defm "" : VPseudoBinaryW_VV<m>,
SchedBinary<"WriteVWSLLV", "ReadVWSLLV", "ReadVWSLLV", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryW_VX<m>,
+ defm "" : VPseudoBinaryW_VX<m>,
SchedBinary<"WriteVWSLLX", "ReadVWSLLV", "ReadVWSLLX", mx,
forceMergeOpRead=true>;
- defm "" : VPseudoBinaryW_VI<ImmType, m>,
+ defm "" : VPseudoBinaryW_VI<uimm5, m>,
SchedUnary<"WriteVWSLLI", "ReadVWSLLV", mx,
forceMergeOpRead=true>;
}
@@ -494,7 +492,7 @@ multiclass VPseudoVREV8 {
}
}
-multiclass VPseudoVROL {
+multiclass VPseudoVROT_VV_VX {
foreach m = MxList in {
defm "" : VPseudoBinaryV_VV<m>,
SchedBinary<"WriteVRotV", "ReadVRotV", "ReadVRotV", m.MX,
@@ -505,18 +503,12 @@ multiclass VPseudoVROL {
}
}
-multiclass VPseudoVROR<Operand ImmType> {
- defvar Constraint = "";
+multiclass VPseudoVROT_VV_VX_VI
+ : VPseudoVROT_VV_VX {
foreach m = MxList in {
- defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VV<m>,
- SchedBinary<"WriteVRotV", "ReadVRotV", "ReadVRotV", mx,
- forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VX<m>,
- SchedBinary<"WriteVRotX", "ReadVRotV", "ReadVRotX", mx,
- forceMergeOpRead=true>;
- defm "" : VPseudoBinaryV_VI<ImmType, m>,
- SchedUnary<"WriteVRotI", "ReadVRotV", mx, forceMergeOpRead=true>;
+ defm "" : VPseudoBinaryV_VI<uimm6, m>,
+ SchedUnary<"WriteVRotI", "ReadVRotV", m.MX,
+ forceMergeOpRead=true>;
}
}
@@ -525,7 +517,7 @@ let Predicates = [HasStdExtZvbb] in {
defm PseudoVCLZ : VPseudoVCLZ;
defm PseudoVCTZ : VPseudoVCTZ;
defm PseudoVCPOP : VPseudoVCPOP;
- defm PseudoVWSLL : VPseudoVWALU_VV_VX_VI<uimm5>;
+ defm PseudoVWSLL : VPseudoVWSLL;
} // Predicates = [HasStdExtZvbb]
let Predicates = [HasStdExtZvbc] in {
@@ -537,8 +529,8 @@ let Predicates = [HasStdExtZvkb] in {
defm PseudoVANDN : VPseudoVANDN;
defm PseudoVBREV8 : VPseudoVBREV8;
defm PseudoVREV8 : VPseudoVREV8;
- defm PseudoVROL : VPseudoVROL;
- defm PseudoVROR : VPseudoVROR<uimm6>;
+ defm PseudoVROL : VPseudoVROT_VV_VX;
+ defm PseudoVROR : VPseudoVROT_VV_VX_VI;
} // Predicates = [HasStdExtZvkb]
let Predicates = [HasStdExtZvkg] in {
@@ -1031,45 +1023,32 @@ multiclass VPatBinaryV_VV_VX_VROL<string intrinsic, string instruction,
VPatBinaryV_VI_VROL<intrinsic, instruction2, vtilist>;
multiclass VPatBinaryV_VV_VX_VI_VROR<string intrinsic, string instruction,
- list<VTypeInfo> vtilist,
- Operand ImmType = uimm6>
+ list<VTypeInfo> vtilist>
: VPatBinaryV_VV<intrinsic, instruction, vtilist>,
VPatBinaryV_VX_VROTATE<intrinsic, instruction, vtilist>,
- VPatBinaryV_VI<intrinsic, instruction, vtilist, ImmType>;
-
-multiclass VPatBinaryW_VI_VWSLL<string intrinsic, string instruction,
- list<VTypeInfoToWide> vtilist> {
- foreach VtiToWti = vtilist in {
- defvar Vti = VtiToWti.Vti;
- defvar Wti = VtiToWti.Wti;
- defm : VPatBinary<intrinsic, instruction # "_VI_" # Vti.LMul.MX,
- Wti.Vector, Vti.Vector, XLenVT, Vti.Mask,
- Vti.Log2SEW, Wti.RegClass,
- Vti.RegClass, uimm5>;
- }
-}
+ VPatBinaryV_VI<intrinsic, instruction, vtilist, uimm6>;
-multiclass VPatBinaryW_VX_VWSLL<string intrinsic, string instruction,
- list<VTypeInfoToWide> vtilist> {
+multiclass VPatBinaryW_VV_VX_VI_VWSLL<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist>
+ : VPatBinaryW_VV<intrinsic, instruction, vtilist> {
foreach VtiToWti = vtilist in {
defvar Vti = VtiToWti.Vti;
defvar Wti = VtiToWti.Wti;
defvar kind = "V"#Vti.ScalarSuffix;
let Predicates = !listconcat(GetVTypePredicates<Vti>.Predicates,
- GetVTypePredicates<Wti>.Predicates) in
- defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
- Wti.Vector, Vti.Vector, XLenVT, Vti.Mask,
- Vti.Log2SEW, Wti.RegClass,
- Vti.RegClass, Vti.ScalarRegClass>;
+ GetVTypePredicates<Wti>.Predicates) in {
+ defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
+ Wti.Vector, Vti.Vector, XLenVT, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass,
+ Vti.RegClass, Vti.ScalarRegClass>;
+ defm : VPatBinary<intrinsic, instruction # "_VI_" # Vti.LMul.MX,
+ Wti.Vector, Vti.Vector, XLenVT, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass,
+ Vti.RegClass, uimm5>;
+ }
}
}
-multiclass VPatBinaryW_VV_VX_VI_VWSLL<string intrinsic, string instruction,
- list<VTypeInfoToWide> vtilist>
- : VPatBinaryW_VV<intrinsic, instruction, vtilist>,
- VPatBinaryW_VX_VWSLL<intrinsic, instruction, vtilist>,
- VPatBinaryW_VI_VWSLL<intrinsic, instruction, vtilist>;
-
let Predicates = [HasStdExtZvbb] in {
defm : VPatUnaryV_V<"int_riscv_vbrev", "PseudoVBREV", AllIntegerVectors>;
defm : VPatUnaryV_V<"int_riscv_vclz", "PseudoVCLZ", AllIntegerVectors>;
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 185b2fe..f0a3a4e 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -91,6 +91,8 @@ class SparcAsmParser : public MCTargetAsmParser {
ParseStatus parseASITag(OperandVector &Operands);
+ ParseStatus parsePrefetchTag(OperandVector &Operands);
+
template <TailRelocKind Kind>
ParseStatus parseTailRelocSym(OperandVector &Operands);
@@ -209,7 +211,8 @@ private:
k_Immediate,
k_MemoryReg,
k_MemoryImm,
- k_ASITag
+ k_ASITag,
+ k_PrefetchTag,
} Kind;
SMLoc StartLoc, EndLoc;
@@ -240,6 +243,7 @@ private:
struct ImmOp Imm;
struct MemOp Mem;
unsigned ASI;
+ unsigned Prefetch;
};
public:
@@ -253,6 +257,7 @@ public:
bool isMEMri() const { return Kind == k_MemoryImm; }
bool isMembarTag() const { return Kind == k_Immediate; }
bool isASITag() const { return Kind == k_ASITag; }
+ bool isPrefetchTag() const { return Kind == k_PrefetchTag; }
bool isTailRelocSym() const { return Kind == k_Immediate; }
bool isCallTarget() const {
@@ -337,6 +342,11 @@ public:
return ASI;
}
+ unsigned getPrefetchTag() const {
+ assert((Kind == k_PrefetchTag) && "Invalid access!");
+ return Prefetch;
+ }
+
/// getStartLoc - Get the location of the first token of this operand.
SMLoc getStartLoc() const override {
return StartLoc;
@@ -360,6 +370,9 @@ public:
case k_ASITag:
OS << "ASI tag: " << getASITag() << "\n";
break;
+ case k_PrefetchTag:
+ OS << "Prefetch tag: " << getPrefetchTag() << "\n";
+ break;
}
}
@@ -416,6 +429,11 @@ public:
Inst.addOperand(MCOperand::createImm(getASITag()));
}
+ void addPrefetchTagOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(getPrefetchTag()));
+ }
+
void addMembarTagOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCExpr *Expr = getImm();
@@ -469,6 +487,15 @@ public:
return Op;
}
+ static std::unique_ptr<SparcOperand> CreatePrefetchTag(unsigned Val, SMLoc S,
+ SMLoc E) {
+ auto Op = std::make_unique<SparcOperand>(k_PrefetchTag);
+ Op->Prefetch = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
static bool MorphToIntPairReg(SparcOperand &Op) {
unsigned Reg = Op.getReg();
assert(Op.Reg.Kind == rk_IntReg);
@@ -1088,6 +1115,44 @@ ParseStatus SparcAsmParser::parseASITag(OperandVector &Operands) {
return ParseStatus::Success;
}
+ParseStatus SparcAsmParser::parsePrefetchTag(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ SMLoc E = Parser.getTok().getEndLoc();
+ int64_t PrefetchVal = 0;
+
+ switch (getLexer().getKind()) {
+ case AsmToken::LParen:
+ case AsmToken::Integer:
+ case AsmToken::Identifier:
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Tilde:
+ if (getParser().parseAbsoluteExpression(PrefetchVal) ||
+ !isUInt<5>(PrefetchVal))
+ return Error(S, "invalid prefetch number, must be between 0 and 31");
+ break;
+ case AsmToken::Hash: {
+ SMLoc TagStart = getLexer().peekTok(false).getLoc();
+ Parser.Lex(); // Eat the '#'.
+ const StringRef PrefetchName = Parser.getTok().getString();
+ const SparcPrefetchTag::PrefetchTag *PrefetchTag =
+ SparcPrefetchTag::lookupPrefetchTagByName(PrefetchName);
+ Parser.Lex(); // Eat the identifier token.
+
+ if (!PrefetchTag)
+ return Error(TagStart, "unknown prefetch tag");
+
+ PrefetchVal = PrefetchTag->Encoding;
+ break;
+ }
+ default:
+ return ParseStatus::NoMatch;
+ }
+
+ Operands.push_back(SparcOperand::CreatePrefetchTag(PrefetchVal, S, E));
+ return ParseStatus::Success;
+}
+
ParseStatus SparcAsmParser::parseCallTarget(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
@@ -1384,12 +1449,11 @@ MCRegister SparcAsmParser::matchRegisterName(const AsmToken &Tok,
}
// JPS1 extension - aliases for ASRs
- // Section A.51 - Read State Register
+ // Section 5.2.11 - Ancillary State Registers (ASRs)
if (Name == "pcr") {
RegKind = SparcOperand::rk_Special;
return SP::ASR16;
}
-
if (Name == "pic") {
RegKind = SparcOperand::rk_Special;
return SP::ASR17;
@@ -1402,6 +1466,14 @@ MCRegister SparcAsmParser::matchRegisterName(const AsmToken &Tok,
RegKind = SparcOperand::rk_Special;
return SP::ASR19;
}
+ if (Name == "set_softint") {
+ RegKind = SparcOperand::rk_Special;
+ return SP::ASR20;
+ }
+ if (Name == "clear_softint") {
+ RegKind = SparcOperand::rk_Special;
+ return SP::ASR21;
+ }
if (Name == "softint") {
RegKind = SparcOperand::rk_Special;
return SP::ASR22;
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 240f539..cb7414f 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -323,9 +323,11 @@ namespace {
bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const override {
- // Cannot emit NOP with size not multiple of 32 bits.
- if (Count % 4 != 0)
- return false;
+
+ // If the count is not 4-byte aligned, we must be writing data into the
+ // text section (otherwise we have unaligned instructions, and thus have
+ // far bigger problems), so just write zeros instead.
+ OS.write_zeros(Count % 4);
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
index ef77648..5b407a8 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
@@ -253,3 +253,14 @@ void SparcInstPrinter::printASITag(const MCInst *MI, int opNum,
else
O << Imm;
}
+
+void SparcInstPrinter::printPrefetchTag(const MCInst *MI, int opNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(opNum).getImm();
+ auto PrefetchTag = SparcPrefetchTag::lookupPrefetchTagByEncoding(Imm);
+ if (PrefetchTag)
+ O << '#' << PrefetchTag->Name;
+ else
+ O << Imm;
+}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
index cb691a3..207a970 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
@@ -56,6 +56,8 @@ public:
raw_ostream &O);
void printASITag(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printPrefetchTag(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
+ raw_ostream &O);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 522a887..4688837 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -93,44 +93,46 @@ bool SparcMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind)
SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name)
{
return StringSwitch<SparcMCExpr::VariantKind>(name)
- .Case("lo", VK_Sparc_LO)
- .Case("hi", VK_Sparc_HI)
- .Case("h44", VK_Sparc_H44)
- .Case("m44", VK_Sparc_M44)
- .Case("l44", VK_Sparc_L44)
- .Case("hh", VK_Sparc_HH)
- .Case("hm", VK_Sparc_HM)
- .Case("lm", VK_Sparc_LM)
- .Case("pc22", VK_Sparc_PC22)
- .Case("pc10", VK_Sparc_PC10)
- .Case("got22", VK_Sparc_GOT22)
- .Case("got10", VK_Sparc_GOT10)
- .Case("got13", VK_Sparc_GOT13)
- .Case("r_disp32", VK_Sparc_R_DISP32)
- .Case("tgd_hi22", VK_Sparc_TLS_GD_HI22)
- .Case("tgd_lo10", VK_Sparc_TLS_GD_LO10)
- .Case("tgd_add", VK_Sparc_TLS_GD_ADD)
- .Case("tgd_call", VK_Sparc_TLS_GD_CALL)
- .Case("tldm_hi22", VK_Sparc_TLS_LDM_HI22)
- .Case("tldm_lo10", VK_Sparc_TLS_LDM_LO10)
- .Case("tldm_add", VK_Sparc_TLS_LDM_ADD)
- .Case("tldm_call", VK_Sparc_TLS_LDM_CALL)
- .Case("tldo_hix22", VK_Sparc_TLS_LDO_HIX22)
- .Case("tldo_lox10", VK_Sparc_TLS_LDO_LOX10)
- .Case("tldo_add", VK_Sparc_TLS_LDO_ADD)
- .Case("tie_hi22", VK_Sparc_TLS_IE_HI22)
- .Case("tie_lo10", VK_Sparc_TLS_IE_LO10)
- .Case("tie_ld", VK_Sparc_TLS_IE_LD)
- .Case("tie_ldx", VK_Sparc_TLS_IE_LDX)
- .Case("tie_add", VK_Sparc_TLS_IE_ADD)
- .Case("tle_hix22", VK_Sparc_TLS_LE_HIX22)
- .Case("tle_lox10", VK_Sparc_TLS_LE_LOX10)
- .Case("hix", VK_Sparc_HIX22)
- .Case("lox", VK_Sparc_LOX10)
- .Case("gdop_hix22", VK_Sparc_GOTDATA_HIX22)
- .Case("gdop_lox10", VK_Sparc_GOTDATA_LOX10)
- .Case("gdop", VK_Sparc_GOTDATA_OP)
- .Default(VK_Sparc_None);
+ .Case("lo", VK_Sparc_LO)
+ .Case("hi", VK_Sparc_HI)
+ .Case("h44", VK_Sparc_H44)
+ .Case("m44", VK_Sparc_M44)
+ .Case("l44", VK_Sparc_L44)
+ .Case("hh", VK_Sparc_HH)
+ .Case("uhi", VK_Sparc_HH) // Nonstandard GNU extension
+ .Case("hm", VK_Sparc_HM)
+ .Case("ulo", VK_Sparc_HM) // Nonstandard GNU extension
+ .Case("lm", VK_Sparc_LM)
+ .Case("pc22", VK_Sparc_PC22)
+ .Case("pc10", VK_Sparc_PC10)
+ .Case("got22", VK_Sparc_GOT22)
+ .Case("got10", VK_Sparc_GOT10)
+ .Case("got13", VK_Sparc_GOT13)
+ .Case("r_disp32", VK_Sparc_R_DISP32)
+ .Case("tgd_hi22", VK_Sparc_TLS_GD_HI22)
+ .Case("tgd_lo10", VK_Sparc_TLS_GD_LO10)
+ .Case("tgd_add", VK_Sparc_TLS_GD_ADD)
+ .Case("tgd_call", VK_Sparc_TLS_GD_CALL)
+ .Case("tldm_hi22", VK_Sparc_TLS_LDM_HI22)
+ .Case("tldm_lo10", VK_Sparc_TLS_LDM_LO10)
+ .Case("tldm_add", VK_Sparc_TLS_LDM_ADD)
+ .Case("tldm_call", VK_Sparc_TLS_LDM_CALL)
+ .Case("tldo_hix22", VK_Sparc_TLS_LDO_HIX22)
+ .Case("tldo_lox10", VK_Sparc_TLS_LDO_LOX10)
+ .Case("tldo_add", VK_Sparc_TLS_LDO_ADD)
+ .Case("tie_hi22", VK_Sparc_TLS_IE_HI22)
+ .Case("tie_lo10", VK_Sparc_TLS_IE_LO10)
+ .Case("tie_ld", VK_Sparc_TLS_IE_LD)
+ .Case("tie_ldx", VK_Sparc_TLS_IE_LDX)
+ .Case("tie_add", VK_Sparc_TLS_IE_ADD)
+ .Case("tle_hix22", VK_Sparc_TLS_LE_HIX22)
+ .Case("tle_lox10", VK_Sparc_TLS_LE_LOX10)
+ .Case("hix", VK_Sparc_HIX22)
+ .Case("lox", VK_Sparc_LOX10)
+ .Case("gdop_hix22", VK_Sparc_GOTDATA_HIX22)
+ .Case("gdop_lox10", VK_Sparc_GOTDATA_LOX10)
+ .Case("gdop", VK_Sparc_GOTDATA_OP)
+ .Default(VK_Sparc_None);
}
Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index fb634cc..ad6ca09 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -26,6 +26,11 @@ namespace SparcASITag {
#define GET_ASITagsList_IMPL
#include "SparcGenSearchableTables.inc"
} // end namespace SparcASITag
+
+namespace SparcPrefetchTag {
+#define GET_PrefetchTagsList_IMPL
+#include "SparcGenSearchableTables.inc"
+} // end namespace SparcPrefetchTag
} // end namespace llvm
using namespace llvm;
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index fd76627..a2a9f74 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -48,6 +48,17 @@ struct ASITag {
#define GET_ASITagsList_DECL
#include "SparcGenSearchableTables.inc"
} // end namespace SparcASITag
+
+// Defines symbolic names for Sparc v9 prefetch tag names.
+namespace SparcPrefetchTag {
+struct PrefetchTag {
+ const char *Name;
+ unsigned Encoding;
+};
+
+#define GET_PrefetchTagsList_DECL
+#include "SparcGenSearchableTables.inc"
+} // end namespace SparcPrefetchTag
} // End llvm namespace
// Defines symbolic names for Sparc registers. This defines a mapping from
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index 45cf985..65f372f 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
include "llvm/Target/Target.td"
+include "llvm/TableGen/SearchableTable.td"
//===----------------------------------------------------------------------===//
// SPARC Subtarget features.
@@ -91,6 +92,7 @@ foreach i = 0 ... 5 in
//===----------------------------------------------------------------------===//
include "SparcASITags.td"
+include "SparcPrefetchTags.td"
include "SparcRegisterInfo.td"
include "SparcCallingConv.td"
include "SparcSchedule.td"
diff --git a/llvm/lib/Target/Sparc/SparcASITags.td b/llvm/lib/Target/Sparc/SparcASITags.td
index 115e41b..4b2d17b 100644
--- a/llvm/lib/Target/Sparc/SparcASITags.td
+++ b/llvm/lib/Target/Sparc/SparcASITags.td
@@ -11,8 +11,6 @@
//
//===----------------------------------------------------------------------===//
-include "llvm/TableGen/SearchableTable.td"
-
class ASITag<string name, string alt_name, bits<8> op> {
string Name = name;
// A maximum of one alias is supported right now.
diff --git a/llvm/lib/Target/Sparc/SparcInstrAliases.td b/llvm/lib/Target/Sparc/SparcInstrAliases.td
index db4c05c..eedad25 100644
--- a/llvm/lib/Target/Sparc/SparcInstrAliases.td
+++ b/llvm/lib/Target/Sparc/SparcInstrAliases.td
@@ -400,9 +400,11 @@ defm : cp_cond_alias<"012", 0b1111>;
let EmitPriority = 0 in defm : cp_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
defm : reg_cond_alias<"z", 0b001>;
+defm : reg_cond_alias<"e", 0b001>;
defm : reg_cond_alias<"lez", 0b010>;
defm : reg_cond_alias<"lz", 0b011>;
defm : reg_cond_alias<"nz", 0b101>;
+defm : reg_cond_alias<"ne", 0b101>;
defm : reg_cond_alias<"gz", 0b110>;
defm : reg_cond_alias<"gez", 0b111>;
@@ -560,11 +562,16 @@ def : InstAlias<"mov $simm13, %tbr", (WRTBRri G0, simm13Op:$simm13), 0>;
// End of Section A.3
-// or imm, reg, rd -> or reg, imm, rd
-// Nonstandard GNU extension.
-let EmitPriority = 0 in
+
+// Nonstandard GNU extensions.
+let EmitPriority = 0 in {
+ // or imm, reg, rd -> or reg, imm, rd
def : InstAlias<"or $simm13, $rs1, $rd", (ORri IntRegs:$rd, IntRegs:$rs1, simm13Op:$simm13)>;
+ // addc/addx imm, reg, rd -> addx reg, imm, rd
+ def : InstAlias<"addx $simm13, $rs1, $rd", (ADDCri IntRegs:$rd, IntRegs:$rs1, simm13Op:$simm13)>;
+}
+
// wr reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg
// (aka: omit the first arg when it's g0. This is not in the manual, but is
// supported by gnu and solaris as)
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 4d68f93..cac96a1 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -197,6 +197,16 @@ def ASITag : Operand<i32> {
let ParserMatchClass = SparcASITagAsmOperand;
}
+def SparcPrefetchTagAsmOperand : AsmOperandClass {
+ let Name = "PrefetchTag";
+ let ParserMethod = "parsePrefetchTag";
+}
+
+def PrefetchTag : Operand<i32> {
+ let PrintMethod = "printPrefetchTag";
+ let ParserMatchClass = SparcPrefetchTagAsmOperand;
+}
+
// Branch targets have OtherVT type.
def brtarget : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
@@ -1767,11 +1777,18 @@ let Predicates = [HasV9], rs1 = 0, rs2 = 0 in {
// Section A.42 - Prefetch Data
let Predicates = [HasV9] in {
def PREFETCHr : F3_1<3, 0b101101,
- (outs), (ins (MEMrr $rs1, $rs2):$addr, shift_imm5:$rd),
+ (outs), (ins (MEMrr $rs1, $rs2):$addr, PrefetchTag:$rd),
"prefetch [$addr], $rd", []>;
def PREFETCHi : F3_2<3, 0b101101,
- (outs), (ins (MEMri $rs1, $simm13):$addr, shift_imm5:$rd),
+ (outs), (ins (MEMri $rs1, $simm13):$addr, PrefetchTag:$rd),
"prefetch [$addr], $rd", []>;
+ def PREFETCHAr : F3_1_asi<3, 0b111101, (outs),
+ (ins (MEMrr $rs1, $rs2):$addr, ASITag:$asi, PrefetchTag:$rd),
+ "prefetcha [$addr] $asi, $rd", []>;
+ let Uses = [ASR3] in
+ def PREFETCHAi : F3_2<3, 0b111101, (outs),
+ (ins (MEMri $rs1, $simm13):$addr, PrefetchTag:$rd),
+ "prefetcha [$addr] %asi, $rd", []>;
}
diff --git a/llvm/lib/Target/Sparc/SparcPrefetchTags.td b/llvm/lib/Target/Sparc/SparcPrefetchTags.td
new file mode 100644
index 0000000..0104f47
--- /dev/null
+++ b/llvm/lib/Target/Sparc/SparcPrefetchTags.td
@@ -0,0 +1,41 @@
+//===- SparcPrefetchTags.td --------------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the symbolic operands permitted for various kinds of
+// SPARCv9 prefetches.
+//
+//===----------------------------------------------------------------------===//
+
+class PrefetchTag<string name, bits<8> op> {
+ string Name = name;
+ bits<8> Encoding = op;
+}
+
+def PrefetchTagsList : GenericTable {
+ let FilterClass = "PrefetchTag";
+ let Fields = ["Name", "Encoding"];
+
+ let PrimaryKey = [ "Encoding" ];
+ let PrimaryKeyName = "lookupPrefetchTagByEncoding";
+}
+
+def lookupPrefetchTagByName : SearchIndex {
+ let Table = PrefetchTagsList;
+ let Key = [ "Name" ];
+}
+
+def : PrefetchTag<"n_reads", 0x0>;
+def : PrefetchTag<"one_read", 0x1>;
+def : PrefetchTag<"n_writes", 0x2>;
+def : PrefetchTag<"one_write", 0x3>;
+def : PrefetchTag<"page", 0x4>;
+def : PrefetchTag<"unified", 0x11>;
+def : PrefetchTag<"n_reads_strong", 0x14>;
+def : PrefetchTag<"one_read_strong", 0x15>;
+def : PrefetchTag<"n_writes_strong", 0x16>;
+def : PrefetchTag<"one_write_strong", 0x17>;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 270dd32..6fb6e16 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1125,7 +1125,9 @@ multiclass ATOMIC_LOAD_FP_BINOP_MI<string Name, SDNode op> {
Requires<[HasAVX512]>;
}
defm : ATOMIC_LOAD_FP_BINOP_MI<"ADD", fadd>;
-// FIXME: Add fsub, fmul, fdiv, ...
+defm : ATOMIC_LOAD_FP_BINOP_MI<"SUB", fsub>;
+defm : ATOMIC_LOAD_FP_BINOP_MI<"MUL", fmul>;
+defm : ATOMIC_LOAD_FP_BINOP_MI<"DIV", fdiv>;
multiclass RELEASE_UNOP<string Name, dag dag8, dag dag16, dag dag32,
dag dag64> {
@@ -1570,21 +1572,21 @@ let Predicates = [HasNDD] in {
}
// Depositing value to 8/16 bit subreg:
-def : Pat<(or (and GR64:$dst, -256),
+def : Pat<(or (and GR64:$dst, -256),
(i64 (zextloadi8 addr:$src))),
- (INSERT_SUBREG (i64 (COPY $dst)), (MOV8rm i8mem:$src), sub_8bit)>;
+ (INSERT_SUBREG (i64 (COPY $dst)), (MOV8rm i8mem:$src), sub_8bit)>;
-def : Pat<(or (and GR32:$dst, -256),
+def : Pat<(or (and GR32:$dst, -256),
(i32 (zextloadi8 addr:$src))),
- (INSERT_SUBREG (i32 (COPY $dst)), (MOV8rm i8mem:$src), sub_8bit)>;
+ (INSERT_SUBREG (i32 (COPY $dst)), (MOV8rm i8mem:$src), sub_8bit)>;
-def : Pat<(or (and GR64:$dst, -65536),
+def : Pat<(or (and GR64:$dst, -65536),
(i64 (zextloadi16 addr:$src))),
(INSERT_SUBREG (i64 (COPY $dst)), (MOV16rm i16mem:$src), sub_16bit)>;
-def : Pat<(or (and GR32:$dst, -65536),
+def : Pat<(or (and GR32:$dst, -65536),
(i32 (zextloadi16 addr:$src))),
- (INSERT_SUBREG (i32 (COPY $dst)), (MOV16rm i16mem:$src), sub_16bit)>;
+ (INSERT_SUBREG (i32 (COPY $dst)), (MOV16rm i16mem:$src), sub_16bit)>;
// To avoid needing to materialize an immediate in a register, use a 32-bit and
// with implicit zero-extension instead of a 64-bit and if the immediate has at
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td
index 496a7e6..c4da0e5 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This file defining the misc X86 instructions.
-//
+//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index ca356ec..d1cc306 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -122,6 +122,14 @@ AArch64::parseArchExtension(StringRef ArchExt) {
return {};
}
+std::optional<AArch64::ExtensionInfo>
+AArch64::targetFeatureToExtension(StringRef TargetFeature) {
+ for (const auto &E : Extensions)
+ if (TargetFeature == E.Feature)
+ return E;
+ return {};
+}
+
std::optional<AArch64::CpuInfo> AArch64::parseCpu(StringRef Name) {
// Resolve aliases first.
Name = resolveCPUAlias(Name);
@@ -213,21 +221,6 @@ void AArch64::ExtensionSet::disable(ArchExtKind E) {
disable(Dep.Later);
}
-void AArch64::ExtensionSet::toLLVMFeatureList(
- std::vector<StringRef> &Features) const {
- if (BaseArch && !BaseArch->ArchFeature.empty())
- Features.push_back(BaseArch->ArchFeature);
-
- for (const auto &E : Extensions) {
- if (E.Feature.empty() || !Touched.test(E.ID))
- continue;
- if (Enabled.test(E.ID))
- Features.push_back(E.Feature);
- else
- Features.push_back(E.NegFeature);
- }
-}
-
void AArch64::ExtensionSet::addCPUDefaults(const CpuInfo &CPU) {
LLVM_DEBUG(llvm::dbgs() << "addCPUDefaults(" << CPU.Name << ")\n");
BaseArch = &CPU.Arch;
@@ -247,11 +240,18 @@ void AArch64::ExtensionSet::addArchDefaults(const ArchInfo &Arch) {
enable(E.ID);
}
-bool AArch64::ExtensionSet::parseModifier(StringRef Modifier) {
+bool AArch64::ExtensionSet::parseModifier(StringRef Modifier,
+ const bool AllowNoDashForm) {
LLVM_DEBUG(llvm::dbgs() << "parseModifier(" << Modifier << ")\n");
- bool IsNegated = Modifier.starts_with("no");
- StringRef ArchExt = IsNegated ? Modifier.drop_front(2) : Modifier;
+ size_t NChars = 0;
+ // The "no-feat" form is allowed in the target attribute but nowhere else.
+ if (AllowNoDashForm && Modifier.starts_with("no-"))
+ NChars = 3;
+ else if (Modifier.starts_with("no"))
+ NChars = 2;
+ bool IsNegated = NChars != 0;
+ StringRef ArchExt = Modifier.drop_front(NChars);
if (auto AE = parseArchExtension(ArchExt)) {
if (AE->Feature.empty() || AE->NegFeature.empty())
@@ -265,6 +265,21 @@ bool AArch64::ExtensionSet::parseModifier(StringRef Modifier) {
return false;
}
+void AArch64::ExtensionSet::reconstructFromParsedFeatures(
+ const std::vector<std::string> &Features) {
+ assert(Touched.none() && "Bitset already initialized");
+ for (auto &F : Features) {
+ bool IsNegated = F[0] == '-';
+ if (auto AE = targetFeatureToExtension(F)) {
+ Touched.set(AE->ID);
+ if (IsNegated)
+ Enabled.reset(AE->ID);
+ else
+ Enabled.set(AE->ID);
+ }
+ }
+}
+
const AArch64::ExtensionInfo &
AArch64::getExtensionByID(AArch64::ArchExtKind ExtID) {
return lookupExtensionByID(ExtID);
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 92ad4c3..7e6a881 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1662,7 +1662,8 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
else if (OverwriteExistingWeights)
I.setMetadata(LLVMContext::MD_prof, nullptr);
} else if (!isa<IntrinsicInst>(&I)) {
- setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])});
+ setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])},
+ /*IsExpected=*/false);
}
}
} else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) {
@@ -1673,7 +1674,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
if (cast<CallBase>(I).isIndirectCall()) {
I.setMetadata(LLVMContext::MD_prof, nullptr);
} else {
- setBranchWeights(I, {uint32_t(0)});
+ setBranchWeights(I, {uint32_t(0)}, /*IsExpected=*/false);
}
}
}
@@ -1756,7 +1757,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
if (MaxWeight > 0 &&
(!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
- setBranchWeights(*TI, Weights);
+ setBranchWeights(*TI, Weights, /*IsExpected=*/false);
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
<< "most popular destination for conditional branches at "
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 8205b49..0a73c58 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -905,8 +905,14 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
// (X | Op01C) + Op1C --> X + (Op01C + Op1C) iff the `or` is actually an `add`
Constant *Op01C;
- if (match(Op0, m_DisjointOr(m_Value(X), m_ImmConstant(Op01C))))
- return BinaryOperator::CreateAdd(X, ConstantExpr::getAdd(Op01C, Op1C));
+ if (match(Op0, m_DisjointOr(m_Value(X), m_ImmConstant(Op01C)))) {
+ BinaryOperator *NewAdd =
+ BinaryOperator::CreateAdd(X, ConstantExpr::getAdd(Op01C, Op1C));
+ NewAdd->setHasNoSignedWrap(Add.hasNoSignedWrap() &&
+ willNotOverflowSignedAdd(Op01C, Op1C, Add));
+ NewAdd->setHasNoUnsignedWrap(Add.hasNoUnsignedWrap());
+ return NewAdd;
+ }
// (X | C2) + C --> (X | C2) ^ C2 iff (C2 == -C)
const APInt *C2;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 0632f3c..4346a07 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2618,6 +2618,16 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
}
+ // ldexp(x, zext(i1 y)) -> fmul x, (select y, 2.0, 1.0)
+ Value *ExtSrc;
+ if (match(Exp, m_ZExt(m_Value(ExtSrc))) &&
+ ExtSrc->getType()->getScalarSizeInBits() == 1) {
+ Value *Select =
+ Builder.CreateSelect(ExtSrc, ConstantFP::get(II->getType(), 2.0),
+ ConstantFP::get(II->getType(), 1.0));
+ return BinaryOperator::CreateFMulFMF(Src, Select, II);
+ }
+
break;
}
case Intrinsic::ptrauth_auth:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 4203147..34b0f8b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2441,9 +2441,10 @@ Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp,
Type *TruncTy = ShType->getWithNewBitWidth(TypeBits - Amt);
Constant *NewC =
ConstantInt::get(TruncTy, RHSC.ashr(*ShiftAmt).trunc(TypeBits - Amt));
- return new ICmpInst(
- CmpPred, Builder.CreateTrunc(X, TruncTy, "", Shl->hasNoSignedWrap()),
- NewC);
+ return new ICmpInst(CmpPred,
+ Builder.CreateTrunc(X, TruncTy, "", /*IsNUW=*/false,
+ Shl->hasNoSignedWrap()),
+ NewC);
}
}
@@ -5548,8 +5549,8 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
}
// (X&Z) == (Y&Z) -> (X^Y) & Z == 0
- if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
- match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_And(m_Value(C), m_Value(D)))) {
Value *X = nullptr, *Y = nullptr, *Z = nullptr;
if (A == C) {
@@ -5570,10 +5571,26 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
Z = B;
}
- if (X) { // Build (X^Y) & Z
- Op1 = Builder.CreateXor(X, Y);
- Op1 = Builder.CreateAnd(Op1, Z);
- return new ICmpInst(Pred, Op1, Constant::getNullValue(Op1->getType()));
+ if (X) {
+ // If X^Y is a negative power of two, then `icmp eq/ne (Z & NegP2), 0`
+ // will fold to `icmp ult/uge Z, -NegP2` incurringb no additional
+ // instructions.
+ const APInt *C0, *C1;
+ bool XorIsNegP2 = match(X, m_APInt(C0)) && match(Y, m_APInt(C1)) &&
+ (*C0 ^ *C1).isNegatedPowerOf2();
+
+ // If either Op0/Op1 are both one use or X^Y will constant fold and one of
+ // Op0/Op1 are one use, proceed. In those cases we are instruction neutral
+ // but `icmp eq/ne A, 0` is easier to analyze than `icmp eq/ne A, B`.
+ int UseCnt =
+ int(Op0->hasOneUse()) + int(Op1->hasOneUse()) +
+ (int(match(X, m_ImmConstant()) && match(Y, m_ImmConstant())));
+ if (XorIsNegP2 || UseCnt >= 2) {
+ // Build (X^Y) & Z
+ Op1 = Builder.CreateXor(X, Y);
+ Op1 = Builder.CreateAnd(Op1, Z);
+ return new ICmpInst(Pred, Op1, Constant::getNullValue(Op1->getType()));
+ }
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 9ff817d..4a014ab 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -767,11 +767,20 @@ Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *C1,
// (C2 >> X) >> C1 --> (C2 >> C1) >> X
Constant *C2;
Value *X;
- if (match(Op0, m_BinOp(I.getOpcode(), m_ImmConstant(C2), m_Value(X))))
- return BinaryOperator::Create(
+ bool IsLeftShift = I.getOpcode() == Instruction::Shl;
+ if (match(Op0, m_BinOp(I.getOpcode(), m_ImmConstant(C2), m_Value(X)))) {
+ Instruction *R = BinaryOperator::Create(
I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), C2, C1), X);
+ BinaryOperator *BO0 = cast<BinaryOperator>(Op0);
+ if (IsLeftShift) {
+ R->setHasNoUnsignedWrap(I.hasNoUnsignedWrap() &&
+ BO0->hasNoUnsignedWrap());
+ R->setHasNoSignedWrap(I.hasNoSignedWrap() && BO0->hasNoSignedWrap());
+ } else
+ R->setIsExact(I.isExact() && BO0->isExact());
+ return R;
+ }
- bool IsLeftShift = I.getOpcode() == Instruction::Shl;
Type *Ty = I.getType();
unsigned TypeBits = Ty->getScalarSizeInBits();
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 0a3d8d6..731104d 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -1878,7 +1878,7 @@ void CHR::fixupBranchesAndSelects(CHRScope *Scope,
static_cast<uint32_t>(CHRBranchBias.scale(1000)),
static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)),
};
- setBranchWeights(*MergedBR, Weights);
+ setBranchWeights(*MergedBR, Weights, /*IsExpected=*/false);
CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1]
<< "\n");
}
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 23a7c6a..6db76ca 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -259,7 +259,8 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights);
if (AttachProfToDirectCall) {
- setBranchWeights(NewInst, {static_cast<uint32_t>(Count)});
+ setBranchWeights(NewInst, {static_cast<uint32_t>(Count)},
+ /*IsExpected=*/false);
}
using namespace ore;
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 2269c2e..ac6d334 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1474,7 +1474,8 @@ void PGOUseFunc::populateCoverage(IndexedInstrProfReader *PGOReader) {
for (auto *Succ : successors(&BB))
Weights.push_back((Coverage[Succ] || !Coverage[&BB]) ? 1 : 0);
if (Weights.size() >= 2)
- llvm::setBranchWeights(*BB.getTerminator(), Weights);
+ llvm::setBranchWeights(*BB.getTerminator(), Weights,
+ /*IsExpected=*/false);
}
unsigned NumCorruptCoverage = 0;
@@ -2260,7 +2261,7 @@ void llvm::setProfMetadata(Module *M, Instruction *TI,
misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
- setBranchWeights(*TI, Weights);
+ setBranchWeights(*TI, Weights, /*IsExpected=*/false);
if (EmitBranchProbability) {
std::string BrCondStr = getBranchCondString(TI);
if (BrCondStr.empty())
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index fd0f69e..fa93f4b 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -402,8 +402,7 @@ bool MemOPSizeOpt::perform(MemOp MO) {
// If all promoted, we don't need the MD.prof metadata.
if (SavedRemainCount > 0 || Version != NumVals) {
// Otherwise we need update with the un-promoted records back.
- ArrayRef<InstrProfValueData> RemVDs(RemainingVDs);
- annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount,
+ annotateValueSite(*Func.getParent(), *MO.I, RemainingVDs, SavedRemainCount,
IPVK_MemOPSize, NumVals);
}
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 74a8f19..b958383 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -231,7 +231,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
Weights[0] = BP.getCompl().getNumerator();
Weights[1] = BP.getNumerator();
}
- setBranchWeights(*PredBr, Weights);
+ setBranchWeights(*PredBr, Weights, hasBranchWeightOrigin(*PredBr));
}
}
@@ -2618,7 +2618,7 @@ void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
Weights.push_back(Prob.getNumerator());
auto TI = BB->getTerminator();
- setBranchWeights(*TI, Weights);
+ setBranchWeights(*TI, Weights, hasBranchWeightOrigin(*TI));
}
}
diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 6f87e4d..17c5a4e 100644
--- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -102,7 +102,7 @@ static bool handleSwitchExpect(SwitchInst &SI) {
misexpect::checkExpectAnnotations(SI, Weights, /*IsFrontend=*/true);
SI.setCondition(ArgValue);
- setBranchWeights(SI, Weights);
+ setBranchWeights(SI, Weights, /*IsExpected=*/true);
return true;
}
@@ -262,11 +262,13 @@ static void handlePhiDef(CallInst *Expect) {
if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
BI->setMetadata(LLVMContext::MD_prof,
MDB.createBranchWeights(LikelyBranchWeightVal,
- UnlikelyBranchWeightVal));
+ UnlikelyBranchWeightVal,
+ /*IsExpected=*/true));
else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
BI->setMetadata(LLVMContext::MD_prof,
MDB.createBranchWeights(UnlikelyBranchWeightVal,
- LikelyBranchWeightVal));
+ LikelyBranchWeightVal,
+ /*IsExpected=*/true));
}
}
@@ -331,12 +333,12 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
SmallVector<uint32_t, 4> ExpectedWeights;
if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
(Predicate == CmpInst::ICMP_EQ)) {
- Node =
- MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal);
+ Node = MDB.createBranchWeights(
+ LikelyBranchWeightVal, UnlikelyBranchWeightVal, /*IsExpected=*/true);
ExpectedWeights = {LikelyBranchWeightVal, UnlikelyBranchWeightVal};
} else {
- Node =
- MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal);
+ Node = MDB.createBranchWeights(UnlikelyBranchWeightVal,
+ LikelyBranchWeightVal, /*IsExpected=*/true);
ExpectedWeights = {UnlikelyBranchWeightVal, LikelyBranchWeightVal};
}
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index c73d7c8..f36e21b 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -302,98 +302,7 @@ static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
return Res;
}
-/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
-/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
-/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
-/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
-/// even x in Bitwidth-bit arithmetic.
-static unsigned CarmichaelShift(unsigned Bitwidth) {
- if (Bitwidth < 3)
- return Bitwidth - 1;
- return Bitwidth - 2;
-}
-
-/// Add the extra weight 'RHS' to the existing weight 'LHS',
-/// reducing the combined weight using any special properties of the operation.
-/// The existing weight LHS represents the computation X op X op ... op X where
-/// X occurs LHS times. The combined weight represents X op X op ... op X with
-/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
-/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
-/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
-static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
- // If we were working with infinite precision arithmetic then the combined
- // weight would be LHS + RHS. But we are using finite precision arithmetic,
- // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
- // for nilpotent operations and addition, but not for idempotent operations
- // and multiplication), so it is important to correctly reduce the combined
- // weight back into range if wrapping would be wrong.
-
- // If RHS is zero then the weight didn't change.
- if (RHS.isMinValue())
- return;
- // If LHS is zero then the combined weight is RHS.
- if (LHS.isMinValue()) {
- LHS = RHS;
- return;
- }
- // From this point on we know that neither LHS nor RHS is zero.
-
- if (Instruction::isIdempotent(Opcode)) {
- // Idempotent means X op X === X, so any non-zero weight is equivalent to a
- // weight of 1. Keeping weights at zero or one also means that wrapping is
- // not a problem.
- assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
- return; // Return a weight of 1.
- }
- if (Instruction::isNilpotent(Opcode)) {
- // Nilpotent means X op X === 0, so reduce weights modulo 2.
- assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
- LHS = 0; // 1 + 1 === 0 modulo 2.
- return;
- }
- if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
- // TODO: Reduce the weight by exploiting nsw/nuw?
- LHS += RHS;
- return;
- }
-
- assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
- "Unknown associative operation!");
- unsigned Bitwidth = LHS.getBitWidth();
- // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
- // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
- // bit number x, since either x is odd in which case x^CM = 1, or x is even in
- // which case both x^W and x^(W - CM) are zero. By subtracting off multiples
- // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
- // which by a happy accident means that they can always be represented using
- // Bitwidth bits.
- // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
- // the Carmichael number).
- if (Bitwidth > 3) {
- /// CM - The value of Carmichael's lambda function.
- APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
- // Any weight W >= Threshold can be replaced with W - CM.
- APInt Threshold = CM + Bitwidth;
- assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
- // For Bitwidth 4 or more the following sum does not overflow.
- LHS += RHS;
- while (LHS.uge(Threshold))
- LHS -= CM;
- } else {
- // To avoid problems with overflow do everything the same as above but using
- // a larger type.
- unsigned CM = 1U << CarmichaelShift(Bitwidth);
- unsigned Threshold = CM + Bitwidth;
- assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
- "Weights not reduced!");
- unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
- while (Total >= Threshold)
- Total -= CM;
- LHS = Total;
- }
-}
-
-using RepeatedValue = std::pair<Value*, APInt>;
+using RepeatedValue = std::pair<Value *, uint64_t>;
/// Given an associative binary expression, return the leaf
/// nodes in Ops along with their weights (how many times the leaf occurs). The
@@ -475,7 +384,6 @@ static bool LinearizeExprTree(Instruction *I,
assert((isa<UnaryOperator>(I) || isa<BinaryOperator>(I)) &&
"Expected a UnaryOperator or BinaryOperator!");
LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
- unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
unsigned Opcode = I->getOpcode();
assert(I->isAssociative() && I->isCommutative() &&
"Expected an associative and commutative operation!");
@@ -490,8 +398,8 @@ static bool LinearizeExprTree(Instruction *I,
// with their weights, representing a certain number of paths to the operator.
// If an operator occurs in the worklist multiple times then we found multiple
// ways to get to it.
- SmallVector<std::pair<Instruction*, APInt>, 8> Worklist; // (Op, Weight)
- Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
+ SmallVector<std::pair<Instruction *, uint64_t>, 8> Worklist; // (Op, Weight)
+ Worklist.push_back(std::make_pair(I, 1));
bool Changed = false;
// Leaves of the expression are values that either aren't the right kind of
@@ -509,7 +417,7 @@ static bool LinearizeExprTree(Instruction *I,
// Leaves - Keeps track of the set of putative leaves as well as the number of
// paths to each leaf seen so far.
- using LeafMap = DenseMap<Value *, APInt>;
+ using LeafMap = DenseMap<Value *, uint64_t>;
LeafMap Leaves; // Leaf -> Total weight so far.
SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order.
const DataLayout DL = I->getModule()->getDataLayout();
@@ -518,8 +426,8 @@ static bool LinearizeExprTree(Instruction *I,
SmallPtrSet<Value *, 8> Visited; // For checking the iteration scheme.
#endif
while (!Worklist.empty()) {
- std::pair<Instruction*, APInt> P = Worklist.pop_back_val();
- I = P.first; // We examine the operands of this binary operator.
+ // We examine the operands of this binary operator.
+ auto [I, Weight] = Worklist.pop_back_val();
if (isa<OverflowingBinaryOperator>(I)) {
Flags.HasNUW &= I->hasNoUnsignedWrap();
@@ -528,7 +436,6 @@ static bool LinearizeExprTree(Instruction *I,
for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands.
Value *Op = I->getOperand(OpIdx);
- APInt Weight = P.second; // Number of paths to this operand.
LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
assert(!Op->use_empty() && "No uses, so how did we get to it?!");
@@ -562,7 +469,8 @@ static bool LinearizeExprTree(Instruction *I,
"In leaf map but not visited!");
// Update the number of paths to the leaf.
- IncorporateWeight(It->second, Weight, Opcode);
+ It->second += Weight;
+ assert(It->second >= Weight && "Weight overflows");
// If we still have uses that are not accounted for by the expression
// then it is not safe to modify the value.
@@ -625,10 +533,7 @@ static bool LinearizeExprTree(Instruction *I,
// Node initially thought to be a leaf wasn't.
continue;
assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
- APInt Weight = It->second;
- if (Weight.isMinValue())
- // Leaf already output or weight reduction eliminated it.
- continue;
+ uint64_t Weight = It->second;
// Ensure the leaf is only output once.
It->second = 0;
Ops.push_back(std::make_pair(V, Weight));
@@ -642,7 +547,7 @@ static bool LinearizeExprTree(Instruction *I,
if (Ops.empty()) {
Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
assert(Identity && "Associative operation without identity!");
- Ops.emplace_back(Identity, APInt(Bitwidth, 1));
+ Ops.emplace_back(Identity, 1);
}
return Changed;
@@ -1188,8 +1093,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
Factors.reserve(Tree.size());
for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
RepeatedValue E = Tree[i];
- Factors.append(E.second.getZExtValue(),
- ValueEntry(getRank(E.first), E.first));
+ Factors.append(E.second, ValueEntry(getRank(E.first), E.first));
}
bool FoundFactor = false;
@@ -2368,7 +2272,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
SmallVector<ValueEntry, 8> Ops;
Ops.reserve(Tree.size());
for (const RepeatedValue &E : Tree)
- Ops.append(E.second.getZExtValue(), ValueEntry(getRank(E.first), E.first));
+ Ops.append(E.second, ValueEntry(getRank(E.first), E.first));
LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index ce0f4c7..1222912 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -231,7 +231,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
// Remove weight for this case.
std::swap(Weights[Idx + 1], Weights.back());
Weights.pop_back();
- setBranchWeights(*SI, Weights);
+ setBranchWeights(*SI, Weights, hasBranchWeightOrigin(MD));
}
// Remove this entry.
BasicBlock *ParentBB = SI->getParent();
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index e251693..d517ec3 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -680,7 +680,7 @@ struct WeightInfo {
/// To avoid dealing with division rounding we can just multiple both part
/// of weights to E and use weight as (F - I * E, E).
static void updateBranchWeights(Instruction *Term, WeightInfo &Info) {
- setBranchWeights(*Term, Info.Weights);
+ setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
for (auto [Idx, SubWeight] : enumerate(Info.SubWeights))
if (SubWeight != 0)
// Don't set the probability of taking the edge from latch to loop header
@@ -1073,7 +1073,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
}
for (const auto &[Term, Info] : Weights) {
- setBranchWeights(*Term, Info.Weights);
+ setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
}
// Update Metadata for count of peeled off iterations.
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 3d950b1..04042e7 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -390,13 +390,13 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
SuccsSwapped ? LoopBackWeight : ExitWeight1,
SuccsSwapped ? ExitWeight1 : LoopBackWeight,
};
- setBranchWeights(LoopBI, LoopBIWeights);
+ setBranchWeights(LoopBI, LoopBIWeights, /*IsExpected=*/false);
if (HasConditionalPreHeader) {
const uint32_t PreHeaderBIWeights[] = {
SuccsSwapped ? EnterWeight : ExitWeight0,
SuccsSwapped ? ExitWeight0 : EnterWeight,
};
- setBranchWeights(PreHeaderBI, PreHeaderBIWeights);
+ setBranchWeights(PreHeaderBI, PreHeaderBIWeights, /*IsExpected=*/false);
}
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index fe6ec88..107c8bb 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -861,26 +861,28 @@ static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
// Set branch weights on SwitchInst. This sets the metadata if there is at
// least one non-zero weight.
-static void setBranchWeights(SwitchInst *SI, ArrayRef<uint32_t> Weights) {
+static void setBranchWeights(SwitchInst *SI, ArrayRef<uint32_t> Weights,
+ bool IsExpected) {
// Check that there is at least one non-zero weight. Otherwise, pass
// nullptr to setMetadata which will erase the existing metadata.
MDNode *N = nullptr;
if (llvm::any_of(Weights, [](uint32_t W) { return W != 0; }))
- N = MDBuilder(SI->getParent()->getContext()).createBranchWeights(Weights);
+ N = MDBuilder(SI->getParent()->getContext())
+ .createBranchWeights(Weights, IsExpected);
SI->setMetadata(LLVMContext::MD_prof, N);
}
// Similar to the above, but for branch and select instructions that take
// exactly 2 weights.
static void setBranchWeights(Instruction *I, uint32_t TrueWeight,
- uint32_t FalseWeight) {
+ uint32_t FalseWeight, bool IsExpected) {
assert(isa<BranchInst>(I) || isa<SelectInst>(I));
// Check that there is at least one non-zero weight. Otherwise, pass
// nullptr to setMetadata which will erase the existing metadata.
MDNode *N = nullptr;
if (TrueWeight || FalseWeight)
N = MDBuilder(I->getParent()->getContext())
- .createBranchWeights(TrueWeight, FalseWeight);
+ .createBranchWeights(TrueWeight, FalseWeight, IsExpected);
I->setMetadata(LLVMContext::MD_prof, N);
}
@@ -1338,7 +1340,7 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
- setBranchWeights(NewSI, MDWeights);
+ setBranchWeights(NewSI, MDWeights, /*IsExpected=*/false);
}
EraseTerminatorAndDCECond(PTI);
@@ -3831,7 +3833,7 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
FitWeights(NewWeights);
SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(), NewWeights.end());
- setBranchWeights(PBI, MDWeights[0], MDWeights[1]);
+ setBranchWeights(PBI, MDWeights[0], MDWeights[1], /*IsExpected=*/false);
// TODO: If BB is reachable from all paths through PredBlock, then we
// could replace PBI's branch probabilities with BI's.
@@ -4568,7 +4570,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// Halve the weights if any of them cannot fit in an uint32_t
FitWeights(NewWeights);
- setBranchWeights(PBI, NewWeights[0], NewWeights[1]);
+ setBranchWeights(PBI, NewWeights[0], NewWeights[1], /*IsExpected=*/false);
}
// OtherDest may have phi nodes. If so, add an entry from PBI's
@@ -4604,7 +4606,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
FitWeights(NewWeights);
- setBranchWeights(NV, NewWeights[0], NewWeights[1]);
+ setBranchWeights(NV, NewWeights[0], NewWeights[1],
+ /*IsExpected=*/false);
}
}
}
@@ -4667,7 +4670,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
// Create a conditional branch sharing the condition of the select.
BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
if (TrueWeight != FalseWeight)
- setBranchWeights(NewBI, TrueWeight, FalseWeight);
+ setBranchWeights(NewBI, TrueWeight, FalseWeight, /*IsExpected=*/false);
}
} else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
// Neither of the selected blocks were successors, so this
@@ -5617,7 +5620,7 @@ bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
TrueWeight /= 2;
FalseWeight /= 2;
}
- setBranchWeights(NewBI, TrueWeight, FalseWeight);
+ setBranchWeights(NewBI, TrueWeight, FalseWeight, /*IsExpected=*/false);
}
}
@@ -6743,8 +6746,25 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
TableSize =
(MaxCaseVal->getValue() - MinCaseVal->getValue()).getLimitedValue() + 1;
+ // If the default destination is unreachable, or if the lookup table covers
+ // all values of the conditional variable, branch directly to the lookup table
+ // BB. Otherwise, check that the condition is within the case range.
+ bool DefaultIsReachable = !SI->defaultDestUndefined();
+
bool TableHasHoles = (NumResults < TableSize);
- bool NeedMask = (TableHasHoles && !HasDefaultResults);
+
+ // If the table has holes but the default destination doesn't produce any
+ // constant results, the lookup table entries corresponding to the holes will
+ // contain undefined values.
+ bool AllHolesAreUndefined = TableHasHoles && !HasDefaultResults;
+
+ // If the default destination doesn't produce a constant result but is still
+ // reachable, and the lookup table has holes, we need to use a mask to
+ // determine if the current index should load from the lookup table or jump
+ // to the default case.
+ // The mask is unnecessary if the table has holes but the default destination
+ // is unreachable, as in that case the holes must also be unreachable.
+ bool NeedMask = AllHolesAreUndefined && DefaultIsReachable;
if (NeedMask) {
// As an extra penalty for the validity test we require more cases.
if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
@@ -6766,12 +6786,6 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
"It is impossible for a switch to have more entries than the max "
"representable value of its input integer type's size.");
- // If the default destination is unreachable, or if the lookup table covers
- // all values of the conditional variable, branch directly to the lookup table
- // BB. Otherwise, check that the condition is within the case range.
- bool DefaultIsReachable =
- !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
-
// Create the BB that does the lookups.
Module &Mod = *CommonDest->getParent()->getParent();
BasicBlock *LookupBB = BasicBlock::Create(
@@ -6895,8 +6909,9 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
for (PHINode *PHI : PHIs) {
const ResultListTy &ResultList = ResultLists[PHI];
- // If using a bitmask, use any value to fill the lookup table holes.
- Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI];
+ // Use any value to fill the lookup table holes.
+ Constant *DV =
+ AllHolesAreUndefined ? ResultLists[PHI][0].second : DefaultResults[PHI];
StringRef FuncName = Fn->getName();
SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV,
DL, FuncName);
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index eb1224a..a91c3ff 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -2376,7 +2376,13 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
hasFloatVersion(M, Name))
Ret = optimizeUnaryDoubleFP(CI, B, TLI, true);
- const bool UseIntrinsic = CI->doesNotAccessMemory();
+ // If we have an llvm.exp2 intrinsic, emit the llvm.ldexp intrinsic. If we
+ // have the libcall, emit the libcall.
+ //
+ // TODO: In principle we should be able to just always use the intrinsic for
+ // any doesNotAccessMemory callsite.
+
+ const bool UseIntrinsic = Callee->isIntrinsic();
// Bail out for vectors because the code below only expects scalars.
Type *Ty = CI->getType();
if (!UseIntrinsic && Ty->isVectorTy())
@@ -2386,12 +2392,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
// exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < IntSize
Value *Op = CI->getArgOperand(0);
if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) &&
- hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
+ (UseIntrinsic ||
+ hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl))) {
if (Value *Exp = getIntToFPVal(Op, B, TLI->getIntSize())) {
Constant *One = ConstantFP::get(Ty, 1.0);
- // TODO: Emitting the intrinsic should not depend on whether the libcall
- // is available.
if (UseIntrinsic) {
return copyFlags(*CI, B.CreateIntrinsic(Intrinsic::ldexp,
{Ty, Exp->getType()},
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c7c19ef..1acecf2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2145,7 +2145,7 @@ public:
BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
if (AddBranchWeights)
- setBranchWeights(BI, SCEVCheckBypassWeights);
+ setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
return SCEVCheckBlock;
}
@@ -2173,7 +2173,7 @@ public:
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
if (AddBranchWeights) {
- setBranchWeights(BI, MemCheckBypassWeights);
+ setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
}
ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
MemCheckBlock->getTerminator()->setDebugLoc(
@@ -2889,7 +2889,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights);
+ setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
LoopBypassBlocks.push_back(TCCheckBlock);
}
@@ -3128,7 +3128,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
unsigned TripCount = UF * VF.getKnownMinValue();
assert(TripCount > 0 && "trip count should not be zero");
const uint32_t Weights[] = {1, TripCount - 1};
- setBranchWeights(BI, Weights);
+ setBranchWeights(BI, Weights, /*IsExpected=*/false);
}
}
@@ -7669,7 +7669,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights);
+ setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
return TCCheckBlock;
@@ -7826,7 +7826,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
const uint32_t Weights[] = {EstimatedSkipCount,
MainLoopStep - EstimatedSkipCount};
- setBranchWeights(BI, Weights);
+ setBranchWeights(BI, Weights, /*IsExpected=*/false);
}
ReplaceInstWithInst(Insert->getTerminator(), &BI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 943edc3..5bb88e4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1325,20 +1325,7 @@ public:
bool onlyFirstLaneUsed(const VPValue *Op) const override;
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- if (getOperand(0) != Op)
- return false;
- switch (getOpcode()) {
- default:
- return false;
- case VPInstruction::BranchOnCount:
- case VPInstruction::CanonicalIVIncrementForPart:
- return true;
- };
- llvm_unreachable("switch should return");
- }
+ bool onlyFirstPartUsed(const VPValue *Op) const override;
/// Returns true if this VPInstruction produces a scalar value from a vector,
/// e.g. by performing a reduction or extracting a lane.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index cb707d7..7a48245 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -138,6 +138,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractFromEnd:
+ case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
return false;
@@ -324,9 +325,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
if (Instruction::isBinaryOp(getOpcode())) {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
- if (Part != 0 && vputils::onlyFirstPartUsed(this))
- return State.get(this, 0, OnlyFirstLaneUsed);
-
Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
auto *Res =
@@ -628,6 +626,7 @@ void VPInstruction::execute(VPTransformState &State) {
canGenerateScalarForFirstLane() &&
(vputils::onlyFirstLaneUsed(this) || isVectorToScalar());
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
+ bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
for (unsigned Part = 0; Part < State.UF; ++Part) {
if (GeneratesPerAllLanes) {
for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
@@ -639,6 +638,13 @@ void VPInstruction::execute(VPTransformState &State) {
continue;
}
+ if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
+ Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
+ State.set(this, Part0, Part,
+ /*IsScalar*/ GeneratesPerFirstLaneOnly);
+ continue;
+ }
+
Value *GeneratedValue = generatePerPart(State, Part);
if (!hasResult())
continue;
@@ -674,6 +680,25 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
llvm_unreachable("switch should return");
}
+bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
+ assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
+ if (Instruction::isBinaryOp(getOpcode()))
+ return vputils::onlyFirstPartUsed(this);
+
+ switch (getOpcode()) {
+ default:
+ return false;
+ case Instruction::ICmp:
+ case Instruction::Select:
+ return vputils::onlyFirstPartUsed(this);
+ case VPInstruction::BranchOnCount:
+ case VPInstruction::BranchOnCond:
+ case VPInstruction::CanonicalIVIncrementForPart:
+ return true;
+ };
+ llvm_unreachable("switch should return");
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
VPSlotTracker SlotTracker(getParent()->getPlan());
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
index 3db2183..6d0ca70 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -1,93 +1,5 @@
; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(
-define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32(
-define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(
-define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(
-define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(
-define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(
-define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(
-define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32(
-define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32(
-define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(
-define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(
-define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(
define float @raw_buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
main_body:
@@ -440,18 +352,6 @@ main_body:
ret float %r
}
-declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
-
declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32) #0
diff --git a/llvm/test/Bindings/OCaml/core.ml b/llvm/test/Bindings/OCaml/core.ml
index 36f4c46..923a354 100644
--- a/llvm/test/Bindings/OCaml/core.ml
+++ b/llvm/test/Bindings/OCaml/core.ml
@@ -1150,7 +1150,7 @@ let test_builder () =
(* CHECK: ret{{.*}}P1
*)
let ret = build_ret p1 atentry in
- position_before ret atentry
+ position_before_dbg_records ret atentry
end;
(* see test/Feature/exception.ll *)
diff --git a/llvm/test/Bindings/llvm-c/debug_info.ll b/llvm/test/Bindings/llvm-c/debug_info.ll
index 9358bac..71986fb 100644
--- a/llvm/test/Bindings/llvm-c/debug_info.ll
+++ b/llvm/test/Bindings/llvm-c/debug_info.ll
@@ -10,7 +10,10 @@
; CHECK-NEXT: call void @llvm.dbg.declare(metadata i64 0, metadata !40, metadata !DIExpression()), !dbg !43
; CHECK-NEXT: br label %vars
; CHECK: vars:
+; CHECK-NEXT: %p1 = phi i64 [ 0, %entry ]
+; CHECK-NEXT: %p2 = phi i64 [ 0, %entry ]
; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 0, metadata !41, metadata !DIExpression(DW_OP_constu, 0, DW_OP_stack_value)), !dbg !44
+; CHECK-NEXT: %a = add i64 %p1, %p2
; CHECK-NEXT: ret i64 0
; CHECK-NEXT: }
diff --git a/llvm/test/Bindings/llvm-c/debug_info_new_format.ll b/llvm/test/Bindings/llvm-c/debug_info_new_format.ll
index 05b6ef4..1b6f2c4 100644
--- a/llvm/test/Bindings/llvm-c/debug_info_new_format.ll
+++ b/llvm/test/Bindings/llvm-c/debug_info_new_format.ll
@@ -11,7 +11,10 @@
; CHECK-NEXT: #dbg_declare(i64 0, !40, !DIExpression(), !43)
; CHECK-NEXT: br label %vars
; CHECK: vars:
+; CHECK-NEXT: %p1 = phi i64 [ 0, %entry ]
+; CHECK-NEXT: %p2 = phi i64 [ 0, %entry ]
; CHECK-NEXT: #dbg_value(i64 0, !41, !DIExpression(DW_OP_constu, 0, DW_OP_stack_value), !44)
+; CHECK-NEXT: %a = add i64 %p1, %p2
; CHECK-NEXT: ret i64 0
; CHECK-NEXT: }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
index 4c0d1ef..410c2d9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
@@ -28,14 +28,12 @@ entry:
define <16 x i32> @mul_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: mul_i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v4.8h, v1.8b, #0
-; CHECK-SD-NEXT: ushll2 v5.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll2 v6.8h, v1.16b, #0
-; CHECK-SD-NEXT: umull v0.4s, v2.4h, v4.4h
-; CHECK-SD-NEXT: umull2 v1.4s, v2.8h, v4.8h
-; CHECK-SD-NEXT: umull2 v3.4s, v5.8h, v6.8h
-; CHECK-SD-NEXT: umull v2.4s, v5.4h, v6.4h
+; CHECK-SD-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: umull2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: ushll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT: ushll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT: ushll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT: ushll v2.4s, v4.4h, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_i32:
@@ -59,26 +57,20 @@ entry:
define <16 x i64> @mul_i64(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: mul_i64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-SD-NEXT: ushll v4.4s, v2.4h, #0
-; CHECK-SD-NEXT: ushll v5.4s, v0.4h, #0
-; CHECK-SD-NEXT: ushll v6.4s, v3.4h, #0
+; CHECK-SD-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: umull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: ushll v3.4s, v2.4h, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-SD-NEXT: ushll v16.4s, v1.4h, #0
-; CHECK-SD-NEXT: ushll2 v7.4s, v3.8h, #0
-; CHECK-SD-NEXT: ushll2 v17.4s, v0.8h, #0
-; CHECK-SD-NEXT: ushll2 v18.4s, v1.8h, #0
-; CHECK-SD-NEXT: umull2 v1.2d, v4.4s, v6.4s
-; CHECK-SD-NEXT: umull v0.2d, v4.2s, v6.2s
-; CHECK-SD-NEXT: umull2 v3.2d, v2.4s, v7.4s
-; CHECK-SD-NEXT: umull v2.2d, v2.2s, v7.2s
-; CHECK-SD-NEXT: umull v4.2d, v5.2s, v16.2s
-; CHECK-SD-NEXT: umull2 v7.2d, v17.4s, v18.4s
-; CHECK-SD-NEXT: umull2 v5.2d, v5.4s, v16.4s
-; CHECK-SD-NEXT: umull v6.2d, v17.2s, v18.2s
+; CHECK-SD-NEXT: ushll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT: ushll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT: ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: ushll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT: ushll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT: ushll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT: ushll v6.2d, v6.2s, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_i64:
@@ -139,17 +131,12 @@ entry:
define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) {
; CHECK-SD-LABEL: mla_i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ushll v6.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v7.8h, v1.8b, #0
-; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-SD-NEXT: umlal v2.4s, v6.4h, v7.4h
-; CHECK-SD-NEXT: umlal2 v3.4s, v6.8h, v7.8h
-; CHECK-SD-NEXT: umlal2 v5.4s, v0.8h, v1.8h
-; CHECK-SD-NEXT: umlal v4.4s, v0.4h, v1.4h
-; CHECK-SD-NEXT: mov v0.16b, v2.16b
-; CHECK-SD-NEXT: mov v1.16b, v3.16b
-; CHECK-SD-NEXT: mov v2.16b, v4.16b
+; CHECK-SD-NEXT: umull2 v7.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: umull v6.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: uaddw2 v5.4s, v5.4s, v7.8h
+; CHECK-SD-NEXT: uaddw v0.4s, v2.4s, v6.4h
+; CHECK-SD-NEXT: uaddw2 v1.4s, v3.4s, v6.8h
+; CHECK-SD-NEXT: uaddw v2.4s, v4.4s, v7.4h
; CHECK-SD-NEXT: mov v3.16b, v5.16b
; CHECK-SD-NEXT: ret
;
@@ -179,35 +166,22 @@ entry:
define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
; CHECK-SD-LABEL: mla_i64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov v17.16b, v7.16b
-; CHECK-SD-NEXT: mov v16.16b, v6.16b
-; CHECK-SD-NEXT: ushll v6.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll v7.8h, v1.8b, #0
-; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-SD-NEXT: ushll v18.4s, v6.4h, #0
-; CHECK-SD-NEXT: ushll2 v21.4s, v6.8h, #0
-; CHECK-SD-NEXT: ushll v19.4s, v0.4h, #0
-; CHECK-SD-NEXT: ushll v20.4s, v7.4h, #0
-; CHECK-SD-NEXT: ushll v22.4s, v1.4h, #0
-; CHECK-SD-NEXT: ushll2 v23.4s, v7.8h, #0
-; CHECK-SD-NEXT: ldp q6, q7, [sp]
-; CHECK-SD-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-SD-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-SD-NEXT: umlal2 v3.2d, v18.4s, v20.4s
-; CHECK-SD-NEXT: umlal v2.2d, v18.2s, v20.2s
-; CHECK-SD-NEXT: umlal v16.2d, v19.2s, v22.2s
-; CHECK-SD-NEXT: umlal2 v5.2d, v21.4s, v23.4s
-; CHECK-SD-NEXT: umlal v4.2d, v21.2s, v23.2s
-; CHECK-SD-NEXT: umlal2 v17.2d, v19.4s, v22.4s
-; CHECK-SD-NEXT: umlal2 v7.2d, v0.4s, v1.4s
-; CHECK-SD-NEXT: umlal v6.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov v0.16b, v2.16b
-; CHECK-SD-NEXT: mov v1.16b, v3.16b
-; CHECK-SD-NEXT: mov v2.16b, v4.16b
-; CHECK-SD-NEXT: mov v3.16b, v5.16b
-; CHECK-SD-NEXT: mov v4.16b, v16.16b
-; CHECK-SD-NEXT: mov v5.16b, v17.16b
+; CHECK-SD-NEXT: umull v16.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: umull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: ldp q20, q21, [sp]
+; CHECK-SD-NEXT: ushll v17.4s, v16.4h, #0
+; CHECK-SD-NEXT: ushll2 v16.4s, v16.8h, #0
+; CHECK-SD-NEXT: ushll2 v19.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v18.4s, v0.4h, #0
+; CHECK-SD-NEXT: uaddw2 v1.2d, v3.2d, v17.4s
+; CHECK-SD-NEXT: uaddw v0.2d, v2.2d, v17.2s
+; CHECK-SD-NEXT: uaddw2 v3.2d, v5.2d, v16.4s
+; CHECK-SD-NEXT: uaddw v2.2d, v4.2d, v16.2s
+; CHECK-SD-NEXT: uaddw2 v16.2d, v21.2d, v19.4s
+; CHECK-SD-NEXT: uaddw v4.2d, v6.2d, v18.2s
+; CHECK-SD-NEXT: uaddw2 v5.2d, v7.2d, v18.4s
+; CHECK-SD-NEXT: uaddw v6.2d, v20.2d, v19.2s
+; CHECK-SD-NEXT: mov v7.16b, v16.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mla_i64:
diff --git a/llvm/test/CodeGen/AArch64/addp-shuffle.ll b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
index 7cc5041..fb96d11 100644
--- a/llvm/test/CodeGen/AArch64/addp-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
@@ -136,15 +136,13 @@ define <4 x double> @deinterleave_shuffle_v8f64(<8 x double> %a) {
define <4 x i32> @udot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: udot:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NEXT: ushll v4.8h, v2.8b, #0
-; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: umull2 v5.4s, v3.8h, v4.8h
-; CHECK-NEXT: umull v3.4s, v3.4h, v4.4h
-; CHECK-NEXT: umull2 v4.4s, v1.8h, v2.8h
-; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: addp v2.4s, v3.4s, v5.4s
+; CHECK-NEXT: umull v3.8h, v1.8b, v2.8b
+; CHECK-NEXT: umull2 v1.8h, v1.16b, v2.16b
+; CHECK-NEXT: ushll2 v2.4s, v3.8h, #0
+; CHECK-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-NEXT: ushll2 v4.4s, v1.8h, #0
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
; CHECK-NEXT: addp v1.4s, v1.4s, v4.4s
; CHECK-NEXT: addp v1.4s, v2.4s, v1.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
@@ -165,15 +163,13 @@ define <4 x i32> @udot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) {
define <4 x i32> @sdot(<4 x i32> %z, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sdot:
; CHECK: // %bb.0:
-; CHECK-NEXT: sshll v3.8h, v1.8b, #0
-; CHECK-NEXT: sshll v4.8h, v2.8b, #0
-; CHECK-NEXT: sshll2 v1.8h, v1.16b, #0
-; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: smull2 v5.4s, v3.8h, v4.8h
-; CHECK-NEXT: smull v3.4s, v3.4h, v4.4h
-; CHECK-NEXT: smull2 v4.4s, v1.8h, v2.8h
-; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: addp v2.4s, v3.4s, v5.4s
+; CHECK-NEXT: smull v3.8h, v1.8b, v2.8b
+; CHECK-NEXT: smull2 v1.8h, v1.16b, v2.16b
+; CHECK-NEXT: sshll2 v2.4s, v3.8h, #0
+; CHECK-NEXT: sshll v3.4s, v3.4h, #0
+; CHECK-NEXT: sshll2 v4.4s, v1.8h, #0
+; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
; CHECK-NEXT: addp v1.4s, v1.4s, v4.4s
; CHECK-NEXT: addp v1.4s, v2.4s, v1.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
index 296be83..7056a4d 100644
--- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -412,10 +412,10 @@ define half @scvtf_f16_i32_7(i32 %int) {
; CHECK-NO16-LABEL: scvtf_f16_i32_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, w0
-; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
-; CHECK-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
@@ -432,10 +432,10 @@ define half @scvtf_f16_i32_15(i32 %int) {
; CHECK-NO16-LABEL: scvtf_f16_i32_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, w0
-; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
-; CHECK-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
@@ -452,10 +452,10 @@ define half @scvtf_f16_i64_7(i64 %long) {
; CHECK-NO16-LABEL: scvtf_f16_i64_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, x0
-; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
-; CHECK-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
@@ -472,10 +472,10 @@ define half @scvtf_f16_i64_15(i64 %long) {
; CHECK-NO16-LABEL: scvtf_f16_i64_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, x0
-; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
-; CHECK-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
@@ -574,10 +574,10 @@ define half @ucvtf_f16_i32_7(i32 %int) {
; CHECK-NO16-LABEL: ucvtf_f16_i32_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, w0
-; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
-; CHECK-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
@@ -594,10 +594,10 @@ define half @ucvtf_f16_i32_15(i32 %int) {
; CHECK-NO16-LABEL: ucvtf_f16_i32_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, w0
-; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
-; CHECK-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
@@ -614,10 +614,10 @@ define half @ucvtf_f16_i64_7(i64 %long) {
; CHECK-NO16-LABEL: ucvtf_f16_i64_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, x0
-; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
-; CHECK-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
@@ -634,10 +634,10 @@ define half @ucvtf_f16_i64_15(i64 %long) {
; CHECK-NO16-LABEL: ucvtf_f16_i64_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, x0
-; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
-; CHECK-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/fdiv-const.ll b/llvm/test/CodeGen/AArch64/fdiv-const.ll
index 5a8f733..7aa89db 100644
--- a/llvm/test/CodeGen/AArch64/fdiv-const.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv-const.ll
@@ -4,8 +4,8 @@
define float @divf32_2(float %a) nounwind {
; CHECK-LABEL: divf32_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s1, #2.00000000
-; CHECK-NEXT: fdiv s0, s0, s1
+; CHECK-NEXT: fmov s1, #0.50000000
+; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
%r = fdiv float %a, 2.0
ret float %r
@@ -46,8 +46,8 @@ define float @divf32_p75_arcp(float %a) nounwind {
define half @divf16_2(half %a) nounwind {
; CHECK-LABEL: divf16_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov h1, #2.00000000
-; CHECK-NEXT: fdiv h0, h0, h1
+; CHECK-NEXT: fmov h1, #0.50000000
+; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: ret
%r = fdiv half %a, 2.0
ret half %r
@@ -67,9 +67,9 @@ define half @divf16_32768(half %a) nounwind {
define half @divf16_32768_arcp(half %a) nounwind {
; CHECK-LABEL: divf16_32768_arcp:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #512 // =0x200
+; CHECK-NEXT: mov w8, #30720 // =0x7800
; CHECK-NEXT: fmov h1, w8
-; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: fdiv h0, h0, h1
; CHECK-NEXT: ret
%r = fdiv arcp half %a, 32768.0
ret half %r
@@ -78,8 +78,8 @@ define half @divf16_32768_arcp(half %a) nounwind {
define double @divf64_2(double %a) nounwind {
; CHECK-LABEL: divf64_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d1, #2.00000000
-; CHECK-NEXT: fdiv d0, d0, d1
+; CHECK-NEXT: fmov d1, #0.50000000
+; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: ret
%r = fdiv double %a, 2.0
ret double %r
@@ -88,8 +88,8 @@ define double @divf64_2(double %a) nounwind {
define <4 x float> @divv4f32_2(<4 x float> %a) nounwind {
; CHECK-LABEL: divv4f32_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.4s, #64, lsl #24
-; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: movi v1.4s, #63, lsl #24
+; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%r = fdiv <4 x float> %a, <float 2.0, float 2.0, float 2.0, float 2.0>
ret <4 x float> %r
@@ -141,9 +141,8 @@ define <4 x float> @divv4f32_24816(<4 x float> %a) nounwind {
define <vscale x 4 x float> @divnxv4f32_2(<vscale x 4 x float> %a) nounwind {
; CHECK-LABEL: divnxv4f32_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov z1.s, #2.00000000
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: fmul z0.s, p0/m, z0.s, #0.5
; CHECK-NEXT: ret
%r = fdiv <vscale x 4 x float> %a, splat (float 2.0)
ret <vscale x 4 x float> %r
diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll
index 4192745..98276b6 100644
--- a/llvm/test/CodeGen/AArch64/frem-power2.ll
+++ b/llvm/test/CodeGen/AArch64/frem-power2.ll
@@ -5,11 +5,12 @@
define float @frem2(float %x) {
; CHECK-SD-LABEL: frem2:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, #2.00000000
+; CHECK-SD-NEXT: fmov s1, #0.50000000
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s1, s2, s1, s0
+; CHECK-SD-NEXT: fmov s2, #-2.00000000
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s1, s1, s2, s0
; CHECK-SD-NEXT: mvni v2.4s, #128, lsl #24
; CHECK-SD-NEXT: bit v0.16b, v1.16b, v2.16b
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
@@ -27,10 +28,11 @@ entry:
define float @frem2_nsz(float %x) {
; CHECK-SD-LABEL: frem2_nsz:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, #2.00000000
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT: fmov s1, #0.50000000
+; CHECK-SD-NEXT: fmov s2, #-2.00000000
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s0, s1, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem2_nsz:
@@ -65,10 +67,11 @@ define float @frem2_abs(float %x) {
; CHECK-SD-LABEL: frem2_abs:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fabs s0, s0
-; CHECK-SD-NEXT: fmov s1, #2.00000000
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT: fmov s1, #0.50000000
+; CHECK-SD-NEXT: fmov s2, #-2.00000000
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s0, s1, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem2_abs:
@@ -85,9 +88,9 @@ entry:
define half @hrem2_nsz(half %x) {
; CHECK-SD-LABEL: hrem2_nsz:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov h1, #2.00000000
+; CHECK-SD-NEXT: fmov h1, #0.50000000
; CHECK-SD-NEXT: fmov h2, #-2.00000000
-; CHECK-SD-NEXT: fdiv h1, h0, h1
+; CHECK-SD-NEXT: fmul h1, h0, h1
; CHECK-SD-NEXT: frintz h1, h1
; CHECK-SD-NEXT: fmadd h0, h1, h2, h0
; CHECK-SD-NEXT: ret
@@ -112,10 +115,11 @@ entry:
define double @drem2_nsz(double %x) {
; CHECK-SD-LABEL: drem2_nsz:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov d1, #2.00000000
-; CHECK-SD-NEXT: fdiv d2, d0, d1
-; CHECK-SD-NEXT: frintz d2, d2
-; CHECK-SD-NEXT: fmsub d0, d2, d1, d0
+; CHECK-SD-NEXT: fmov d1, #0.50000000
+; CHECK-SD-NEXT: fmov d2, #-2.00000000
+; CHECK-SD-NEXT: fmul d1, d0, d1
+; CHECK-SD-NEXT: frintz d1, d1
+; CHECK-SD-NEXT: fmadd d0, d1, d2, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: drem2_nsz:
@@ -176,10 +180,11 @@ entry:
define float @fremm2_nsz(float %x) {
; CHECK-SD-LABEL: fremm2_nsz:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, #-2.00000000
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT: fmov s1, #-0.50000000
+; CHECK-SD-NEXT: fmov s2, #2.00000000
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s0, s1, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fremm2_nsz:
@@ -195,10 +200,11 @@ define float @frem4_abs(float %x) {
; CHECK-SD-LABEL: frem4_abs:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fabs s0, s0
-; CHECK-SD-NEXT: fmov s1, #4.00000000
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT: fmov s1, #0.25000000
+; CHECK-SD-NEXT: fmov s2, #-4.00000000
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s0, s1, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem4_abs:
@@ -216,10 +222,12 @@ define float @frem16_abs(float %x) {
; CHECK-SD-LABEL: frem16_abs:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fabs s0, s0
-; CHECK-SD-NEXT: fmov s1, #16.00000000
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT: mov w8, #1031798784 // =0x3d800000
+; CHECK-SD-NEXT: fmov s2, #-16.00000000
+; CHECK-SD-NEXT: fmov s1, w8
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s0, s1, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem16_abs:
@@ -237,11 +245,13 @@ define float @frem4294967296_abs(float %x) {
; CHECK-SD-LABEL: frem4294967296_abs:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fabs s0, s0
-; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: mov w8, #796917760 // =0x2f800000
; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT: mov w8, #-813694976 // =0xcf800000
+; CHECK-SD-NEXT: fmov s2, w8
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s0, s1, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem4294967296_abs:
@@ -260,11 +270,13 @@ define float @frem1152921504606846976_abs(float %x) {
; CHECK-SD-LABEL: frem1152921504606846976_abs:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fabs s0, s0
-; CHECK-SD-NEXT: mov w8, #1568669696 // =0x5d800000
+; CHECK-SD-NEXT: mov w8, #562036736 // =0x21800000
; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT: mov w8, #-578813952 // =0xdd800000
+; CHECK-SD-NEXT: fmov s2, w8
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s0, s1, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem1152921504606846976_abs:
@@ -283,11 +295,13 @@ define float @frem4611686018427387904_abs(float %x) {
; CHECK-SD-LABEL: frem4611686018427387904_abs:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fabs s0, s0
-; CHECK-SD-NEXT: mov w8, #1585446912 // =0x5e800000
+; CHECK-SD-NEXT: mov w8, #545259520 // =0x20800000
; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT: mov w8, #-562036736 // =0xde800000
+; CHECK-SD-NEXT: fmov s2, w8
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s0, s1, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem4611686018427387904_abs:
@@ -305,11 +319,12 @@ entry:
define float @frem9223372036854775808_abs(float %x) {
; CHECK-SD-LABEL: frem9223372036854775808_abs:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v1.2s, #95, lsl #24
+; CHECK-SD-NEXT: movi v1.2s, #32, lsl #24
; CHECK-SD-NEXT: fabs s0, s0
-; CHECK-SD-NEXT: fdiv s2, s0, s1
-; CHECK-SD-NEXT: frintz s2, s2
-; CHECK-SD-NEXT: fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT: movi v2.2s, #223, lsl #24
+; CHECK-SD-NEXT: fmul s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fmadd s0, s1, s2, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem9223372036854775808_abs:
@@ -326,11 +341,12 @@ entry:
define <4 x float> @frem2_vec(<4 x float> %x) {
; CHECK-SD-LABEL: frem2_vec:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v1.4s, #64, lsl #24
+; CHECK-SD-NEXT: movi v1.4s, #63, lsl #24
+; CHECK-SD-NEXT: movi v2.4s, #64, lsl #24
; CHECK-SD-NEXT: mov v3.16b, v0.16b
-; CHECK-SD-NEXT: fdiv v2.4s, v0.4s, v1.4s
-; CHECK-SD-NEXT: frintz v2.4s, v2.4s
-; CHECK-SD-NEXT: fmls v3.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: fmul v1.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: frintz v1.4s, v1.4s
+; CHECK-SD-NEXT: fmls v3.4s, v2.4s, v1.4s
; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
; CHECK-SD-NEXT: bit v0.16b, v3.16b, v1.16b
; CHECK-SD-NEXT: ret
@@ -387,10 +403,11 @@ entry:
define <4 x float> @frem2_nsz_vec(<4 x float> %x) {
; CHECK-SD-LABEL: frem2_nsz_vec:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v1.4s, #64, lsl #24
-; CHECK-SD-NEXT: fdiv v2.4s, v0.4s, v1.4s
-; CHECK-SD-NEXT: frintz v2.4s, v2.4s
-; CHECK-SD-NEXT: fmls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: movi v1.4s, #63, lsl #24
+; CHECK-SD-NEXT: movi v2.4s, #64, lsl #24
+; CHECK-SD-NEXT: fmul v1.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: frintz v1.4s, v1.4s
+; CHECK-SD-NEXT: fmls v0.4s, v2.4s, v1.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem2_nsz_vec:
@@ -445,12 +462,14 @@ entry:
define <4 x float> @frem1152921504606846976_absv(<4 x float> %x) {
; CHECK-SD-LABEL: frem1152921504606846976_absv:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #1568669696 // =0x5d800000
+; CHECK-SD-NEXT: mov w8, #562036736 // =0x21800000
; CHECK-SD-NEXT: fabs v0.4s, v0.4s
; CHECK-SD-NEXT: dup v1.4s, w8
-; CHECK-SD-NEXT: fdiv v2.4s, v0.4s, v1.4s
-; CHECK-SD-NEXT: frintz v2.4s, v2.4s
-; CHECK-SD-NEXT: fmls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: mov w8, #1568669696 // =0x5d800000
+; CHECK-SD-NEXT: dup v2.4s, w8
+; CHECK-SD-NEXT: fmul v1.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: frintz v1.4s, v1.4s
+; CHECK-SD-NEXT: fmls v0.4s, v2.4s, v1.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: frem1152921504606846976_absv:
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 40b8a47..33245a2 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -132,13 +132,12 @@ define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: umull2 v2.4s, v1.8h, v0.8h
-; CHECK-NEXT: mov v3.s[0], v2.s[0]
-; CHECK-NEXT: umlal v3.4s, v1.4h, v0.4h
-; CHECK-NEXT: addv s0, v3.4s
+; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-NEXT: mov v1.s[0], v2.s[0]
+; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
; CHECK-NEXT: ret
@@ -176,13 +175,12 @@ define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: smull2 v2.4s, v1.8h, v0.8h
-; CHECK-NEXT: mov v3.s[0], v2.s[0]
-; CHECK-NEXT: smlal v3.4s, v1.4h, v0.4h
-; CHECK-NEXT: addv s0, v3.4s
+; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0
+; CHECK-NEXT: mov v1.s[0], v2.s[0]
+; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
; CHECK-NEXT: ret
@@ -200,19 +198,17 @@ entry:
define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
; CHECK-LABEL: test_sdot_v5i8_double:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-NEXT: movi v5.2d, #0000000000000000
-; CHECK-NEXT: movi v6.2d, #0000000000000000
-; CHECK-NEXT: smull2 v4.4s, v0.8h, v1.8h
-; CHECK-NEXT: smull2 v7.4s, v2.8h, v3.8h
-; CHECK-NEXT: mov v6.s[0], v4.s[0]
-; CHECK-NEXT: mov v5.s[0], v7.s[0]
-; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h
-; CHECK-NEXT: smlal v5.4s, v2.4h, v3.4h
-; CHECK-NEXT: add v0.4s, v6.4s, v5.4s
+; CHECK-NEXT: smull v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: movi v3.2d, #0000000000000000
+; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0
+; CHECK-NEXT: sshll2 v5.4s, v2.8h, #0
+; CHECK-NEXT: mov v3.s[0], v4.s[0]
+; CHECK-NEXT: mov v1.s[0], v5.s[0]
+; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h
+; CHECK-NEXT: saddw v1.4s, v1.4s, v2.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -998,27 +994,21 @@ entry:
define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_udot_v25i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q2, q0, [x0]
-; CHECK-NEXT: ldp q5, q1, [x1]
-; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0
-; CHECK-NEXT: ushll v6.8h, v2.8b, #0
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0
-; CHECK-NEXT: ushll v7.8h, v5.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: umull v3.4s, v4.4h, v3.4h
-; CHECK-NEXT: movi v4.2d, #0000000000000000
-; CHECK-NEXT: umull2 v16.4s, v7.8h, v6.8h
-; CHECK-NEXT: umull v6.4s, v7.4h, v6.4h
-; CHECK-NEXT: mov v4.s[0], v3.s[0]
-; CHECK-NEXT: ushll2 v3.8h, v5.16b, #0
-; CHECK-NEXT: umlal2 v16.4s, v1.8h, v0.8h
-; CHECK-NEXT: umlal v6.4s, v1.4h, v0.4h
-; CHECK-NEXT: umlal v4.4s, v3.4h, v2.4h
-; CHECK-NEXT: umlal2 v16.4s, v3.8h, v2.8h
-; CHECK-NEXT: add v0.4s, v6.4s, v4.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
+; CHECK-NEXT: ldp q3, q0, [x1]
+; CHECK-NEXT: movi v5.2d, #0000000000000000
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: umull2 v4.8h, v0.16b, v1.16b
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: umull v1.8h, v3.8b, v2.8b
+; CHECK-NEXT: umull2 v2.8h, v3.16b, v2.16b
+; CHECK-NEXT: ushll v3.4s, v4.4h, #0
+; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v0.8h
+; CHECK-NEXT: uaddl v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: mov v5.s[0], v3.s[0]
+; CHECK-NEXT: uaddw2 v1.4s, v4.4s, v2.8h
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
@@ -1063,27 +1053,21 @@ entry:
define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_sdot_v25i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q2, q0, [x0]
-; CHECK-NEXT: ldp q5, q1, [x1]
-; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0
-; CHECK-NEXT: sshll v6.8h, v2.8b, #0
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0
-; CHECK-NEXT: sshll v7.8h, v5.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: smull v3.4s, v4.4h, v3.4h
-; CHECK-NEXT: movi v4.2d, #0000000000000000
-; CHECK-NEXT: smull2 v16.4s, v7.8h, v6.8h
-; CHECK-NEXT: smull v6.4s, v7.4h, v6.4h
-; CHECK-NEXT: mov v4.s[0], v3.s[0]
-; CHECK-NEXT: sshll2 v3.8h, v5.16b, #0
-; CHECK-NEXT: smlal2 v16.4s, v1.8h, v0.8h
-; CHECK-NEXT: smlal v6.4s, v1.4h, v0.4h
-; CHECK-NEXT: smlal v4.4s, v3.4h, v2.4h
-; CHECK-NEXT: smlal2 v16.4s, v3.8h, v2.8h
-; CHECK-NEXT: add v0.4s, v6.4s, v4.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
+; CHECK-NEXT: ldp q3, q0, [x1]
+; CHECK-NEXT: movi v5.2d, #0000000000000000
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: smull2 v4.8h, v0.16b, v1.16b
+; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: smull v1.8h, v3.8b, v2.8b
+; CHECK-NEXT: smull2 v2.8h, v3.16b, v2.16b
+; CHECK-NEXT: sshll v3.4s, v4.4h, #0
+; CHECK-NEXT: saddl2 v4.4s, v1.8h, v0.8h
+; CHECK-NEXT: saddl v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: mov v5.s[0], v3.s[0]
+; CHECK-NEXT: saddw2 v1.4s, v4.4s, v2.8h
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
@@ -1105,222 +1089,210 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldr b1, [sp, #16]
-; CHECK-NEXT: ldr b0, [sp, #80]
-; CHECK-NEXT: add x11, sp, #24
-; CHECK-NEXT: ldr b3, [sp, #216]
-; CHECK-NEXT: add x10, sp, #88
-; CHECK-NEXT: ldr b2, [sp, #280]
-; CHECK-NEXT: ld1 { v1.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #224
-; CHECK-NEXT: ldr b4, [sp, #152]
-; CHECK-NEXT: ldr b6, [sp, #480]
-; CHECK-NEXT: ld1 { v0.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #288
-; CHECK-NEXT: add x12, sp, #160
-; CHECK-NEXT: ld1 { v3.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #488
-; CHECK-NEXT: ld1 { v2.b }[1], [x10]
-; CHECK-NEXT: ld1 { v4.b }[1], [x12]
-; CHECK-NEXT: ld1 { v6.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #32
-; CHECK-NEXT: add x9, sp, #96
-; CHECK-NEXT: add x8, sp, #104
-; CHECK-NEXT: ld1 { v1.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #232
-; CHECK-NEXT: ld1 { v0.b }[2], [x9]
+; CHECK-NEXT: ldr b0, [sp, #280]
+; CHECK-NEXT: add x8, sp, #288
+; CHECK-NEXT: ldr b1, [sp, #80]
+; CHECK-NEXT: ldr b2, [sp, #152]
; CHECK-NEXT: add x9, sp, #296
-; CHECK-NEXT: ld1 { v3.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #168
+; CHECK-NEXT: ldr b4, [sp, #216]
+; CHECK-NEXT: ld1 { v0.b }[1], [x8]
+; CHECK-NEXT: add x8, sp, #88
+; CHECK-NEXT: add x10, sp, #320
+; CHECK-NEXT: ld1 { v1.b }[1], [x8]
+; CHECK-NEXT: add x8, sp, #160
+; CHECK-NEXT: add x12, sp, #192
+; CHECK-NEXT: ld1 { v2.b }[1], [x8]
+; CHECK-NEXT: add x8, sp, #304
+; CHECK-NEXT: add x11, sp, #328
+; CHECK-NEXT: ld1 { v0.b }[2], [x9]
+; CHECK-NEXT: add x9, sp, #96
+; CHECK-NEXT: ldr b5, [sp, #16]
+; CHECK-NEXT: ld1 { v1.b }[2], [x9]
+; CHECK-NEXT: add x9, sp, #168
+; CHECK-NEXT: ldr b6, [sp, #680]
; CHECK-NEXT: ld1 { v2.b }[2], [x9]
-; CHECK-NEXT: ld1 { v4.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #40
-; CHECK-NEXT: ld1 { v1.b }[3], [x11]
+; CHECK-NEXT: add x9, sp, #104
+; CHECK-NEXT: ldr b7, [sp, #480]
; CHECK-NEXT: ld1 { v0.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #304
-; CHECK-NEXT: add x10, sp, #112
+; CHECK-NEXT: add x8, sp, #312
+; CHECK-NEXT: fmov s3, w0
+; CHECK-NEXT: ld1 { v1.b }[3], [x9]
+; CHECK-NEXT: add x9, sp, #176
+; CHECK-NEXT: ldr b19, [sp, #552]
+; CHECK-NEXT: ld1 { v2.b }[3], [x9]
+; CHECK-NEXT: add x9, sp, #112
+; CHECK-NEXT: ldr b22, [sp, #744]
+; CHECK-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-NEXT: add x8, sp, #336
+; CHECK-NEXT: mov v3.b[1], w1
+; CHECK-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-NEXT: add x9, sp, #184
+; CHECK-NEXT: ldr b23, [sp, #544]
+; CHECK-NEXT: ld1 { v2.b }[4], [x9]
+; CHECK-NEXT: add x9, sp, #224
+; CHECK-NEXT: ldr b20, [sp, #352]
+; CHECK-NEXT: ld1 { v0.b }[5], [x10]
+; CHECK-NEXT: ld1 { v4.b }[1], [x9]
+; CHECK-NEXT: add x10, sp, #120
+; CHECK-NEXT: ld1 { v1.b }[5], [x10]
+; CHECK-NEXT: add x10, sp, #128
+; CHECK-NEXT: add x9, sp, #136
+; CHECK-NEXT: ld1 { v2.b }[5], [x12]
+; CHECK-NEXT: add x12, sp, #232
+; CHECK-NEXT: mov v3.b[2], w2
+; CHECK-NEXT: ld1 { v0.b }[6], [x11]
+; CHECK-NEXT: ld1 { v4.b }[2], [x12]
; CHECK-NEXT: add x11, sp, #240
-; CHECK-NEXT: add x13, sp, #56
-; CHECK-NEXT: ld1 { v2.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #48
-; CHECK-NEXT: ld1 { v3.b }[3], [x11]
-; CHECK-NEXT: ld1 { v1.b }[4], [x8]
-; CHECK-NEXT: ld1 { v0.b }[4], [x10]
-; CHECK-NEXT: add x15, sp, #312
-; CHECK-NEXT: add x12, sp, #120
+; CHECK-NEXT: add x12, sp, #24
+; CHECK-NEXT: ld1 { v1.b }[6], [x10]
+; CHECK-NEXT: add x10, sp, #200
+; CHECK-NEXT: ld1 { v5.b }[1], [x12]
+; CHECK-NEXT: ld1 { v2.b }[6], [x10]
+; CHECK-NEXT: add x10, sp, #256
+; CHECK-NEXT: ld1 { v0.b }[7], [x8]
+; CHECK-NEXT: ld1 { v4.b }[3], [x11]
+; CHECK-NEXT: add x8, sp, #688
+; CHECK-NEXT: ld1 { v6.b }[1], [x8]
+; CHECK-NEXT: add x11, sp, #32
; CHECK-NEXT: add x8, sp, #248
-; CHECK-NEXT: add x11, sp, #64
-; CHECK-NEXT: ld1 { v2.b }[4], [x15]
-; CHECK-NEXT: ld1 { v3.b }[4], [x8]
-; CHECK-NEXT: add x15, sp, #320
-; CHECK-NEXT: ld1 { v1.b }[5], [x13]
-; CHECK-NEXT: ld1 { v0.b }[5], [x12]
-; CHECK-NEXT: ldr b18, [sp, #552]
-; CHECK-NEXT: add x14, sp, #128
-; CHECK-NEXT: add x16, sp, #256
-; CHECK-NEXT: ldr b16, [sp, #352]
-; CHECK-NEXT: ld1 { v2.b }[5], [x15]
-; CHECK-NEXT: add x15, sp, #176
-; CHECK-NEXT: ld1 { v3.b }[5], [x16]
-; CHECK-NEXT: ld1 { v1.b }[6], [x11]
-; CHECK-NEXT: add x11, sp, #560
-; CHECK-NEXT: ld1 { v0.b }[6], [x14]
-; CHECK-NEXT: add x16, sp, #360
-; CHECK-NEXT: ld1 { v4.b }[3], [x15]
-; CHECK-NEXT: ld1 { v18.b }[1], [x11]
+; CHECK-NEXT: ld1 { v5.b }[2], [x11]
+; CHECK-NEXT: ld1 { v1.b }[7], [x9]
+; CHECK-NEXT: add x9, sp, #40
+; CHECK-NEXT: ld1 { v4.b }[4], [x8]
+; CHECK-NEXT: add x8, sp, #696
+; CHECK-NEXT: ldr b21, [sp, #616]
+; CHECK-NEXT: ld1 { v6.b }[2], [x8]
+; CHECK-NEXT: add x8, sp, #208
+; CHECK-NEXT: smull v23.8h, v23.8b, v22.8b
+; CHECK-NEXT: ld1 { v5.b }[3], [x9]
+; CHECK-NEXT: ld1 { v2.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #704
+; CHECK-NEXT: ld1 { v4.b }[5], [x10]
+; CHECK-NEXT: add x9, sp, #48
+; CHECK-NEXT: add x10, sp, #56
+; CHECK-NEXT: ld1 { v6.b }[3], [x8]
+; CHECK-NEXT: add x8, sp, #264
+; CHECK-NEXT: ldr b22, [sp, #416]
+; CHECK-NEXT: ld1 { v5.b }[4], [x9]
+; CHECK-NEXT: add x9, sp, #488
+; CHECK-NEXT: mov v3.b[3], w3
+; CHECK-NEXT: ld1 { v4.b }[6], [x8]
+; CHECK-NEXT: add x8, sp, #712
+; CHECK-NEXT: ld1 { v7.b }[1], [x9]
+; CHECK-NEXT: ld1 { v6.b }[4], [x8]
+; CHECK-NEXT: add x9, sp, #720
+; CHECK-NEXT: add x8, sp, #64
+; CHECK-NEXT: ld1 { v5.b }[5], [x10]
+; CHECK-NEXT: add x10, sp, #496
+; CHECK-NEXT: add x11, sp, #576
+; CHECK-NEXT: ld1 { v7.b }[2], [x10]
; CHECK-NEXT: add x10, sp, #72
-; CHECK-NEXT: ld1 { v16.b }[1], [x16]
-; CHECK-NEXT: add x9, sp, #136
-; CHECK-NEXT: add x14, sp, #184
-; CHECK-NEXT: ld1 { v1.b }[7], [x10]
+; CHECK-NEXT: mov v3.b[4], w4
+; CHECK-NEXT: ld1 { v6.b }[5], [x9]
+; CHECK-NEXT: add x9, sp, #272
+; CHECK-NEXT: ldr b16, [sp, #344]
+; CHECK-NEXT: ld1 { v5.b }[6], [x8]
+; CHECK-NEXT: add x8, sp, #728
+; CHECK-NEXT: ld1 { v4.b }[7], [x9]
+; CHECK-NEXT: add x9, sp, #504
+; CHECK-NEXT: ldr b17, [sp, #144]
+; CHECK-NEXT: sshll v23.4s, v23.4h, #0
+; CHECK-NEXT: ld1 { v6.b }[6], [x8]
+; CHECK-NEXT: ld1 { v7.b }[3], [x9]
+; CHECK-NEXT: add x8, sp, #736
+; CHECK-NEXT: add x9, sp, #512
+; CHECK-NEXT: ld1 { v5.b }[7], [x10]
; CHECK-NEXT: add x10, sp, #568
-; CHECK-NEXT: ld1 { v0.b }[7], [x9]
-; CHECK-NEXT: ld1 { v4.b }[4], [x14]
+; CHECK-NEXT: mov v3.b[5], w5
+; CHECK-NEXT: smull v16.8h, v17.8b, v16.8b
+; CHECK-NEXT: movi v17.2d, #0000000000000000
+; CHECK-NEXT: ld1 { v6.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #560
+; CHECK-NEXT: ld1 { v7.b }[4], [x9]
+; CHECK-NEXT: ld1 { v19.b }[1], [x8]
+; CHECK-NEXT: add x8, sp, #360
+; CHECK-NEXT: add x9, sp, #424
+; CHECK-NEXT: ld1 { v20.b }[1], [x8]
+; CHECK-NEXT: add x8, sp, #624
+; CHECK-NEXT: ld1 { v22.b }[1], [x9]
+; CHECK-NEXT: ld1 { v21.b }[1], [x8]
; CHECK-NEXT: add x9, sp, #368
-; CHECK-NEXT: ld1 { v18.b }[2], [x10]
-; CHECK-NEXT: add x11, sp, #496
-; CHECK-NEXT: ld1 { v16.b }[2], [x9]
-; CHECK-NEXT: fmov s5, w0
-; CHECK-NEXT: add x9, sp, #192
-; CHECK-NEXT: ld1 { v6.b }[2], [x11]
-; CHECK-NEXT: add x10, sp, #576
-; CHECK-NEXT: ld1 { v4.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #376
-; CHECK-NEXT: ld1 { v18.b }[3], [x10]
-; CHECK-NEXT: add x11, sp, #504
-; CHECK-NEXT: ld1 { v16.b }[3], [x9]
-; CHECK-NEXT: mov v5.b[1], w1
-; CHECK-NEXT: ldr b7, [sp, #144]
-; CHECK-NEXT: ldr b17, [sp, #344]
-; CHECK-NEXT: add x9, sp, #200
-; CHECK-NEXT: ld1 { v6.b }[3], [x11]
+; CHECK-NEXT: add x8, sp, #520
+; CHECK-NEXT: ld1 { v19.b }[2], [x10]
+; CHECK-NEXT: add x10, sp, #432
+; CHECK-NEXT: ld1 { v7.b }[5], [x8]
+; CHECK-NEXT: ld1 { v20.b }[2], [x9]
+; CHECK-NEXT: add x9, sp, #632
+; CHECK-NEXT: ld1 { v22.b }[2], [x10]
+; CHECK-NEXT: ld1 { v21.b }[2], [x9]
+; CHECK-NEXT: add x8, sp, #376
+; CHECK-NEXT: add x9, sp, #440
+; CHECK-NEXT: ld1 { v19.b }[3], [x11]
; CHECK-NEXT: add x10, sp, #584
-; CHECK-NEXT: ld1 { v4.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #384
-; CHECK-NEXT: ld1 { v18.b }[4], [x10]
-; CHECK-NEXT: sshll v7.8h, v7.8b, #0
-; CHECK-NEXT: sshll v17.8h, v17.8b, #0
-; CHECK-NEXT: add x11, sp, #512
-; CHECK-NEXT: ld1 { v16.b }[4], [x9]
-; CHECK-NEXT: ld1 { v6.b }[4], [x11]
; CHECK-NEXT: add x11, sp, #592
-; CHECK-NEXT: mov v5.b[2], w2
-; CHECK-NEXT: add x10, sp, #392
-; CHECK-NEXT: ldr b19, [sp, #680]
-; CHECK-NEXT: ld1 { v18.b }[5], [x11]
-; CHECK-NEXT: smull v7.4s, v7.4h, v17.4h
-; CHECK-NEXT: ldr b17, [sp, #416]
-; CHECK-NEXT: ld1 { v16.b }[5], [x10]
-; CHECK-NEXT: add x10, sp, #688
-; CHECK-NEXT: add x12, sp, #328
-; CHECK-NEXT: add x9, sp, #424
-; CHECK-NEXT: ld1 { v19.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #600
-; CHECK-NEXT: ldr b20, [sp, #616]
-; CHECK-NEXT: ld1 { v2.b }[6], [x12]
-; CHECK-NEXT: ld1 { v17.b }[1], [x9]
-; CHECK-NEXT: add x11, sp, #400
-; CHECK-NEXT: ld1 { v18.b }[6], [x10]
-; CHECK-NEXT: add x12, sp, #624
-; CHECK-NEXT: mov v5.b[3], w3
-; CHECK-NEXT: ld1 { v16.b }[6], [x11]
-; CHECK-NEXT: add x11, sp, #696
-; CHECK-NEXT: ld1 { v20.b }[1], [x12]
-; CHECK-NEXT: add x9, sp, #432
-; CHECK-NEXT: ld1 { v19.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #608
-; CHECK-NEXT: ld1 { v17.b }[2], [x9]
-; CHECK-NEXT: add x10, sp, #408
-; CHECK-NEXT: ld1 { v18.b }[7], [x11]
-; CHECK-NEXT: add x11, sp, #632
-; CHECK-NEXT: ld1 { v16.b }[7], [x10]
-; CHECK-NEXT: ld1 { v20.b }[2], [x11]
-; CHECK-NEXT: mov v5.b[4], w4
-; CHECK-NEXT: add x10, sp, #704
-; CHECK-NEXT: add x12, sp, #440
-; CHECK-NEXT: ld1 { v19.b }[3], [x10]
+; CHECK-NEXT: ld1 { v20.b }[3], [x8]
+; CHECK-NEXT: add x8, sp, #640
+; CHECK-NEXT: ld1 { v22.b }[3], [x9]
+; CHECK-NEXT: ld1 { v21.b }[3], [x8]
+; CHECK-NEXT: add x9, sp, #384
+; CHECK-NEXT: add x8, sp, #528
+; CHECK-NEXT: ld1 { v19.b }[4], [x10]
; CHECK-NEXT: add x10, sp, #448
-; CHECK-NEXT: ld1 { v17.b }[3], [x12]
-; CHECK-NEXT: add x12, sp, #640
-; CHECK-NEXT: sshll v21.8h, v16.8b, #0
-; CHECK-NEXT: ld1 { v20.b }[3], [x12]
-; CHECK-NEXT: sshll v18.8h, v18.8b, #0
-; CHECK-NEXT: add x11, sp, #712
-; CHECK-NEXT: mov v5.b[5], w5
-; CHECK-NEXT: ld1 { v19.b }[4], [x11]
-; CHECK-NEXT: add x9, sp, #520
-; CHECK-NEXT: ld1 { v17.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #648
-; CHECK-NEXT: ldr b22, [sp, #544]
-; CHECK-NEXT: ld1 { v20.b }[4], [x10]
-; CHECK-NEXT: smull2 v16.4s, v21.8h, v18.8h
-; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h
-; CHECK-NEXT: ldr b21, [sp, #744]
-; CHECK-NEXT: add x11, sp, #720
-; CHECK-NEXT: ld1 { v6.b }[5], [x9]
+; CHECK-NEXT: ld1 { v7.b }[6], [x8]
+; CHECK-NEXT: ld1 { v20.b }[4], [x9]
+; CHECK-NEXT: add x9, sp, #648
+; CHECK-NEXT: ld1 { v22.b }[4], [x10]
+; CHECK-NEXT: ld1 { v21.b }[4], [x9]
+; CHECK-NEXT: add x8, sp, #392
; CHECK-NEXT: add x9, sp, #456
; CHECK-NEXT: ld1 { v19.b }[5], [x11]
-; CHECK-NEXT: mov v5.b[6], w6
-; CHECK-NEXT: ld1 { v17.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #656
-; CHECK-NEXT: sshll v22.8h, v22.8b, #0
-; CHECK-NEXT: sshll v21.8h, v21.8b, #0
-; CHECK-NEXT: ld1 { v20.b }[5], [x9]
-; CHECK-NEXT: add x10, sp, #528
-; CHECK-NEXT: add x11, sp, #728
-; CHECK-NEXT: ld1 { v6.b }[6], [x10]
+; CHECK-NEXT: mov v3.b[6], w6
+; CHECK-NEXT: add x10, sp, #600
+; CHECK-NEXT: ld1 { v20.b }[5], [x8]
+; CHECK-NEXT: add x8, sp, #656
+; CHECK-NEXT: ld1 { v22.b }[5], [x9]
+; CHECK-NEXT: ld1 { v21.b }[5], [x8]
+; CHECK-NEXT: add x9, sp, #400
+; CHECK-NEXT: add x8, sp, #536
+; CHECK-NEXT: ld1 { v19.b }[6], [x10]
; CHECK-NEXT: add x10, sp, #464
-; CHECK-NEXT: ld1 { v19.b }[6], [x11]
-; CHECK-NEXT: add x11, sp, #664
-; CHECK-NEXT: ld1 { v17.b }[6], [x10]
-; CHECK-NEXT: smull v21.4s, v22.4h, v21.4h
-; CHECK-NEXT: movi v22.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v20.b }[6], [x11]
-; CHECK-NEXT: mov v5.b[7], w7
-; CHECK-NEXT: add x9, sp, #536
-; CHECK-NEXT: add x10, sp, #736
-; CHECK-NEXT: add x11, sp, #208
-; CHECK-NEXT: add x13, sp, #264
-; CHECK-NEXT: ld1 { v6.b }[7], [x9]
-; CHECK-NEXT: ld1 { v19.b }[7], [x10]
-; CHECK-NEXT: ld1 { v4.b }[7], [x11]
+; CHECK-NEXT: ld1 { v7.b }[7], [x8]
+; CHECK-NEXT: ld1 { v20.b }[6], [x9]
+; CHECK-NEXT: add x9, sp, #664
+; CHECK-NEXT: ld1 { v22.b }[6], [x10]
+; CHECK-NEXT: ld1 { v21.b }[6], [x9]
+; CHECK-NEXT: add x8, sp, #408
+; CHECK-NEXT: mov v3.b[7], w7
+; CHECK-NEXT: sshll v18.4s, v16.4h, #0
+; CHECK-NEXT: movi v16.2d, #0000000000000000
+; CHECK-NEXT: add x11, sp, #608
+; CHECK-NEXT: ld1 { v20.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #672
; CHECK-NEXT: add x9, sp, #472
-; CHECK-NEXT: add x10, sp, #672
-; CHECK-NEXT: ld1 { v3.b }[6], [x13]
-; CHECK-NEXT: ld1 { v17.b }[7], [x9]
-; CHECK-NEXT: ld1 { v20.b }[7], [x10]
-; CHECK-NEXT: add x8, sp, #336
-; CHECK-NEXT: mov v22.s[0], v21.s[0]
-; CHECK-NEXT: movi v21.2d, #0000000000000000
-; CHECK-NEXT: sshll v5.8h, v5.8b, #0
-; CHECK-NEXT: sshll v6.8h, v6.8b, #0
-; CHECK-NEXT: sshll v19.8h, v19.8b, #0
-; CHECK-NEXT: ld1 { v2.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #272
-; CHECK-NEXT: sshll v4.8h, v4.8b, #0
-; CHECK-NEXT: ld1 { v3.b }[7], [x8]
-; CHECK-NEXT: sshll v17.8h, v17.8b, #0
-; CHECK-NEXT: sshll v20.8h, v20.8b, #0
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: smlal v18.4s, v6.4h, v19.4h
-; CHECK-NEXT: smlal2 v16.4s, v6.8h, v19.8h
-; CHECK-NEXT: mov v21.s[0], v7.s[0]
-; CHECK-NEXT: smull v6.4s, v5.4h, v4.4h
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-NEXT: smlal v22.4s, v17.4h, v20.4h
-; CHECK-NEXT: smull2 v4.4s, v5.8h, v4.8h
-; CHECK-NEXT: smlal v21.4s, v1.4h, v3.4h
-; CHECK-NEXT: smlal2 v16.4s, v17.8h, v20.8h
-; CHECK-NEXT: smlal v6.4s, v0.4h, v2.4h
-; CHECK-NEXT: add v5.4s, v18.4s, v22.4s
-; CHECK-NEXT: smlal2 v4.4s, v0.8h, v2.8h
-; CHECK-NEXT: add v0.4s, v6.4s, v21.4s
-; CHECK-NEXT: add v2.4s, v5.4s, v16.4s
-; CHECK-NEXT: smlal2 v4.4s, v1.8h, v3.8h
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: ld1 { v19.b }[7], [x11]
+; CHECK-NEXT: ld1 { v21.b }[7], [x8]
+; CHECK-NEXT: ld1 { v22.b }[7], [x9]
+; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: smull v1.8h, v3.8b, v2.8b
+; CHECK-NEXT: smull v2.8h, v5.8b, v4.8b
+; CHECK-NEXT: mov v17.s[0], v18.s[0]
+; CHECK-NEXT: smull v3.8h, v7.8b, v6.8b
+; CHECK-NEXT: mov v16.s[0], v23.s[0]
+; CHECK-NEXT: smull v4.8h, v20.8b, v19.8b
+; CHECK-NEXT: smull v5.8h, v22.8b, v21.8b
+; CHECK-NEXT: saddl v7.4s, v1.4h, v0.4h
+; CHECK-NEXT: saddl2 v0.4s, v1.8h, v0.8h
+; CHECK-NEXT: saddw v6.4s, v17.4s, v2.4h
+; CHECK-NEXT: saddl v1.4s, v4.4h, v3.4h
+; CHECK-NEXT: saddl2 v3.4s, v4.8h, v3.8h
+; CHECK-NEXT: saddw v4.4s, v16.4s, v5.4h
+; CHECK-NEXT: saddw2 v0.4s, v0.4s, v2.8h
+; CHECK-NEXT: add v6.4s, v7.4s, v6.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: saddw2 v2.4s, v3.4s, v5.8h
+; CHECK-NEXT: add v0.4s, v6.4s, v0.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1586,32 +1558,24 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr b0, [x0, #32]
; CHECK-NEXT: ldr b1, [x1, #32]
-; CHECK-NEXT: ldp q2, q4, [x0]
-; CHECK-NEXT: ldp q3, q6, [x1]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v5.8h, v2.8b, #0
-; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: ushll2 v16.8h, v4.16b, #0
-; CHECK-NEXT: ushll v7.8h, v3.8b, #0
-; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0
-; CHECK-NEXT: ushll v4.8h, v4.8b, #0
-; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: ushll2 v19.8h, v6.16b, #0
-; CHECK-NEXT: ushll v6.8h, v6.8b, #0
-; CHECK-NEXT: umull2 v17.4s, v7.8h, v5.8h
-; CHECK-NEXT: umull2 v18.4s, v3.8h, v2.8h
-; CHECK-NEXT: mov v1.s[0], v0.s[0]
-; CHECK-NEXT: umull v0.4s, v3.4h, v2.4h
-; CHECK-NEXT: umlal2 v18.4s, v19.8h, v16.8h
-; CHECK-NEXT: umlal2 v17.4s, v6.8h, v4.8h
-; CHECK-NEXT: umlal v1.4s, v7.4h, v5.4h
-; CHECK-NEXT: umlal v0.4s, v19.4h, v16.4h
-; CHECK-NEXT: add v2.4s, v17.4s, v18.4s
-; CHECK-NEXT: umlal v1.4s, v6.4h, v4.4h
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: movi v5.2d, #0000000000000000
+; CHECK-NEXT: ldp q4, q2, [x1]
+; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: ldp q3, q1, [x0]
+; CHECK-NEXT: umull v6.8h, v2.8b, v1.8b
+; CHECK-NEXT: umull2 v1.8h, v2.16b, v1.16b
+; CHECK-NEXT: umull v2.8h, v4.8b, v3.8b
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: umull2 v3.8h, v4.16b, v3.16b
+; CHECK-NEXT: mov v5.s[0], v0.s[0]
+; CHECK-NEXT: uaddl2 v4.4s, v2.8h, v6.8h
+; CHECK-NEXT: uaddl2 v0.4s, v3.8h, v1.8h
+; CHECK-NEXT: uaddl v1.4s, v3.4h, v1.4h
+; CHECK-NEXT: add v0.4s, v4.4s, v0.4s
+; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT: uaddw v2.4s, v2.4s, v6.4h
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
@@ -1662,32 +1626,24 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr b0, [x0, #32]
; CHECK-NEXT: ldr b1, [x1, #32]
-; CHECK-NEXT: ldp q2, q4, [x0]
-; CHECK-NEXT: ldp q3, q6, [x1]
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll v5.8h, v2.8b, #0
-; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: sshll2 v16.8h, v4.16b, #0
-; CHECK-NEXT: sshll v7.8h, v3.8b, #0
-; CHECK-NEXT: sshll2 v3.8h, v3.16b, #0
-; CHECK-NEXT: sshll v4.8h, v4.8b, #0
-; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: sshll2 v19.8h, v6.16b, #0
-; CHECK-NEXT: sshll v6.8h, v6.8b, #0
-; CHECK-NEXT: smull2 v17.4s, v7.8h, v5.8h
-; CHECK-NEXT: smull2 v18.4s, v3.8h, v2.8h
-; CHECK-NEXT: mov v1.s[0], v0.s[0]
-; CHECK-NEXT: smull v0.4s, v3.4h, v2.4h
-; CHECK-NEXT: smlal2 v18.4s, v19.8h, v16.8h
-; CHECK-NEXT: smlal2 v17.4s, v6.8h, v4.8h
-; CHECK-NEXT: smlal v1.4s, v7.4h, v5.4h
-; CHECK-NEXT: smlal v0.4s, v19.4h, v16.4h
-; CHECK-NEXT: add v2.4s, v17.4s, v18.4s
-; CHECK-NEXT: smlal v1.4s, v6.4h, v4.4h
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: movi v5.2d, #0000000000000000
+; CHECK-NEXT: ldp q4, q2, [x1]
+; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: ldp q3, q1, [x0]
+; CHECK-NEXT: smull v6.8h, v2.8b, v1.8b
+; CHECK-NEXT: smull2 v1.8h, v2.16b, v1.16b
+; CHECK-NEXT: smull v2.8h, v4.8b, v3.8b
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: smull2 v3.8h, v4.16b, v3.16b
+; CHECK-NEXT: mov v5.s[0], v0.s[0]
+; CHECK-NEXT: saddl2 v4.4s, v2.8h, v6.8h
+; CHECK-NEXT: saddl2 v0.4s, v3.8h, v1.8h
+; CHECK-NEXT: saddl v1.4s, v3.4h, v1.4h
+; CHECK-NEXT: add v0.4s, v4.4s, v0.4s
+; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT: saddw v2.4s, v2.4s, v6.4h
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
@@ -1709,291 +1665,275 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: fmov s4, w0
-; CHECK-NEXT: ldr b0, [sp, #80]
-; CHECK-NEXT: add x8, sp, #88
-; CHECK-NEXT: ldr b1, [sp, #144]
-; CHECK-NEXT: add x10, sp, #152
-; CHECK-NEXT: ldr b6, [sp, #16]
+; CHECK-NEXT: ldr b0, [sp, #344]
+; CHECK-NEXT: add x8, sp, #352
+; CHECK-NEXT: ldr b2, [sp, #80]
+; CHECK-NEXT: add x9, sp, #88
+; CHECK-NEXT: ldr b3, [sp, #216]
+; CHECK-NEXT: add x10, sp, #232
; CHECK-NEXT: ld1 { v0.b }[1], [x8]
-; CHECK-NEXT: add x9, sp, #96
-; CHECK-NEXT: ldr b2, [sp, #344]
-; CHECK-NEXT: mov v4.b[1], w1
-; CHECK-NEXT: ld1 { v1.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #24
-; CHECK-NEXT: ld1 { v6.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #352
-; CHECK-NEXT: add x8, sp, #104
+; CHECK-NEXT: add x8, sp, #224
+; CHECK-NEXT: ld1 { v2.b }[1], [x9]
+; CHECK-NEXT: add x9, sp, #360
+; CHECK-NEXT: ld1 { v3.b }[1], [x8]
+; CHECK-NEXT: add x8, sp, #96
+; CHECK-NEXT: add x11, sp, #376
+; CHECK-NEXT: ldr b4, [sp, #408]
+; CHECK-NEXT: add x12, sp, #384
; CHECK-NEXT: ld1 { v0.b }[2], [x9]
-; CHECK-NEXT: add x9, sp, #160
-; CHECK-NEXT: ld1 { v2.b }[1], [x10]
-; CHECK-NEXT: ld1 { v1.b }[2], [x9]
-; CHECK-NEXT: add x10, sp, #32
-; CHECK-NEXT: add x11, sp, #112
-; CHECK-NEXT: mov v4.b[2], w2
-; CHECK-NEXT: ld1 { v6.b }[2], [x10]
-; CHECK-NEXT: add x10, sp, #168
-; CHECK-NEXT: ld1 { v0.b }[3], [x8]
-; CHECK-NEXT: ldr b5, [sp, #216]
-; CHECK-NEXT: add x13, sp, #224
-; CHECK-NEXT: ld1 { v1.b }[3], [x10]
-; CHECK-NEXT: add x10, sp, #40
-; CHECK-NEXT: add x12, sp, #120
-; CHECK-NEXT: ld1 { v6.b }[3], [x10]
-; CHECK-NEXT: add x10, sp, #176
-; CHECK-NEXT: ld1 { v5.b }[1], [x13]
-; CHECK-NEXT: mov v4.b[3], w3
-; CHECK-NEXT: ld1 { v0.b }[4], [x11]
-; CHECK-NEXT: add x11, sp, #48
-; CHECK-NEXT: add x8, sp, #360
-; CHECK-NEXT: ld1 { v1.b }[4], [x10]
-; CHECK-NEXT: add x13, sp, #56
-; CHECK-NEXT: ld1 { v6.b }[4], [x11]
-; CHECK-NEXT: ldr b7, [sp, #280]
+; CHECK-NEXT: add x9, sp, #368
; CHECK-NEXT: ld1 { v2.b }[2], [x8]
-; CHECK-NEXT: add x15, sp, #232
+; CHECK-NEXT: ld1 { v3.b }[2], [x10]
+; CHECK-NEXT: add x8, sp, #104
+; CHECK-NEXT: add x14, sp, #248
+; CHECK-NEXT: add x10, sp, #392
+; CHECK-NEXT: ldr b5, [sp, #144]
+; CHECK-NEXT: ldr b6, [sp, #280]
+; CHECK-NEXT: ld1 { v0.b }[3], [x9]
+; CHECK-NEXT: add x9, sp, #240
+; CHECK-NEXT: ld1 { v2.b }[3], [x8]
+; CHECK-NEXT: ld1 { v3.b }[3], [x9]
+; CHECK-NEXT: add x9, sp, #112
+; CHECK-NEXT: add x8, sp, #400
+; CHECK-NEXT: add x13, sp, #128
+; CHECK-NEXT: ldr b17, [sp, #744]
+; CHECK-NEXT: ldr b19, [sp, #480]
+; CHECK-NEXT: ld1 { v0.b }[4], [x11]
+; CHECK-NEXT: ld1 { v2.b }[4], [x9]
+; CHECK-NEXT: add x9, sp, #416
+; CHECK-NEXT: ld1 { v4.b }[1], [x9]
+; CHECK-NEXT: ld1 { v3.b }[4], [x14]
+; CHECK-NEXT: add x11, sp, #120
+; CHECK-NEXT: add x9, sp, #136
+; CHECK-NEXT: ldr b21, [sp, #936]
+; CHECK-NEXT: ldr b22, [sp, #672]
; CHECK-NEXT: ld1 { v0.b }[5], [x12]
-; CHECK-NEXT: add x14, sp, #184
-; CHECK-NEXT: mov v4.b[4], w4
-; CHECK-NEXT: ld1 { v5.b }[2], [x15]
-; CHECK-NEXT: add x9, sp, #128
-; CHECK-NEXT: ld1 { v6.b }[5], [x13]
-; CHECK-NEXT: add x13, sp, #288
-; CHECK-NEXT: add x10, sp, #368
-; CHECK-NEXT: ld1 { v7.b }[1], [x13]
-; CHECK-NEXT: ld1 { v1.b }[5], [x14]
-; CHECK-NEXT: ld1 { v2.b }[3], [x10]
-; CHECK-NEXT: add x15, sp, #240
-; CHECK-NEXT: ld1 { v0.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #296
-; CHECK-NEXT: mov v4.b[5], w5
-; CHECK-NEXT: add x11, sp, #192
-; CHECK-NEXT: ld1 { v5.b }[3], [x15]
-; CHECK-NEXT: ldr b3, [sp, #408]
-; CHECK-NEXT: ld1 { v7.b }[2], [x9]
-; CHECK-NEXT: add x12, sp, #64
-; CHECK-NEXT: add x13, sp, #376
-; CHECK-NEXT: ld1 { v1.b }[6], [x11]
-; CHECK-NEXT: add x11, sp, #416
-; CHECK-NEXT: ld1 { v6.b }[6], [x12]
-; CHECK-NEXT: add x12, sp, #248
-; CHECK-NEXT: ld1 { v3.b }[1], [x11]
-; CHECK-NEXT: mov v4.b[6], w6
-; CHECK-NEXT: ld1 { v2.b }[4], [x13]
-; CHECK-NEXT: add x11, sp, #304
-; CHECK-NEXT: ld1 { v5.b }[4], [x12]
-; CHECK-NEXT: ld1 { v7.b }[3], [x11]
-; CHECK-NEXT: add x8, sp, #136
-; CHECK-NEXT: add x15, sp, #384
-; CHECK-NEXT: add x9, sp, #424
-; CHECK-NEXT: ld1 { v0.b }[7], [x8]
-; CHECK-NEXT: ld1 { v3.b }[2], [x9]
-; CHECK-NEXT: ld1 { v2.b }[5], [x15]
-; CHECK-NEXT: add x8, sp, #312
-; CHECK-NEXT: mov v4.b[7], w7
-; CHECK-NEXT: add x9, sp, #256
-; CHECK-NEXT: add x10, sp, #200
-; CHECK-NEXT: ld1 { v7.b }[4], [x8]
-; CHECK-NEXT: ld1 { v5.b }[5], [x9]
-; CHECK-NEXT: add x14, sp, #72
-; CHECK-NEXT: ld1 { v1.b }[7], [x10]
-; CHECK-NEXT: add x10, sp, #432
-; CHECK-NEXT: add x8, sp, #392
-; CHECK-NEXT: ld1 { v6.b }[7], [x14]
-; CHECK-NEXT: ld1 { v3.b }[3], [x10]
-; CHECK-NEXT: ld1 { v2.b }[6], [x8]
-; CHECK-NEXT: add x8, sp, #320
-; CHECK-NEXT: add x9, sp, #264
-; CHECK-NEXT: sshll v21.8h, v4.8b, #0
-; CHECK-NEXT: ldr b4, [sp, #208]
-; CHECK-NEXT: ld1 { v7.b }[5], [x8]
-; CHECK-NEXT: ld1 { v5.b }[6], [x9]
+; CHECK-NEXT: ld1 { v2.b }[5], [x11]
+; CHECK-NEXT: add x11, sp, #424
+; CHECK-NEXT: add x12, sp, #256
+; CHECK-NEXT: ld1 { v4.b }[2], [x11]
+; CHECK-NEXT: add x11, sp, #152
+; CHECK-NEXT: ld1 { v3.b }[5], [x12]
+; CHECK-NEXT: ld1 { v5.b }[1], [x11]
+; CHECK-NEXT: add x11, sp, #432
+; CHECK-NEXT: ld1 { v0.b }[6], [x10]
+; CHECK-NEXT: add x10, sp, #264
+; CHECK-NEXT: ld1 { v2.b }[6], [x13]
+; CHECK-NEXT: ld1 { v4.b }[3], [x11]
+; CHECK-NEXT: add x11, sp, #160
+; CHECK-NEXT: ldr b7, [sp, #472]
+; CHECK-NEXT: ld1 { v3.b }[6], [x10]
+; CHECK-NEXT: ld1 { v5.b }[2], [x11]
; CHECK-NEXT: add x10, sp, #440
-; CHECK-NEXT: add x8, sp, #400
-; CHECK-NEXT: sshll v16.8h, v6.8b, #0
-; CHECK-NEXT: sshll v6.8h, v4.8b, #0
-; CHECK-NEXT: ld1 { v3.b }[4], [x10]
-; CHECK-NEXT: ld1 { v2.b }[7], [x8]
+; CHECK-NEXT: ld1 { v0.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #288
+; CHECK-NEXT: add x11, sp, #168
+; CHECK-NEXT: ld1 { v6.b }[1], [x8]
; CHECK-NEXT: add x8, sp, #272
-; CHECK-NEXT: add x9, sp, #328
-; CHECK-NEXT: ldr b4, [sp, #608]
-; CHECK-NEXT: ld1 { v7.b }[6], [x9]
-; CHECK-NEXT: ld1 { v5.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #616
-; CHECK-NEXT: add x10, sp, #448
-; CHECK-NEXT: ld1 { v4.b }[1], [x8]
-; CHECK-NEXT: ldr b18, [sp, #480]
-; CHECK-NEXT: ld1 { v3.b }[5], [x10]
-; CHECK-NEXT: add x9, sp, #336
-; CHECK-NEXT: ldr b17, [sp, #472]
-; CHECK-NEXT: add x8, sp, #488
-; CHECK-NEXT: ld1 { v7.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #624
-; CHECK-NEXT: ld1 { v18.b }[1], [x8]
-; CHECK-NEXT: sshll v22.8h, v5.8b, #0
-; CHECK-NEXT: add x8, sp, #456
-; CHECK-NEXT: sshll v5.8h, v17.8b, #0
-; CHECK-NEXT: ld1 { v4.b }[2], [x9]
-; CHECK-NEXT: ld1 { v3.b }[6], [x8]
-; CHECK-NEXT: add x8, sp, #496
-; CHECK-NEXT: sshll v17.8h, v7.8b, #0
-; CHECK-NEXT: add x10, sp, #632
-; CHECK-NEXT: ld1 { v18.b }[2], [x8]
-; CHECK-NEXT: add x9, sp, #464
-; CHECK-NEXT: add x8, sp, #504
-; CHECK-NEXT: smull v19.4s, v6.4h, v5.4h
-; CHECK-NEXT: movi v5.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v4.b }[3], [x10]
-; CHECK-NEXT: ld1 { v3.b }[7], [x9]
-; CHECK-NEXT: smull v6.4s, v16.4h, v17.4h
-; CHECK-NEXT: add x9, sp, #640
-; CHECK-NEXT: ld1 { v18.b }[3], [x8]
-; CHECK-NEXT: smull2 v16.4s, v16.8h, v17.8h
-; CHECK-NEXT: ldr b17, [sp, #672]
-; CHECK-NEXT: ld1 { v4.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #680
-; CHECK-NEXT: ldr b20, [sp, #544]
-; CHECK-NEXT: mov v5.s[0], v19.s[0]
-; CHECK-NEXT: add x8, sp, #512
-; CHECK-NEXT: ld1 { v17.b }[1], [x9]
-; CHECK-NEXT: add x11, sp, #552
-; CHECK-NEXT: add x10, sp, #648
-; CHECK-NEXT: ld1 { v18.b }[4], [x8]
-; CHECK-NEXT: ld1 { v20.b }[1], [x11]
-; CHECK-NEXT: ld1 { v4.b }[5], [x10]
-; CHECK-NEXT: add x10, sp, #688
-; CHECK-NEXT: add x9, sp, #520
-; CHECK-NEXT: ld1 { v17.b }[2], [x10]
-; CHECK-NEXT: add x10, sp, #560
-; CHECK-NEXT: smull2 v7.4s, v21.8h, v22.8h
-; CHECK-NEXT: ld1 { v18.b }[5], [x9]
-; CHECK-NEXT: smlal v5.4s, v21.4h, v22.4h
-; CHECK-NEXT: ld1 { v20.b }[2], [x10]
-; CHECK-NEXT: ldr b21, [sp, #736]
-; CHECK-NEXT: ldr b22, [sp, #1000]
-; CHECK-NEXT: add x8, sp, #656
-; CHECK-NEXT: add x9, sp, #696
-; CHECK-NEXT: add x11, sp, #568
-; CHECK-NEXT: ld1 { v4.b }[6], [x8]
-; CHECK-NEXT: add x8, sp, #528
-; CHECK-NEXT: ld1 { v17.b }[3], [x9]
-; CHECK-NEXT: sshll v21.8h, v21.8b, #0
-; CHECK-NEXT: sshll v24.8h, v22.8b, #0
-; CHECK-NEXT: ld1 { v18.b }[6], [x8]
-; CHECK-NEXT: ld1 { v20.b }[3], [x11]
-; CHECK-NEXT: add x10, sp, #704
-; CHECK-NEXT: ldr b23, [sp, #808]
-; CHECK-NEXT: movi v19.2d, #0000000000000000
-; CHECK-NEXT: add x9, sp, #536
-; CHECK-NEXT: ld1 { v17.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #576
-; CHECK-NEXT: ldr b22, [sp, #744]
-; CHECK-NEXT: add x11, sp, #816
-; CHECK-NEXT: smull v24.4s, v21.4h, v24.4h
-; CHECK-NEXT: ld1 { v18.b }[7], [x9]
-; CHECK-NEXT: ld1 { v20.b }[4], [x10]
+; CHECK-NEXT: ld1 { v4.b }[4], [x10]
+; CHECK-NEXT: ld1 { v3.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #296
+; CHECK-NEXT: ld1 { v5.b }[3], [x11]
+; CHECK-NEXT: ld1 { v2.b }[7], [x9]
+; CHECK-NEXT: add x9, sp, #448
+; CHECK-NEXT: add x10, sp, #176
+; CHECK-NEXT: ld1 { v6.b }[2], [x8]
+; CHECK-NEXT: ld1 { v4.b }[5], [x9]
+; CHECK-NEXT: add x8, sp, #304
+; CHECK-NEXT: ld1 { v5.b }[4], [x10]
+; CHECK-NEXT: add x9, sp, #456
+; CHECK-NEXT: add x10, sp, #184
+; CHECK-NEXT: add x11, sp, #192
+; CHECK-NEXT: ldr b16, [sp, #208]
+; CHECK-NEXT: add x12, sp, #784
+; CHECK-NEXT: ld1 { v6.b }[3], [x8]
+; CHECK-NEXT: ld1 { v4.b }[6], [x9]
+; CHECK-NEXT: add x9, sp, #312
+; CHECK-NEXT: ld1 { v5.b }[5], [x10]
; CHECK-NEXT: add x10, sp, #752
-; CHECK-NEXT: ld1 { v23.b }[1], [x11]
-; CHECK-NEXT: add x9, sp, #712
+; CHECK-NEXT: smull v7.8h, v16.8b, v7.8b
+; CHECK-NEXT: ld1 { v17.b }[1], [x10]
+; CHECK-NEXT: add x10, sp, #760
+; CHECK-NEXT: ldr b16, [sp, #16]
+; CHECK-NEXT: ld1 { v6.b }[4], [x9]
+; CHECK-NEXT: add x9, sp, #320
+; CHECK-NEXT: ldr b18, [sp, #1000]
+; CHECK-NEXT: ld1 { v5.b }[6], [x11]
+; CHECK-NEXT: add x11, sp, #768
+; CHECK-NEXT: ldr b20, [sp, #736]
+; CHECK-NEXT: ld1 { v17.b }[2], [x10]
+; CHECK-NEXT: add x10, sp, #680
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: ld1 { v6.b }[5], [x9]
+; CHECK-NEXT: add x9, sp, #488
; CHECK-NEXT: ld1 { v22.b }[1], [x10]
-; CHECK-NEXT: ld1 { v17.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #584
-; CHECK-NEXT: add x10, sp, #824
-; CHECK-NEXT: sshll v21.8h, v18.8b, #0
-; CHECK-NEXT: ld1 { v20.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #760
-; CHECK-NEXT: ldr b18, [sp, #936]
-; CHECK-NEXT: ld1 { v23.b }[2], [x10]
-; CHECK-NEXT: mov v19.s[0], v24.s[0]
-; CHECK-NEXT: ldr b24, [sp, #872]
-; CHECK-NEXT: ld1 { v22.b }[2], [x9]
+; CHECK-NEXT: ld1 { v19.b }[1], [x9]
; CHECK-NEXT: add x9, sp, #944
-; CHECK-NEXT: add x11, sp, #880
-; CHECK-NEXT: add x10, sp, #768
-; CHECK-NEXT: ld1 { v18.b }[1], [x9]
-; CHECK-NEXT: add x9, sp, #832
-; CHECK-NEXT: ld1 { v24.b }[1], [x11]
-; CHECK-NEXT: ld1 { v23.b }[3], [x9]
-; CHECK-NEXT: ld1 { v22.b }[3], [x10]
-; CHECK-NEXT: add x10, sp, #952
-; CHECK-NEXT: add x12, sp, #888
-; CHECK-NEXT: add x9, sp, #592
+; CHECK-NEXT: add x10, sp, #688
+; CHECK-NEXT: ld1 { v21.b }[1], [x9]
+; CHECK-NEXT: add x9, sp, #496
+; CHECK-NEXT: ld1 { v17.b }[3], [x11]
+; CHECK-NEXT: ld1 { v22.b }[2], [x10]
; CHECK-NEXT: add x11, sp, #776
-; CHECK-NEXT: ld1 { v18.b }[2], [x10]
-; CHECK-NEXT: add x10, sp, #840
-; CHECK-NEXT: ld1 { v24.b }[2], [x12]
-; CHECK-NEXT: ld1 { v23.b }[4], [x10]
-; CHECK-NEXT: ld1 { v22.b }[4], [x11]
-; CHECK-NEXT: ld1 { v20.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #960
-; CHECK-NEXT: add x11, sp, #896
-; CHECK-NEXT: add x10, sp, #784
-; CHECK-NEXT: ld1 { v18.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #848
-; CHECK-NEXT: ld1 { v24.b }[3], [x11]
-; CHECK-NEXT: ld1 { v23.b }[5], [x9]
-; CHECK-NEXT: ld1 { v22.b }[5], [x10]
-; CHECK-NEXT: add x10, sp, #968
-; CHECK-NEXT: add x12, sp, #904
-; CHECK-NEXT: add x9, sp, #600
+; CHECK-NEXT: add x10, sp, #504
+; CHECK-NEXT: ld1 { v19.b }[2], [x9]
+; CHECK-NEXT: add x9, sp, #952
+; CHECK-NEXT: smull v20.8h, v20.8b, v18.8b
+; CHECK-NEXT: ld1 { v21.b }[2], [x9]
+; CHECK-NEXT: ld1 { v17.b }[4], [x11]
+; CHECK-NEXT: add x11, sp, #696
+; CHECK-NEXT: add x9, sp, #24
+; CHECK-NEXT: ld1 { v22.b }[3], [x11]
; CHECK-NEXT: add x11, sp, #792
-; CHECK-NEXT: ld1 { v18.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #856
-; CHECK-NEXT: ld1 { v24.b }[4], [x12]
-; CHECK-NEXT: ld1 { v23.b }[6], [x10]
-; CHECK-NEXT: ld1 { v22.b }[6], [x11]
-; CHECK-NEXT: ld1 { v20.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #976
-; CHECK-NEXT: add x11, sp, #912
-; CHECK-NEXT: add x10, sp, #800
-; CHECK-NEXT: ld1 { v18.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #864
-; CHECK-NEXT: ld1 { v24.b }[5], [x11]
-; CHECK-NEXT: ld1 { v23.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #720
-; CHECK-NEXT: ld1 { v22.b }[7], [x10]
+; CHECK-NEXT: ld1 { v19.b }[3], [x10]
+; CHECK-NEXT: add x10, sp, #960
+; CHECK-NEXT: ld1 { v16.b }[1], [x9]
+; CHECK-NEXT: ld1 { v21.b }[3], [x10]
+; CHECK-NEXT: add x9, sp, #512
+; CHECK-NEXT: ld1 { v17.b }[5], [x12]
+; CHECK-NEXT: add x10, sp, #704
+; CHECK-NEXT: add x12, sp, #800
+; CHECK-NEXT: movi v18.2d, #0000000000000000
+; CHECK-NEXT: ld1 { v19.b }[4], [x9]
+; CHECK-NEXT: add x9, sp, #968
+; CHECK-NEXT: ld1 { v22.b }[4], [x10]
+; CHECK-NEXT: ld1 { v21.b }[4], [x9]
+; CHECK-NEXT: add x10, sp, #520
+; CHECK-NEXT: ld1 { v17.b }[6], [x11]
+; CHECK-NEXT: add x11, sp, #712
+; CHECK-NEXT: add x9, sp, #32
+; CHECK-NEXT: sshll v23.4s, v20.4h, #0
+; CHECK-NEXT: ld1 { v19.b }[5], [x10]
+; CHECK-NEXT: add x10, sp, #976
+; CHECK-NEXT: ld1 { v22.b }[5], [x11]
+; CHECK-NEXT: ld1 { v21.b }[5], [x10]
+; CHECK-NEXT: add x10, sp, #528
+; CHECK-NEXT: add x11, sp, #720
+; CHECK-NEXT: ld1 { v16.b }[2], [x9]
+; CHECK-NEXT: add x9, sp, #536
+; CHECK-NEXT: ld1 { v17.b }[7], [x12]
+; CHECK-NEXT: ld1 { v19.b }[6], [x10]
; CHECK-NEXT: add x10, sp, #984
-; CHECK-NEXT: ld1 { v17.b }[6], [x9]
+; CHECK-NEXT: ld1 { v22.b }[6], [x11]
+; CHECK-NEXT: ld1 { v21.b }[6], [x10]
+; CHECK-NEXT: add x10, sp, #992
+; CHECK-NEXT: add x11, sp, #728
+; CHECK-NEXT: mov v1.b[1], w1
+; CHECK-NEXT: ldr b20, [sp, #872]
+; CHECK-NEXT: mov v18.s[0], v23.s[0]
+; CHECK-NEXT: ld1 { v19.b }[7], [x9]
+; CHECK-NEXT: ld1 { v22.b }[7], [x11]
+; CHECK-NEXT: add x9, sp, #328
+; CHECK-NEXT: ld1 { v21.b }[7], [x10]
+; CHECK-NEXT: add x10, sp, #40
+; CHECK-NEXT: ldr b23, [sp, #608]
+; CHECK-NEXT: ld1 { v16.b }[3], [x10]
+; CHECK-NEXT: add x10, sp, #816
+; CHECK-NEXT: add x11, sp, #552
+; CHECK-NEXT: smull v17.8h, v19.8b, v17.8b
+; CHECK-NEXT: ld1 { v6.b }[6], [x9]
+; CHECK-NEXT: add x9, sp, #880
+; CHECK-NEXT: smull v19.8h, v22.8b, v21.8b
+; CHECK-NEXT: ldr b21, [sp, #808]
+; CHECK-NEXT: ldr b22, [sp, #544]
+; CHECK-NEXT: add x12, sp, #616
+; CHECK-NEXT: mov v1.b[2], w2
+; CHECK-NEXT: ld1 { v20.b }[1], [x9]
+; CHECK-NEXT: ld1 { v21.b }[1], [x10]
+; CHECK-NEXT: ld1 { v22.b }[1], [x11]
+; CHECK-NEXT: ld1 { v23.b }[1], [x12]
+; CHECK-NEXT: add x11, sp, #824
+; CHECK-NEXT: add x12, sp, #560
+; CHECK-NEXT: add x9, sp, #888
+; CHECK-NEXT: add x13, sp, #624
+; CHECK-NEXT: add x10, sp, #48
+; CHECK-NEXT: ld1 { v20.b }[2], [x9]
+; CHECK-NEXT: ld1 { v21.b }[2], [x11]
+; CHECK-NEXT: ld1 { v22.b }[2], [x12]
+; CHECK-NEXT: ld1 { v23.b }[2], [x13]
+; CHECK-NEXT: mov v1.b[3], w3
+; CHECK-NEXT: ld1 { v16.b }[4], [x10]
+; CHECK-NEXT: add x10, sp, #832
+; CHECK-NEXT: add x11, sp, #568
+; CHECK-NEXT: add x9, sp, #896
+; CHECK-NEXT: add x12, sp, #632
+; CHECK-NEXT: ld1 { v21.b }[3], [x10]
+; CHECK-NEXT: ld1 { v22.b }[3], [x11]
+; CHECK-NEXT: ld1 { v20.b }[3], [x9]
+; CHECK-NEXT: ld1 { v23.b }[3], [x12]
+; CHECK-NEXT: add x11, sp, #840
+; CHECK-NEXT: add x12, sp, #576
+; CHECK-NEXT: mov v1.b[4], w4
+; CHECK-NEXT: add x9, sp, #904
+; CHECK-NEXT: add x13, sp, #640
+; CHECK-NEXT: ld1 { v21.b }[4], [x11]
+; CHECK-NEXT: ld1 { v22.b }[4], [x12]
+; CHECK-NEXT: add x10, sp, #56
+; CHECK-NEXT: ld1 { v20.b }[4], [x9]
+; CHECK-NEXT: ld1 { v23.b }[4], [x13]
+; CHECK-NEXT: ld1 { v16.b }[5], [x10]
+; CHECK-NEXT: add x10, sp, #848
+; CHECK-NEXT: add x11, sp, #584
+; CHECK-NEXT: add x9, sp, #912
+; CHECK-NEXT: add x12, sp, #648
+; CHECK-NEXT: ld1 { v21.b }[5], [x10]
+; CHECK-NEXT: ld1 { v22.b }[5], [x11]
+; CHECK-NEXT: mov v1.b[5], w5
+; CHECK-NEXT: ld1 { v20.b }[5], [x9]
+; CHECK-NEXT: ld1 { v23.b }[5], [x12]
+; CHECK-NEXT: add x11, sp, #856
+; CHECK-NEXT: add x12, sp, #592
; CHECK-NEXT: add x9, sp, #920
-; CHECK-NEXT: ld1 { v18.b }[6], [x10]
-; CHECK-NEXT: ld1 { v24.b }[6], [x9]
-; CHECK-NEXT: add x10, sp, #728
-; CHECK-NEXT: add x8, sp, #664
-; CHECK-NEXT: sshll v20.8h, v20.8b, #0
-; CHECK-NEXT: sshll v22.8h, v22.8b, #0
-; CHECK-NEXT: sshll v23.8h, v23.8b, #0
-; CHECK-NEXT: add x9, sp, #992
-; CHECK-NEXT: ld1 { v17.b }[7], [x10]
-; CHECK-NEXT: add x10, sp, #928
-; CHECK-NEXT: ld1 { v18.b }[7], [x9]
+; CHECK-NEXT: add x13, sp, #656
+; CHECK-NEXT: ld1 { v21.b }[6], [x11]
+; CHECK-NEXT: ld1 { v22.b }[6], [x12]
+; CHECK-NEXT: add x10, sp, #64
+; CHECK-NEXT: ld1 { v20.b }[6], [x9]
+; CHECK-NEXT: ld1 { v23.b }[6], [x13]
+; CHECK-NEXT: mov v1.b[6], w6
+; CHECK-NEXT: ld1 { v16.b }[6], [x10]
+; CHECK-NEXT: add x10, sp, #864
+; CHECK-NEXT: add x11, sp, #600
+; CHECK-NEXT: add x9, sp, #928
+; CHECK-NEXT: add x12, sp, #664
+; CHECK-NEXT: ld1 { v21.b }[7], [x10]
+; CHECK-NEXT: ld1 { v22.b }[7], [x11]
+; CHECK-NEXT: add x8, sp, #464
+; CHECK-NEXT: ld1 { v20.b }[7], [x9]
+; CHECK-NEXT: ld1 { v23.b }[7], [x12]
; CHECK-NEXT: ld1 { v4.b }[7], [x8]
-; CHECK-NEXT: ld1 { v24.b }[7], [x10]
-; CHECK-NEXT: smlal v19.4s, v21.4h, v22.4h
-; CHECK-NEXT: smull2 v21.4s, v21.8h, v22.8h
-; CHECK-NEXT: smull v22.4s, v20.4h, v23.4h
-; CHECK-NEXT: smull2 v20.4s, v20.8h, v23.8h
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-NEXT: sshll v17.8h, v17.8b, #0
-; CHECK-NEXT: sshll v18.8h, v18.8b, #0
-; CHECK-NEXT: sshll v4.8h, v4.8b, #0
-; CHECK-NEXT: sshll v23.8h, v24.8b, #0
-; CHECK-NEXT: smlal2 v16.4s, v1.8h, v3.8h
-; CHECK-NEXT: smlal v6.4s, v1.4h, v3.4h
-; CHECK-NEXT: smlal2 v7.4s, v0.8h, v2.8h
-; CHECK-NEXT: smlal v5.4s, v0.4h, v2.4h
-; CHECK-NEXT: smlal2 v20.4s, v17.8h, v18.8h
-; CHECK-NEXT: smlal v22.4s, v17.4h, v18.4h
-; CHECK-NEXT: smlal2 v21.4s, v4.8h, v23.8h
-; CHECK-NEXT: smlal v19.4s, v4.4h, v23.4h
-; CHECK-NEXT: add v0.4s, v7.4s, v16.4s
-; CHECK-NEXT: add v1.4s, v5.4s, v6.4s
-; CHECK-NEXT: add v2.4s, v21.4s, v20.4s
-; CHECK-NEXT: add v3.4s, v19.4s, v22.4s
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v3.4s, v2.4s
+; CHECK-NEXT: add x8, sp, #200
+; CHECK-NEXT: mov v1.b[7], w7
+; CHECK-NEXT: add x10, sp, #336
+; CHECK-NEXT: ld1 { v5.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #72
+; CHECK-NEXT: smull v21.8h, v22.8b, v21.8b
+; CHECK-NEXT: movi v22.2d, #0000000000000000
+; CHECK-NEXT: ld1 { v6.b }[7], [x10]
+; CHECK-NEXT: ld1 { v16.b }[7], [x8]
+; CHECK-NEXT: smull v20.8h, v23.8b, v20.8b
+; CHECK-NEXT: sshll v7.4s, v7.4h, #0
+; CHECK-NEXT: smull v0.8h, v2.8b, v0.8b
+; CHECK-NEXT: saddw v2.4s, v18.4s, v17.4h
+; CHECK-NEXT: smull v1.8h, v1.8b, v3.8b
+; CHECK-NEXT: smull v3.8h, v5.8b, v4.8b
+; CHECK-NEXT: smull v4.8h, v16.8b, v6.8b
+; CHECK-NEXT: saddl2 v5.4s, v21.8h, v19.8h
+; CHECK-NEXT: mov v22.s[0], v7.s[0]
+; CHECK-NEXT: saddl v7.4s, v21.4h, v19.4h
+; CHECK-NEXT: saddl2 v6.4s, v17.8h, v20.8h
+; CHECK-NEXT: saddw v2.4s, v2.4s, v20.4h
+; CHECK-NEXT: saddl2 v17.4s, v1.8h, v0.8h
+; CHECK-NEXT: saddl2 v16.4s, v4.8h, v3.8h
+; CHECK-NEXT: saddl v3.4s, v4.4h, v3.4h
+; CHECK-NEXT: saddw v1.4s, v22.4s, v1.4h
+; CHECK-NEXT: add v5.4s, v6.4s, v5.4s
+; CHECK-NEXT: add v2.4s, v2.4s, v7.4s
+; CHECK-NEXT: add v6.4s, v17.4s, v16.4s
+; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT: add v1.4s, v2.4s, v5.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: add v1.4s, v6.4s, v1.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll
new file mode 100644
index 0000000..3dbc033
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll
@@ -0,0 +1,451 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple aarch64 -o - -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define <8 x i32> @extmuls_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extmuls_v8i8_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmuls_v8i8_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: smull v0.4s, v2.4h, v1.4h
+; CHECK-GI-NEXT: smull2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = sext <8 x i8> %s0 to <8 x i32>
+ %s1s = sext <8 x i8> %s1 to <8 x i32>
+ %m = mul <8 x i32> %s0s, %s1s
+ ret <8 x i32> %m
+}
+
+define <8 x i32> @extmulu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extmulu_v8i8_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmulu_v8i8_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: umull v0.4s, v2.4h, v1.4h
+; CHECK-GI-NEXT: umull2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = zext <8 x i8> %s0 to <8 x i32>
+ %s1s = zext <8 x i8> %s1 to <8 x i32>
+ %m = mul <8 x i32> %s0s, %s1s
+ ret <8 x i32> %m
+}
+
+define <8 x i32> @extmulsu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extmulsu_v8i8_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v2.8h, v1.8b, #0
+; CHECK-SD-NEXT: smull2 v1.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT: smull v0.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmulsu_v8i8_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = sext <8 x i8> %s0 to <8 x i32>
+ %s1s = zext <8 x i8> %s1 to <8 x i32>
+ %m = mul <8 x i32> %s0s, %s1s
+ ret <8 x i32> %m
+}
+
+define <8 x i32> @extmuladds_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1, <8 x i32> %b) {
+; CHECK-SD-LABEL: extmuladds_v8i8_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: saddw2 v1.4s, v3.4s, v0.8h
+; CHECK-SD-NEXT: saddw v0.4s, v2.4s, v0.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmuladds_v8i8_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: smlal v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: smlal2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = sext <8 x i8> %s0 to <8 x i32>
+ %s1s = sext <8 x i8> %s1 to <8 x i32>
+ %m = mul <8 x i32> %s0s, %s1s
+ %a = add <8 x i32> %m, %b
+ ret <8 x i32> %a
+}
+
+define <8 x i32> @extmuladdu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1, <8 x i32> %b) {
+; CHECK-SD-LABEL: extmuladdu_v8i8_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: uaddw2 v1.4s, v3.4s, v0.8h
+; CHECK-SD-NEXT: uaddw v0.4s, v2.4s, v0.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmuladdu_v8i8_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: umlal v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: umlal2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = zext <8 x i8> %s0 to <8 x i32>
+ %s1s = zext <8 x i8> %s1 to <8 x i32>
+ %m = mul <8 x i32> %s0s, %s1s
+ %a = add <8 x i32> %m, %b
+ ret <8 x i32> %a
+}
+
+define <8 x i32> @extmuladdsu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1, <8 x i32> %b) {
+; CHECK-SD-LABEL: extmuladdsu_v8i8_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: smlal2 v3.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT: smlal v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmuladdsu_v8i8_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll v5.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: mla v2.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: mla v3.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = sext <8 x i8> %s0 to <8 x i32>
+ %s1s = zext <8 x i8> %s1 to <8 x i32>
+ %m = mul <8 x i32> %s0s, %s1s
+ %a = add <8 x i32> %m, %b
+ ret <8 x i32> %a
+}
+
+
+
+define <8 x i64> @extmuls_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extmuls_v8i8_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT: sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT: sshll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmuls_v8i8_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT: smull v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT: smull2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT: smull v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT: smull2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = sext <8 x i8> %s0 to <8 x i64>
+ %s1s = sext <8 x i8> %s1 to <8 x i64>
+ %m = mul <8 x i64> %s0s, %s1s
+ ret <8 x i64> %m
+}
+
+define <8 x i64> @extmulu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extmulu_v8i8_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT: ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT: ushll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmulu_v8i8_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT: umull v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT: umull2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT: umull v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT: umull2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = zext <8 x i8> %s0 to <8 x i64>
+ %s1s = zext <8 x i8> %s1 to <8 x i64>
+ %m = mul <8 x i64> %s0s, %s1s
+ ret <8 x i64> %m
+}
+
+define <8 x i64> @extaddsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extaddsu_v8i8_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v4.4s, v1.4h, #0
+; CHECK-SD-NEXT: sshll2 v5.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll2 v6.4s, v1.8h, #0
+; CHECK-SD-NEXT: smull v0.2d, v2.2s, v4.2s
+; CHECK-SD-NEXT: smull2 v1.2d, v2.4s, v4.4s
+; CHECK-SD-NEXT: smull2 v3.2d, v5.4s, v6.4s
+; CHECK-SD-NEXT: smull v2.2d, v5.2s, v6.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extaddsu_v8i8_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT: ushll v5.2d, v3.2s, #0
+; CHECK-GI-NEXT: sshll2 v2.2d, v2.4s, #0
+; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT: sshll v6.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT: fmov x8, d4
+; CHECK-GI-NEXT: fmov x9, d5
+; CHECK-GI-NEXT: mov x12, v4.d[1]
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: fmov x11, d7
+; CHECK-GI-NEXT: mov x13, v5.d[1]
+; CHECK-GI-NEXT: fmov x14, d1
+; CHECK-GI-NEXT: mov x15, v2.d[1]
+; CHECK-GI-NEXT: mov x16, v3.d[1]
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: mov x17, v7.d[1]
+; CHECK-GI-NEXT: mov x18, v1.d[1]
+; CHECK-GI-NEXT: mul x12, x12, x13
+; CHECK-GI-NEXT: mov x13, v0.d[1]
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov x10, d6
+; CHECK-GI-NEXT: mul x15, x15, x16
+; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov x11, d0
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: mul x13, x13, x18
+; CHECK-GI-NEXT: mov v0.d[1], x12
+; CHECK-GI-NEXT: mul x11, x11, x14
+; CHECK-GI-NEXT: mov x14, v6.d[1]
+; CHECK-GI-NEXT: mov v1.d[1], x15
+; CHECK-GI-NEXT: fmov d2, x10
+; CHECK-GI-NEXT: mul x14, x14, x17
+; CHECK-GI-NEXT: fmov d3, x11
+; CHECK-GI-NEXT: mov v3.d[1], x13
+; CHECK-GI-NEXT: mov v2.d[1], x14
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = sext <8 x i8> %s0 to <8 x i64>
+ %s1s = zext <8 x i8> %s1 to <8 x i64>
+ %m = mul <8 x i64> %s0s, %s1s
+ ret <8 x i64> %m
+}
+
+define <8 x i64> @extmuladds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b) {
+; CHECK-SD-LABEL: extmuladds_v8i8_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: sshll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT: saddw2 v5.2d, v5.2d, v6.4s
+; CHECK-SD-NEXT: saddw v0.2d, v2.2d, v1.2s
+; CHECK-SD-NEXT: saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-SD-NEXT: saddw v2.2d, v4.2d, v6.2s
+; CHECK-SD-NEXT: mov v3.16b, v5.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmuladds_v8i8_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: smlal v2.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT: smlal2 v3.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT: smlal v4.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: smlal2 v5.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v1.16b, v3.16b
+; CHECK-GI-NEXT: mov v2.16b, v4.16b
+; CHECK-GI-NEXT: mov v3.16b, v5.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = sext <8 x i8> %s0 to <8 x i64>
+ %s1s = sext <8 x i8> %s1 to <8 x i64>
+ %m = mul <8 x i64> %s0s, %s1s
+ %a = add <8 x i64> %m, %b
+ ret <8 x i64> %a
+}
+
+define <8 x i64> @extmuladdu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b) {
+; CHECK-SD-LABEL: extmuladdu_v8i8_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: ushll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT: uaddw2 v5.2d, v5.2d, v6.4s
+; CHECK-SD-NEXT: uaddw v0.2d, v2.2d, v1.2s
+; CHECK-SD-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-SD-NEXT: uaddw v2.2d, v4.2d, v6.2s
+; CHECK-SD-NEXT: mov v3.16b, v5.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmuladdu_v8i8_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: umlal v2.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT: umlal2 v3.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT: umlal v4.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: umlal2 v5.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v1.16b, v3.16b
+; CHECK-GI-NEXT: mov v2.16b, v4.16b
+; CHECK-GI-NEXT: mov v3.16b, v5.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = zext <8 x i8> %s0 to <8 x i64>
+ %s1s = zext <8 x i8> %s1 to <8 x i64>
+ %m = mul <8 x i64> %s0s, %s1s
+ %a = add <8 x i64> %m, %b
+ ret <8 x i64> %a
+}
+
+define <8 x i64> @extmuladdsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b) {
+; CHECK-SD-LABEL: extmuladdsu_v8i8_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: sshll v6.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v7.4s, v1.4h, #0
+; CHECK-SD-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-SD-NEXT: smlal v2.2d, v6.2s, v7.2s
+; CHECK-SD-NEXT: smlal2 v3.2d, v6.4s, v7.4s
+; CHECK-SD-NEXT: smlal2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT: smlal v4.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: mov v2.16b, v4.16b
+; CHECK-SD-NEXT: mov v3.16b, v5.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extmuladdsu_v8i8_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: sshll v16.2d, v6.2s, #0
+; CHECK-GI-NEXT: ushll v17.2d, v7.2s, #0
+; CHECK-GI-NEXT: sshll2 v6.2d, v6.4s, #0
+; CHECK-GI-NEXT: ushll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT: sshll v18.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll v19.2d, v1.2s, #0
+; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT: fmov x8, d16
+; CHECK-GI-NEXT: fmov x9, d17
+; CHECK-GI-NEXT: mov x12, v16.d[1]
+; CHECK-GI-NEXT: fmov x10, d7
+; CHECK-GI-NEXT: fmov x11, d19
+; CHECK-GI-NEXT: mov x13, v17.d[1]
+; CHECK-GI-NEXT: fmov x14, d1
+; CHECK-GI-NEXT: mov x15, v6.d[1]
+; CHECK-GI-NEXT: mov x16, v7.d[1]
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d6
+; CHECK-GI-NEXT: mov x17, v19.d[1]
+; CHECK-GI-NEXT: mov x18, v1.d[1]
+; CHECK-GI-NEXT: mul x12, x12, x13
+; CHECK-GI-NEXT: mov x13, v0.d[1]
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov x10, d18
+; CHECK-GI-NEXT: mul x15, x15, x16
+; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov x11, d0
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: mul x13, x13, x18
+; CHECK-GI-NEXT: mov v0.d[1], x12
+; CHECK-GI-NEXT: mul x11, x11, x14
+; CHECK-GI-NEXT: mov x14, v18.d[1]
+; CHECK-GI-NEXT: mov v1.d[1], x15
+; CHECK-GI-NEXT: fmov d6, x10
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: mul x14, x14, x17
+; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: fmov d7, x11
+; CHECK-GI-NEXT: mov v7.d[1], x13
+; CHECK-GI-NEXT: mov v6.d[1], x14
+; CHECK-GI-NEXT: add v3.2d, v7.2d, v5.2d
+; CHECK-GI-NEXT: add v2.2d, v6.2d, v4.2d
+; CHECK-GI-NEXT: ret
+entry:
+ %s0s = sext <8 x i8> %s0 to <8 x i64>
+ %s1s = zext <8 x i8> %s1 to <8 x i64>
+ %m = mul <8 x i64> %s0s, %s1s
+ %a = add <8 x i64> %m, %b
+ ret <8 x i64> %a
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
index ce0c557..7d23e87 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
@@ -15,7 +15,7 @@ declare <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
;.
-; CHECK: @llvm.compiler.used = appending global [40 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vtanq_f64, ptr @armpl_vtanq_f32, ptr @armpl_svtan_f64_x, ptr @armpl_svtan_f32_x, ptr @armpl_vfmodq_f64, ptr @armpl_vfmodq_f32, ptr @armpl_svfmod_f64_x, ptr @armpl_svfmod_f32_x], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [36 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vtanq_f64, ptr @armpl_vtanq_f32, ptr @armpl_svtan_f64_x, ptr @armpl_svtan_f32_x], section "llvm.metadata"
;.
define <2 x double> @llvm_cos_f64(<2 x double> %in) {
; CHECK-LABEL: define <2 x double> @llvm_cos_f64
@@ -469,46 +469,6 @@ define <vscale x 4 x float> @llvm_tan_vscale_f32(<vscale x 4 x float> %in) #0 {
ret <vscale x 4 x float> %1
}
-define <2 x double> @frem_f64(<2 x double> %in) {
-; CHECK-LABEL: define <2 x double> @frem_f64
-; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @armpl_vfmodq_f64(<2 x double> [[IN]], <2 x double> [[IN]])
-; CHECK-NEXT: ret <2 x double> [[TMP1]]
-;
- %1= frem <2 x double> %in, %in
- ret <2 x double> %1
-}
-
-define <4 x float> @frem_f32(<4 x float> %in) {
-; CHECK-LABEL: define <4 x float> @frem_f32
-; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @armpl_vfmodq_f32(<4 x float> [[IN]], <4 x float> [[IN]])
-; CHECK-NEXT: ret <4 x float> [[TMP1]]
-;
- %1= frem <4 x float> %in, %in
- ret <4 x float> %1
-}
-
-define <vscale x 2 x double> @frem_vscale_f64(<vscale x 2 x double> %in) #0 {
-; CHECK-LABEL: define <vscale x 2 x double> @frem_vscale_f64
-; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @armpl_svfmod_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
-;
- %1= frem <vscale x 2 x double> %in, %in
- ret <vscale x 2 x double> %1
-}
-
-define <vscale x 4 x float> @frem_vscale_f32(<vscale x 4 x float> %in) #0 {
-; CHECK-LABEL: define <vscale x 4 x float> @frem_vscale_f32
-; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x float> @armpl_svfmod_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
-;
- %1= frem <vscale x 4 x float> %in, %in
- ret <vscale x 4 x float> %1
-}
-
attributes #0 = { "target-features"="+sve" }
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
index c58a9bc..15d100a 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef-scalable.ll
@@ -4,7 +4,7 @@
target triple = "aarch64-unknown-linux-gnu"
;.
-; CHECK: @llvm.compiler.used = appending global [20 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxv_tan, ptr @_ZGVsMxv_tanf, ptr @_ZGVsMxvv_fmod, ptr @_ZGVsMxvv_fmodf], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxv_tan, ptr @_ZGVsMxv_tanf], section "llvm.metadata"
;.
define <vscale x 2 x double> @llvm_ceil_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_ceil_vscale_f64(
@@ -403,24 +403,6 @@ define <vscale x 4 x float> @llvm_trunc_vscale_f32(<vscale x 4 x float> %in) {
ret <vscale x 4 x float> %1
}
-define <vscale x 2 x double> @frem_f64(<vscale x 2 x double> %in) {
-; CHECK-LABEL: @frem_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_fmod(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
-;
- %1= frem <vscale x 2 x double> %in, %in
- ret <vscale x 2 x double> %1
-}
-
-define <vscale x 4 x float> @frem_f32(<vscale x 4 x float> %in) {
-; CHECK-LABEL: @frem_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_fmodf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
-;
- %1= frem <vscale x 4 x float> %in, %in
- ret <vscale x 4 x float> %1
-}
-
declare <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float>)
declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
index 1df2cae..a3da3b8 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-sleef.ll
@@ -4,7 +4,7 @@
target triple = "aarch64-unknown-linux-gnu"
;.
-; CHECK: @llvm.compiler.used = appending global [20 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2v_tan, ptr @_ZGVnN4v_tanf, ptr @_ZGVnN2vv_fmod, ptr @_ZGVnN4vv_fmodf], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [18 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2v_tan, ptr @_ZGVnN4v_tanf], section "llvm.metadata"
;.
define <2 x double> @llvm_ceil_f64(<2 x double> %in) {
; CHECK-LABEL: @llvm_ceil_f64(
@@ -402,24 +402,6 @@ define <4 x float> @llvm_trunc_f32(<4 x float> %in) {
ret <4 x float> %1
}
-define <2 x double> @frem_f64(<2 x double> %in) {
-; CHECK-LABEL: @frem_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @_ZGVnN2vv_fmod(<2 x double> [[IN:%.*]], <2 x double> [[IN]])
-; CHECK-NEXT: ret <2 x double> [[TMP1]]
-;
- %1= frem <2 x double> %in, %in
- ret <2 x double> %1
-}
-
-define <4 x float> @frem_f32(<4 x float> %in) {
-; CHECK-LABEL: @frem_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @_ZGVnN4vv_fmodf(<4 x float> [[IN:%.*]], <4 x float> [[IN]])
-; CHECK-NEXT: ret <4 x float> [[TMP1]]
-;
- %1= frem <4 x float> %in, %in
- ret <4 x float> %1
-}
-
declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index c9fe89a..c81fd26 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1925,11 +1925,8 @@ entry:
define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v8i8:
; CHECK-SD-BASE: // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT: umull v2.4s, v1.4h, v0.4h
-; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-SD-BASE-NEXT: addv s0, v2.4s
+; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
@@ -1969,15 +1966,11 @@ entry:
define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT: umull v4.4s, v3.4h, v2.4h
-; CHECK-SD-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
-; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-SD-BASE-NEXT: umlal v4.4s, v1.4h, v0.4h
-; CHECK-SD-BASE-NEXT: add v0.4s, v4.4s, v2.4s
+; CHECK-SD-BASE-NEXT: umull2 v2.8h, v1.16b, v0.16b
+; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-SD-BASE-NEXT: uaddl2 v1.4s, v0.8h, v2.8h
+; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h
+; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
@@ -2025,21 +2018,16 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldr q0, [x0]
; CHECK-SD-BASE-NEXT: ldr q1, [x1]
-; CHECK-SD-BASE-NEXT: ldr d4, [x0, #16]
-; CHECK-SD-BASE-NEXT: ldr d5, [x1, #16]
-; CHECK-SD-BASE-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT: umull v6.4s, v3.4h, v2.4h
-; CHECK-SD-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
-; CHECK-SD-BASE-NEXT: ushll v3.8h, v4.8b, #0
-; CHECK-SD-BASE-NEXT: ushll v4.8h, v5.8b, #0
-; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v4.8h, v3.8h
-; CHECK-SD-BASE-NEXT: umlal v6.4s, v4.4h, v3.4h
-; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-SD-BASE-NEXT: umlal v6.4s, v1.4h, v0.4h
-; CHECK-SD-BASE-NEXT: add v0.4s, v6.4s, v2.4s
+; CHECK-SD-BASE-NEXT: ldr d2, [x0, #16]
+; CHECK-SD-BASE-NEXT: ldr d3, [x1, #16]
+; CHECK-SD-BASE-NEXT: umull v2.8h, v3.8b, v2.8b
+; CHECK-SD-BASE-NEXT: umull v3.8h, v1.8b, v0.8b
+; CHECK-SD-BASE-NEXT: umull2 v0.8h, v1.16b, v0.16b
+; CHECK-SD-BASE-NEXT: uaddl2 v1.4s, v3.8h, v2.8h
+; CHECK-SD-BASE-NEXT: uaddl v2.4s, v3.4h, v2.4h
+; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT: uaddw v0.4s, v2.4s, v0.4h
+; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
@@ -2125,37 +2113,27 @@ entry:
define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_udot_v48i8:
; CHECK-SD-BASE: // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT: ldp q0, q4, [x1]
-; CHECK-SD-BASE-NEXT: ldr q2, [x0, #32]
-; CHECK-SD-BASE-NEXT: ldp q1, q3, [x0]
-; CHECK-SD-BASE-NEXT: ldr q7, [x1, #32]
-; CHECK-SD-BASE-NEXT: ushll2 v16.8h, v2.16b, #0
-; CHECK-SD-BASE-NEXT: ushll2 v6.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT: ushll2 v17.8h, v7.16b, #0
-; CHECK-SD-BASE-NEXT: ushll2 v5.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT: umull2 v18.4s, v6.8h, v5.8h
-; CHECK-SD-BASE-NEXT: umull v19.4s, v0.4h, v1.4h
-; CHECK-SD-BASE-NEXT: umull v5.4s, v6.4h, v5.4h
-; CHECK-SD-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h
-; CHECK-SD-BASE-NEXT: ushll v1.8h, v2.8b, #0
-; CHECK-SD-BASE-NEXT: ushll v2.8h, v7.8b, #0
-; CHECK-SD-BASE-NEXT: ushll2 v6.8h, v3.16b, #0
-; CHECK-SD-BASE-NEXT: ushll2 v7.8h, v4.16b, #0
-; CHECK-SD-BASE-NEXT: umlal2 v18.4s, v17.8h, v16.8h
-; CHECK-SD-BASE-NEXT: umlal v5.4s, v17.4h, v16.4h
-; CHECK-SD-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h
-; CHECK-SD-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-SD-BASE-NEXT: ushll v1.8h, v3.8b, #0
-; CHECK-SD-BASE-NEXT: ushll v2.8h, v4.8b, #0
-; CHECK-SD-BASE-NEXT: umlal2 v18.4s, v7.8h, v6.8h
-; CHECK-SD-BASE-NEXT: umlal v5.4s, v7.4h, v6.4h
-; CHECK-SD-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h
-; CHECK-SD-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-SD-BASE-NEXT: add v1.4s, v19.4s, v5.4s
-; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v18.4s
-; CHECK-SD-BASE-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-BASE-NEXT: ldp q4, q0, [x0, #16]
+; CHECK-SD-BASE-NEXT: ldr q2, [x1, #32]
+; CHECK-SD-BASE-NEXT: ldp q1, q5, [x1]
+; CHECK-SD-BASE-NEXT: ldr q3, [x0]
+; CHECK-SD-BASE-NEXT: umull2 v6.8h, v2.16b, v0.16b
+; CHECK-SD-BASE-NEXT: umull v0.8h, v2.8b, v0.8b
+; CHECK-SD-BASE-NEXT: umull2 v7.8h, v1.16b, v3.16b
+; CHECK-SD-BASE-NEXT: umull v1.8h, v1.8b, v3.8b
+; CHECK-SD-BASE-NEXT: umull2 v2.8h, v5.16b, v4.16b
+; CHECK-SD-BASE-NEXT: umull v3.8h, v5.8b, v4.8b
+; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v7.8h, v6.8h
+; CHECK-SD-BASE-NEXT: uaddl2 v5.4s, v1.8h, v0.8h
+; CHECK-SD-BASE-NEXT: uaddl v6.4s, v7.4h, v6.4h
+; CHECK-SD-BASE-NEXT: uaddl v0.4s, v1.4h, v0.4h
+; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v4.4s, v2.8h
+; CHECK-SD-BASE-NEXT: uaddw2 v4.4s, v5.4s, v3.8h
+; CHECK-SD-BASE-NEXT: uaddw v2.4s, v6.4s, v2.4h
+; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v3.4h
+; CHECK-SD-BASE-NEXT: add v1.4s, v4.4s, v1.4s
+; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
@@ -2275,11 +2253,8 @@ entry:
define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_sdot_v8i8:
; CHECK-SD-BASE: // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT: smull v2.4s, v1.4h, v0.4h
-; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
-; CHECK-SD-BASE-NEXT: addv s0, v2.4s
+; CHECK-SD-BASE-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
@@ -2319,15 +2294,11 @@ entry:
define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_sdot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT: sshll v2.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT: sshll v3.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT: smull v4.4s, v3.4h, v2.4h
-; CHECK-SD-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
-; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
-; CHECK-SD-BASE-NEXT: smlal v4.4s, v1.4h, v0.4h
-; CHECK-SD-BASE-NEXT: add v0.4s, v4.4s, v2.4s
+; CHECK-SD-BASE-NEXT: smull2 v2.8h, v1.16b, v0.16b
+; CHECK-SD-BASE-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-SD-BASE-NEXT: saddl2 v1.4s, v0.8h, v2.8h
+; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h
+; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
@@ -2375,21 +2346,16 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldr q0, [x0]
; CHECK-SD-BASE-NEXT: ldr q1, [x1]
-; CHECK-SD-BASE-NEXT: ldr d4, [x0, #16]
-; CHECK-SD-BASE-NEXT: ldr d5, [x1, #16]
-; CHECK-SD-BASE-NEXT: sshll v2.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT: sshll v3.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT: smull v6.4s, v3.4h, v2.4h
-; CHECK-SD-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
-; CHECK-SD-BASE-NEXT: sshll v3.8h, v4.8b, #0
-; CHECK-SD-BASE-NEXT: sshll v4.8h, v5.8b, #0
-; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v4.8h, v3.8h
-; CHECK-SD-BASE-NEXT: smlal v6.4s, v4.4h, v3.4h
-; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
-; CHECK-SD-BASE-NEXT: smlal v6.4s, v1.4h, v0.4h
-; CHECK-SD-BASE-NEXT: add v0.4s, v6.4s, v2.4s
+; CHECK-SD-BASE-NEXT: ldr d2, [x0, #16]
+; CHECK-SD-BASE-NEXT: ldr d3, [x1, #16]
+; CHECK-SD-BASE-NEXT: smull v2.8h, v3.8b, v2.8b
+; CHECK-SD-BASE-NEXT: smull v3.8h, v1.8b, v0.8b
+; CHECK-SD-BASE-NEXT: smull2 v0.8h, v1.16b, v0.16b
+; CHECK-SD-BASE-NEXT: saddl2 v1.4s, v3.8h, v2.8h
+; CHECK-SD-BASE-NEXT: saddl v2.4s, v3.4h, v2.4h
+; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT: saddw v0.4s, v2.4s, v0.4h
+; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
@@ -2475,37 +2441,27 @@ entry:
define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_sdot_v48i8:
; CHECK-SD-BASE: // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT: ldp q0, q4, [x1]
-; CHECK-SD-BASE-NEXT: ldr q2, [x0, #32]
-; CHECK-SD-BASE-NEXT: ldp q1, q3, [x0]
-; CHECK-SD-BASE-NEXT: ldr q7, [x1, #32]
-; CHECK-SD-BASE-NEXT: sshll2 v16.8h, v2.16b, #0
-; CHECK-SD-BASE-NEXT: sshll2 v6.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT: sshll2 v17.8h, v7.16b, #0
-; CHECK-SD-BASE-NEXT: sshll2 v5.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT: smull2 v18.4s, v6.8h, v5.8h
-; CHECK-SD-BASE-NEXT: smull v19.4s, v0.4h, v1.4h
-; CHECK-SD-BASE-NEXT: smull v5.4s, v6.4h, v5.4h
-; CHECK-SD-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-SD-BASE-NEXT: sshll v1.8h, v2.8b, #0
-; CHECK-SD-BASE-NEXT: sshll v2.8h, v7.8b, #0
-; CHECK-SD-BASE-NEXT: sshll2 v6.8h, v3.16b, #0
-; CHECK-SD-BASE-NEXT: sshll2 v7.8h, v4.16b, #0
-; CHECK-SD-BASE-NEXT: smlal2 v18.4s, v17.8h, v16.8h
-; CHECK-SD-BASE-NEXT: smlal v5.4s, v17.4h, v16.4h
-; CHECK-SD-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h
-; CHECK-SD-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-SD-BASE-NEXT: sshll v1.8h, v3.8b, #0
-; CHECK-SD-BASE-NEXT: sshll v2.8h, v4.8b, #0
-; CHECK-SD-BASE-NEXT: smlal2 v18.4s, v7.8h, v6.8h
-; CHECK-SD-BASE-NEXT: smlal v5.4s, v7.4h, v6.4h
-; CHECK-SD-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h
-; CHECK-SD-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-SD-BASE-NEXT: add v1.4s, v19.4s, v5.4s
-; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v18.4s
-; CHECK-SD-BASE-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-BASE-NEXT: ldp q4, q0, [x0, #16]
+; CHECK-SD-BASE-NEXT: ldr q2, [x1, #32]
+; CHECK-SD-BASE-NEXT: ldp q1, q5, [x1]
+; CHECK-SD-BASE-NEXT: ldr q3, [x0]
+; CHECK-SD-BASE-NEXT: smull2 v6.8h, v2.16b, v0.16b
+; CHECK-SD-BASE-NEXT: smull v0.8h, v2.8b, v0.8b
+; CHECK-SD-BASE-NEXT: smull2 v7.8h, v1.16b, v3.16b
+; CHECK-SD-BASE-NEXT: smull v1.8h, v1.8b, v3.8b
+; CHECK-SD-BASE-NEXT: smull2 v2.8h, v5.16b, v4.16b
+; CHECK-SD-BASE-NEXT: smull v3.8h, v5.8b, v4.8b
+; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v7.8h, v6.8h
+; CHECK-SD-BASE-NEXT: saddl2 v5.4s, v1.8h, v0.8h
+; CHECK-SD-BASE-NEXT: saddl v6.4s, v7.4h, v6.4h
+; CHECK-SD-BASE-NEXT: saddl v0.4s, v1.4h, v0.4h
+; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v4.4s, v2.8h
+; CHECK-SD-BASE-NEXT: saddw2 v4.4s, v5.4s, v3.8h
+; CHECK-SD-BASE-NEXT: saddw v2.4s, v6.4s, v2.4h
+; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v3.4h
+; CHECK-SD-BASE-NEXT: add v1.4s, v4.4s, v1.4s
+; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
@@ -2626,26 +2582,22 @@ entry:
define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v8i8_multi_use:
; CHECK-SD-BASE: // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT: umull v2.4s, v1.4h, v0.4h
-; CHECK-SD-BASE-NEXT: mov v3.16b, v2.16b
-; CHECK-SD-BASE-NEXT: fmov w8, s2
-; CHECK-SD-BASE-NEXT: umlal2 v3.4s, v1.8h, v0.8h
-; CHECK-SD-BASE-NEXT: addv s0, v3.4s
+; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-SD-BASE-NEXT: uaddlv s1, v0.8h
+; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: fmov w9, s0
-; CHECK-SD-BASE-NEXT: add w0, w9, w8
+; CHECK-SD-BASE-NEXT: fmov w8, s1
+; CHECK-SD-BASE-NEXT: add w0, w8, w9
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT: ushll v3.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT: ushll v4.8h, v1.8b, #0
+; CHECK-SD-DOT-NEXT: umull v3.8h, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
-; CHECK-SD-DOT-NEXT: umull v0.4s, v4.4h, v3.4h
-; CHECK-SD-DOT-NEXT: addp v1.2s, v2.2s, v2.2s
+; CHECK-SD-DOT-NEXT: ushll v0.4s, v3.4h, #0
; CHECK-SD-DOT-NEXT: fmov w9, s0
+; CHECK-SD-DOT-NEXT: addp v1.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: add w0, w8, w9
; CHECK-SD-DOT-NEXT: ret
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index d6841d4..bd6155890 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -52,9 +52,9 @@
; HSAOPT: [[LDZU:%[0-9]+]] = load i32, ptr addrspace(4) [[GEP1]], align 4, !range !2, !invariant.load !1
; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16
-; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !3
-; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !3
-; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !3
+; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]]
; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]]
@@ -68,11 +68,11 @@
; HSAOPT: %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(3) [[LOCAL_GEP]], i32 0, i32 1
-; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !1
-; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !1
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !2
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !2
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !2
+; NOHSAOPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.y()
+; NOHSAOPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.z()
+; NOHSAOPT: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; NOHSAOPT: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; NOHSAOPT: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
define amdgpu_kernel void @mova_same_clause(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
@@ -533,8 +533,3 @@ attributes #1 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
!99 = !{i32 1, !"amdhsa_code_object_version", i32 400}
; HSAOPT: !1 = !{}
-; HSAOPT: !2 = !{i32 0, i32 257}
-; HSAOPT: !3 = !{i32 0, i32 256}
-
-; NOHSAOPT: !1 = !{i32 0, i32 257}
-; NOHSAOPT: !2 = !{i32 0, i32 256}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
index a62ea8f..84ea2be 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
@@ -22,75 +22,37 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, killed [[COPY4]], %subreg.sub3
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 64
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 128
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 72
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 144
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 80
@@ -108,6 +70,7 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[COPY11]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
+ ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 88
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
@@ -124,94 +87,95 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 96
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 192
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN2]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET8]], [[REG_SEQUENCE2]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE2]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 104
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 208
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN2]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET8]], [[REG_SEQUENCE2]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE2]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 112
; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 224
; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1
; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE2]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE2]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE2]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN12:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN13:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE2]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 120
; GCN-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 240
; GCN-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE2]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE2]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY37:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE2]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN12:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN13:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE2]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
@@ -269,113 +233,63 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY58:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 288
; GCN-NEXT: [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY61:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY62:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY63:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY63]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[COPY63]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY64:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY64]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY64]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[COPY65]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY65]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY66:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY67:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY66]], [[REG_SEQUENCE2]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN12]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN13]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY66]], [[REG_SEQUENCE2]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY68:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 152
; GCN-NEXT: [[COPY69:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sreg_32 = S_MOV_B32 304
; GCN-NEXT: [[COPY70:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY71:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY72:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY73:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY73]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[COPY73]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY74:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY74]], [[REG_SEQUENCE2]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY74]], [[REG_SEQUENCE2]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY75:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[COPY75]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY75]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY76:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY77:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY76]], [[REG_SEQUENCE2]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN12]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN13]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY76]], [[REG_SEQUENCE2]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: S_ENDPGM 0
bb.0:
%tmp0 = load <4 x i32>, ptr addrspace(6) %arg0, align 16, !invariant.load !0
%tmp1 = load ptr addrspace(8), ptr addrspace(6) %arg0, align 16, !invariant.load !0
- %buffer0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 16, i1 false, i1 false) #0
- %buffer1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0
- %buffer2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 1, i32 16, i1 false, i1 false) #0
- %buffer3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 16, i1 false, i1 false) #0
-
- ; Insert inline asm to keep the different instruction types from being mixed. This makes the output easier to read.
- call void asm sideeffect "", "" ()
-
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer0, <4 x i32> %tmp0, i32 0, i32 32, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer2, <4 x i32> %tmp0, i32 1, i32 32, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer3, <4 x i32> %tmp0, i32 %arg1, i32 32, i1 false, i1 false) #1
-
- call void asm sideeffect "", "" ()
-
- %buffer_format0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 48, i1 false, i1 false) #0
- %buffer_format1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0
- %buffer_format2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 1, i32 48, i1 false, i1 false) #0
- %buffer_format3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 48, i1 false, i1 false) #0
-
- call void asm sideeffect "", "" ()
-
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format0, <4 x i32> %tmp0, i32 0, i32 64, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format2, <4 x i32> %tmp0, i32 1, i32 64, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format3, <4 x i32> %tmp0, i32 %arg1, i32 64, i1 false, i1 false) #1
-
- call void asm sideeffect "", "" ()
-
- %atomic_add0 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 80, i1 false) #2
- %atomic_add1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
- %atomic_add2 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 1, i32 80, i1 false) #2
- %atomic_add3 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 80, i1 false) #2
-
- call void asm sideeffect "", "" ()
-
- %atomic_cmpswap0 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 96, i1 false) #2
- %atomic_cmpswap1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
- %atomic_cmpswap2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 1, i32 96, i1 false) #2
- %atomic_cmpswap3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 96, i1 false) #2
-
- call void asm sideeffect "", "" ()
-
- %fadd1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2
- %fadd2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
- %fadd3 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2
- %fadd4 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2
-
- call void asm sideeffect "", "" ()
- ; rsrc, offset, soffset, cachepolicy
%raw_buffer0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 128, i32 0, i32 0) #0
%raw_buffer1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 64, i32 64, i32 0) #0
%raw_buffer2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 128, i32 0) #0
@@ -384,7 +298,6 @@ bb.0:
call void asm sideeffect "", "" ()
- ; rsrc, offset, soffset, cachepolicy
%raw_ptr_buffer0 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %tmp1, i32 128, i32 0, i32 0) #3
%raw_ptr_buffer1 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %tmp1, i32 64, i32 64, i32 0) #3
%raw_ptr_buffer2 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %tmp1, i32 0, i32 128, i32 0) #3
@@ -473,7 +386,6 @@ bb.0:
call void asm sideeffect "", "" ()
- ; rsrc, vindex, offset, soffset, cachepolicy
%struct_buffer0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 224, i32 0, i32 0) #0
%struct_buffer1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 112, i32 112, i32 0) #0
%struct_buffer2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 0, i32 224, i32 0) #0
@@ -593,13 +505,6 @@ bb.0:
ret void
}
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
-declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #2
-declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #2
@@ -630,7 +535,6 @@ declare void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float>, ptr
attributes #0 = { nounwind readonly }
-attributes #1 = { nounwind writeonly }
attributes #2 = { nounwind }
attributes #3 = { nounwind memory(argmem: read) }
attributes #4 = { nounwind memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
index bdc73e5..7278639 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
@@ -4,46 +4,6 @@
; The buffer_loads and buffer_stores all access the same location. Check they do
; not get reordered by the scheduler.
-; GCN-LABEL: {{^}}_amdgpu_cs_main:
-; GCN: buffer_load_dword
-; GCN: buffer_store_dword
-; GCN: buffer_load_dword
-; GCN: buffer_store_dword
-; GCN: buffer_load_dword
-; GCN: buffer_store_dword
-; GCN: buffer_load_dword
-; GCN: buffer_store_dword
-
-; Function Attrs: nounwind
-define amdgpu_cs void @_amdgpu_cs_main(<3 x i32> inreg %arg3, <3 x i32> %arg5) {
-.entry:
- %tmp9 = add <3 x i32> %arg3, %arg5
- %tmp10 = extractelement <3 x i32> %tmp9, i32 0
- %tmp11 = shl i32 %tmp10, 2
- %tmp12 = inttoptr i64 undef to ptr addrspace(4)
- %tmp13 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp14 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp13, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp17 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- call void @llvm.amdgcn.buffer.store.f32(float %tmp14, <4 x i32> %tmp17, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp20 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp21 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp22 = fadd reassoc nnan arcp contract float %tmp21, 1.000000e+00
- call void @llvm.amdgcn.buffer.store.f32(float %tmp22, <4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp25 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp26 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp27 = fadd reassoc nnan arcp contract float %tmp26, 1.000000e+00
- call void @llvm.amdgcn.buffer.store.f32(float %tmp27, <4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp30 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp31 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp32 = fadd reassoc nnan arcp contract float %tmp31, 1.000000e+00
- call void @llvm.amdgcn.buffer.store.f32(float %tmp32, <4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp35 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp36 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp37 = fadd reassoc nnan arcp contract float %tmp36, 1.000000e+00
- call void @llvm.amdgcn.buffer.store.f32(float %tmp37, <4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0
- ret void
-}
-
; GCN-LABEL: {{^}}test1:
; GCN: buffer_store_dword
; GCN: buffer_load_dword
@@ -84,10 +44,6 @@ define amdgpu_cs void @test1_ptrs_reorderable(ptr addrspace(8) inreg %buf, i32 %
}
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
-
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #3
-
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #3
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 0a0179e..84bd9b6 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -1489,9 +1489,8 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64
-; SI-NEXT: v_cvt_f32_f16_e64 v3, s6 clamp
+; SI-NEXT: v_cvt_f16_f32_e32 v3, 0
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll
deleted file mode 100644
index 8ff78aa..0000000
--- a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=tahiti -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=hawaii -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=fiji -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-
-; Make sure selection of these intrinsics fails on targets that do not
-; have the instruction available.
-; FIXME: Should also really make sure the v2f16 version fails.
-
-; FAIL: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD
-define amdgpu_cs void @atomic_fadd(<4 x i32> inreg %arg0) {
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false)
- ret void
-}
-
-declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 7c5f6d5..301299d 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -372,52 +372,6 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
}
define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: movq %rax, %xmm1
-; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-SSE-NEXT: addsd %xmm1, %xmm0
-; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -462,52 +416,6 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
}
define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt2:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: movl $2, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: movq %rax, %xmm1
-; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-SSE-NEXT: addsd %xmm1, %xmm0
-; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt2:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: movl $2, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt2:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: movl $2, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt2:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $2, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -552,51 +460,6 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
}
define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_select:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: andl $1, %esi
-; CHECK-SSE-NEXT: movl $2, %eax
-; CHECK-SSE-NEXT: subl %esi, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
-; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_select:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: andl $1, %esi
-; CHECK-AVX2-NEXT: movl $2, %eax
-; CHECK-AVX2-NEXT: subl %esi, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_select:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: andl $1, %esi
-; CHECK-NO-FASTFMA-NEXT: movl $2, %eax
-; CHECK-NO-FASTFMA-NEXT: subl %esi, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_select:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: andl $1, %esi
-; CHECK-FMA-NEXT: movl $2, %eax
-; CHECK-FMA-NEXT: subl %esi, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_select:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -640,55 +503,6 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
}
define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_fly_pow_mul_min_pow2:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: movl $8, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: cmpq $8192, %rax # imm = 0x2000
-; CHECK-SSE-NEXT: movl $8192, %ecx # imm = 0x2000
-; CHECK-SSE-NEXT: cmovbq %rax, %rcx
-; CHECK-SSE-NEXT: cvtsi2ss %rcx, %xmm0
-; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_fly_pow_mul_min_pow2:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: movl $8, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: cmpq $8192, %rax # imm = 0x2000
-; CHECK-AVX2-NEXT: movl $8192, %ecx # imm = 0x2000
-; CHECK-AVX2-NEXT: cmovbq %rax, %rcx
-; CHECK-AVX2-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_fly_pow_mul_min_pow2:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: cmpq $8192, %rax # imm = 0x2000
-; CHECK-NO-FASTFMA-NEXT: movl $8192, %ecx # imm = 0x2000
-; CHECK-NO-FASTFMA-NEXT: cmovbq %rax, %rcx
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_fly_pow_mul_min_pow2:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $8, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: cmpq $8192, %rax # imm = 0x2000
-; CHECK-FMA-NEXT: movl $8192, %ecx # imm = 0x2000
-; CHECK-FMA-NEXT: cmovbq %rax, %rcx
-; CHECK-FMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_fly_pow_mul_min_pow2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -757,63 +571,6 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
}
define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_mul_max_pow2:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: movl $2, %eax
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: movl $1, %edx
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %edx
-; CHECK-SSE-NEXT: cmpw %ax, %dx
-; CHECK-SSE-NEXT: cmovbel %eax, %edx
-; CHECK-SSE-NEXT: movzwl %dx, %eax
-; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0
-; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_mul_max_pow2:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: movl $2, %eax
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: movl $1, %edx
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %edx
-; CHECK-AVX2-NEXT: cmpw %ax, %dx
-; CHECK-AVX2-NEXT: cmovbel %eax, %edx
-; CHECK-AVX2-NEXT: movzwl %dx, %eax
-; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_mul_max_pow2:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: movl $2, %eax
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: movl $1, %edx
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %edx
-; CHECK-NO-FASTFMA-NEXT: cmpw %ax, %dx
-; CHECK-NO-FASTFMA-NEXT: cmovbel %eax, %edx
-; CHECK-NO-FASTFMA-NEXT: movzwl %dx, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_mul_max_pow2:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $2, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: movl $1, %ecx
-; CHECK-FMA-NEXT: shlxl %edi, %ecx, %ecx
-; CHECK-FMA-NEXT: cmpw %ax, %cx
-; CHECK-FMA-NEXT: cmoval %ecx, %eax
-; CHECK-FMA-NEXT: movzwl %ax, %eax
-; CHECK-FMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_mul_max_pow2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -852,48 +609,6 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
}
define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rsi, %rcx
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rdi
-; CHECK-SSE-NEXT: movq %rdi, %xmm1
-; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-SSE-NEXT: addsd %xmm1, %xmm0
-; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rsi, %rcx
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rdi
-; CHECK-AVX2-NEXT: vmovq %rdi, %xmm0
-; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rsi, %rcx
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rdi
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: shlxq %rsi, %rdi, %rax
-; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -938,90 +653,6 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
}
define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [2,2]
-; CHECK-SSE-NEXT: movdqa %xmm3, %xmm1
-; CHECK-SSE-NEXT: psllq %xmm2, %xmm1
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm3
-; CHECK-SSE-NEXT: movq %xmm3, %rax
-; CHECK-SSE-NEXT: testq %rax, %rax
-; CHECK-SSE-NEXT: js .LBB6_1
-; CHECK-SSE-NEXT: # %bb.2:
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
-; CHECK-SSE-NEXT: jmp .LBB6_3
-; CHECK-SSE-NEXT: .LBB6_1:
-; CHECK-SSE-NEXT: movq %rax, %rcx
-; CHECK-SSE-NEXT: shrq %rcx
-; CHECK-SSE-NEXT: andl $1, %eax
-; CHECK-SSE-NEXT: orq %rcx, %rax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
-; CHECK-SSE-NEXT: addss %xmm0, %xmm0
-; CHECK-SSE-NEXT: .LBB6_3:
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE-NEXT: movq %xmm1, %rax
-; CHECK-SSE-NEXT: testq %rax, %rax
-; CHECK-SSE-NEXT: js .LBB6_4
-; CHECK-SSE-NEXT: # %bb.5:
-; CHECK-SSE-NEXT: xorps %xmm1, %xmm1
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: jmp .LBB6_6
-; CHECK-SSE-NEXT: .LBB6_4:
-; CHECK-SSE-NEXT: movq %rax, %rcx
-; CHECK-SSE-NEXT: shrq %rcx
-; CHECK-SSE-NEXT: andl $1, %eax
-; CHECK-SSE-NEXT: orq %rcx, %rax
-; CHECK-SSE-NEXT: xorps %xmm1, %xmm1
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: addss %xmm1, %xmm1
-; CHECK-SSE-NEXT: .LBB6_6:
-; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
-; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vmovq %xmm1, %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
-; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; CHECK-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
-; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
-; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
-; CHECK-NO-FASTFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
-; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtuqq2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1110,60 +741,6 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
}
define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,2]
-; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm2
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm1
-; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295]
-; CHECK-SSE-NEXT: andpd %xmm1, %xmm0
-; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: psrlq $32, %xmm1
-; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: addpd %xmm0, %xmm1
-; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
-; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
-; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
-; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt_vec:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1228,64 +805,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
}
define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float> %add) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: pslld $23, %xmm0
-; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2]
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm0
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; CHECK-SSE-NEXT: pand %xmm0, %xmm2
-; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-SSE-NEXT: psrld $16, %xmm0
-; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: addps %xmm2, %xmm0
-; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: addps %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
-; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; CHECK-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1392508928,1392508928,1392508928,1392508928]
-; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
-; CHECK-AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0]
-; CHECK-AVX2-NEXT: vmulps %xmm2, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
-; CHECK-NO-FASTFMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtudq2ps %zmm0, %zmm0
-; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0]
-; CHECK-NO-FASTFMA-NEXT: vmulps %xmm2, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vzeroupper
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
-; CHECK-FMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
-; CHECK-FMA-NEXT: vcvtudq2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1356,60 +875,6 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float
}
define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,2]
-; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm2
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm1
-; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295]
-; CHECK-SSE-NEXT: andpd %xmm1, %xmm0
-; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: psrlq $32, %xmm1
-; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: addpd %xmm0, %xmm1
-; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
-; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
-; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
-; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1476,60 +941,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
}
define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,1]
-; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm2
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm1
-; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295]
-; CHECK-SSE-NEXT: andpd %xmm1, %xmm0
-; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: psrlq $32, %xmm1
-; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: addpd %xmm0, %xmm1
-; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1]
-; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1]
-; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vmovdqa {{.*#+}} xmm1 = [2,1]
-; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1594,158 +1005,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi
}
define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: subq $40, %rsp
-; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; CHECK-SSE-NEXT: pslld $23, %xmm0
-; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-SSE-NEXT: pextrw $1, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
-; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
-; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE-NEXT: addq $40, %rsp
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: subq $40, %rsp
-; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
-; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
-; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
-; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-AVX2-NEXT: addq $40, %rsp
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NO-FASTFMA-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0]
-; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
-; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm0, %ymm1
-; CHECK-NO-FASTFMA-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpextrw $1, %xmm1, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm1, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NO-FASTFMA-NEXT: vmovaps {{.*#+}} xmm1 = [16,0,0,0]
-; CHECK-NO-FASTFMA-NEXT: xorl %eax, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm2, %eax
-; CHECK-NO-FASTFMA-NEXT: vmovd %eax, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vpbroadcastw %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vpermt2ps %zmm0, %zmm1, %zmm2
-; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm2, %ymm0
-; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1]
-; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vzeroupper
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
-; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm1
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-FMA-NEXT: vmovd %xmm1, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-FMA-NEXT: vmovd %xmm2, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-FMA-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm3, %xmm3
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; CHECK-FMA-NEXT: vmovd %xmm3, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-FMA-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm4
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm2
-; CHECK-FMA-NEXT: vmovd %xmm2, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-FMA-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-FMA-NEXT: vmovd %xmm4, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-FMA-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm2
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-FMA-NEXT: vmovd %xmm2, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-FMA-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm3
-; CHECK-FMA-NEXT: vmovd %xmm3, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-FMA-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovd %xmm0, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-FMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0
-; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; CHECK-FMA-NEXT: vzeroupper
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1789,52 +1048,6 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
}
define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: movq %rax, %xmm1
-; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
-; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-SSE-NEXT: addsd %xmm1, %xmm0
-; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: vmovq %rax, %xmm0
-; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1883,47 +1096,6 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
}
define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
-; CHECK-SSE-LABEL: fmul_pow_shl_cnt_safe:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: movzwl %ax, %eax
-; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0
-; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_safe:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: movzwl %ax, %eax
-; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_safe:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fmul_pow_shl_cnt_safe:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: movzwl %ax, %eax
-; CHECK-FMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fmul_pow_shl_cnt_safe:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1964,66 +1136,6 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
}
define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
-; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm2
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm1
-; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295]
-; CHECK-SSE-NEXT: andpd %xmm1, %xmm0
-; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: psrlq $32, %xmm1
-; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE-NEXT: addpd %xmm0, %xmm1
-; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0]
-; CHECK-SSE-NEXT: divpd %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
-; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
-; CHECK-AVX2-NEXT: # xmm1 = mem[0,0]
-; CHECK-AVX2-NEXT: vdivpd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_vec:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
-; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
-; CHECK-NO-FASTFMA-NEXT: # xmm1 = mem[0,0]
-; CHECK-NO-FASTFMA-NEXT: vdivpd %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
-; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
-; CHECK-FMA-NEXT: # xmm1 = mem[0,0]
-; CHECK-FMA-NEXT: vdivpd %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt_vec:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2065,94 +1177,6 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
}
define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
-; CHECK-SSE-NEXT: movdqa %xmm3, %xmm2
-; CHECK-SSE-NEXT: psllq %xmm1, %xmm2
-; CHECK-SSE-NEXT: psllq %xmm0, %xmm3
-; CHECK-SSE-NEXT: movq %xmm3, %rax
-; CHECK-SSE-NEXT: testq %rax, %rax
-; CHECK-SSE-NEXT: js .LBB15_1
-; CHECK-SSE-NEXT: # %bb.2:
-; CHECK-SSE-NEXT: xorps %xmm1, %xmm1
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: jmp .LBB15_3
-; CHECK-SSE-NEXT: .LBB15_1:
-; CHECK-SSE-NEXT: movq %rax, %rcx
-; CHECK-SSE-NEXT: shrq %rcx
-; CHECK-SSE-NEXT: andl $1, %eax
-; CHECK-SSE-NEXT: orq %rcx, %rax
-; CHECK-SSE-NEXT: xorps %xmm1, %xmm1
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: addss %xmm1, %xmm1
-; CHECK-SSE-NEXT: .LBB15_3:
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
-; CHECK-SSE-NEXT: movq %xmm0, %rax
-; CHECK-SSE-NEXT: testq %rax, %rax
-; CHECK-SSE-NEXT: js .LBB15_4
-; CHECK-SSE-NEXT: # %bb.5:
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
-; CHECK-SSE-NEXT: jmp .LBB15_6
-; CHECK-SSE-NEXT: .LBB15_4:
-; CHECK-SSE-NEXT: movq %rax, %rcx
-; CHECK-SSE-NEXT: shrq %rcx
-; CHECK-SSE-NEXT: andl $1, %eax
-; CHECK-SSE-NEXT: orq %rcx, %rax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
-; CHECK-SSE-NEXT: addss %xmm0, %xmm0
-; CHECK-SSE-NEXT: .LBB15_6:
-; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = <1.0E+0,1.0E+0,u,u>
-; CHECK-SSE-NEXT: divps %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
-; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
-; CHECK-AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vmovq %xmm1, %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
-; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; CHECK-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; CHECK-AVX2-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
-; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; CHECK-NO-FASTFMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
-; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtuqq2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; CHECK-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2187,65 +1211,6 @@ define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nou
}
define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: movl $8, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: testq %rax, %rax
-; CHECK-SSE-NEXT: js .LBB16_1
-; CHECK-SSE-NEXT: # %bb.2:
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: jmp .LBB16_3
-; CHECK-SSE-NEXT: .LBB16_1:
-; CHECK-SSE-NEXT: shrq %rax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: addss %xmm1, %xmm1
-; CHECK-SSE-NEXT: .LBB16_3:
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: divss %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: movl $8, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: testq %rax, %rax
-; CHECK-AVX2-NEXT: js .LBB16_1
-; CHECK-AVX2-NEXT: # %bb.2:
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: jmp .LBB16_3
-; CHECK-AVX2-NEXT: .LBB16_1:
-; CHECK-AVX2-NEXT: shrq %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: .LBB16_3:
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $8, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2337,47 +1302,6 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
}
define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: movl $8, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: divss %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: movl $8, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $8, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2483,51 +1407,6 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
}
define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: andb $31, %cl
-; CHECK-SSE-NEXT: movl $8, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: divss %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: andb $31, %cl
-; CHECK-AVX2-NEXT: movl $8, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: andb $31, %cl
-; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: andb $31, %dil
-; CHECK-FMA-NEXT: movl $8, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2560,70 +1439,6 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
}
define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: pushq %rax
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: divss %xmm0, %xmm1
-; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: popq %rax
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: pushq %rax
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: popq %rax
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovd %xmm0, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2674,74 +1489,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
}
define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: pushq %rax
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: movzwl %ax, %eax
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: divss %xmm0, %xmm1
-; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: popq %rax
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_in_bounds:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: pushq %rax
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: movzwl %ax, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: popq %rax
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_in_bounds:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_in_bounds:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: movzwl %ax, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovd %xmm0, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt_in_bounds:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2770,74 +1517,6 @@ define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
}
define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds2:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: pushq %rax
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: movzwl %ax, %eax
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: divss %xmm0, %xmm1
-; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: popq %rax
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_in_bounds2:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: pushq %rax
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: movzwl %ax, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: popq %rax
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_in_bounds2:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_in_bounds2:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: movzwl %ax, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovd %xmm0, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt_in_bounds2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2866,74 +1545,6 @@ define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind {
}
define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: pushq %rax
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: movzwl %ax, %eax
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: divss %xmm0, %xmm1
-; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
-; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: popq %rax
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: pushq %rax
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: movzwl %ax, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: popq %rax
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: movzwl %ax, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovd %xmm0, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2980,47 +1591,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
}
define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: cvtsi2sd %rax, %xmm1
-; CHECK-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-SSE-NEXT: divsd %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-AVX2-NEXT: vdivsd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NO-FASTFMA-NEXT: vdivsd %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: vcvtusi2sd %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-FMA-NEXT: vdivsd %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3056,47 +1626,6 @@ define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
}
define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: divss %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3164,47 +1693,6 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
}
define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
-; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_okay:
-; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movl %edi, %ecx
-; CHECK-SSE-NEXT: movl $1, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-SSE-NEXT: shll %cl, %eax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-NEXT: divss %xmm1, %xmm0
-; CHECK-SSE-NEXT: retq
-;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_okay:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movl %edi, %ecx
-; CHECK-AVX2-NEXT: movl $1, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-AVX2-NEXT: shll %cl, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_okay:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
-; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_okay:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $1, %eax
-; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
; VI-LABEL: fdiv_pow_shl_cnt32_okay:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll b/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll
deleted file mode 100644
index b35de03..0000000
--- a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll
+++ /dev/null
@@ -1,141 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx941 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -mattr=-forcestoresc1 < %s | FileCheck --check-prefixes=GCN,NOSC0SC1 %s
-
-define amdgpu_kernel void @store_global(ptr addrspace(1) %ptr) {
-; FORCESC0SC1-LABEL: store_global:
-; FORCESC0SC1: ; %bb.0: ; %entry
-; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0
-; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_global:
-; NOSC0SC1: ; %bb.0: ; %entry
-; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0
-; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0
-; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1]
-; NOSC0SC1-NEXT: s_endpgm
-entry:
- store float 1.000000e+00, ptr addrspace(1) %ptr, align 4
- ret void
-}
-
-define amdgpu_kernel void @store_flat(ptr addrspace(0) %ptr) {
-; FORCESC0SC1-LABEL: store_flat:
-; FORCESC0SC1: ; %bb.0: ; %entry
-; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v2, 1.0
-; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; FORCESC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; FORCESC0SC1-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_flat:
-; NOSC0SC1: ; %bb.0: ; %entry
-; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOSC0SC1-NEXT: v_mov_b32_e32 v2, 1.0
-; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; NOSC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; NOSC0SC1-NEXT: flat_store_dword v[0:1], v2
-; NOSC0SC1-NEXT: s_endpgm
-entry:
- store float 1.000000e+00, ptr addrspace(0) %ptr, align 4
- ret void
-}
-
-define amdgpu_kernel void @store_lds(ptr addrspace(3) %ptr) {
-; GCN-LABEL: store_lds:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v0, 1.0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: ds_write_b32 v1, v0
-; GCN-NEXT: s_endpgm
-entry:
- store float 1.000000e+00, ptr addrspace(3) %ptr, align 4
- ret void
-}
-
-define amdgpu_kernel void @store_scratch(ptr addrspace(5) %ptr) {
-; FORCESC0SC1-LABEL: store_scratch:
-; FORCESC0SC1: ; %bb.0: ; %entry
-; FORCESC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 1.0
-; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; FORCESC0SC1-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_scratch:
-; NOSC0SC1: ; %bb.0: ; %entry
-; NOSC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24
-; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 1.0
-; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; NOSC0SC1-NEXT: scratch_store_dword off, v0, s0
-; NOSC0SC1-NEXT: s_endpgm
-entry:
- store float 1.000000e+00, ptr addrspace(5) %ptr, align 4
- ret void
-}
-
-define amdgpu_ps void @store_buffer(<4 x i32> inreg %rsrc, float %data, i32 %index) {
-; FORCESC0SC1-LABEL: store_buffer:
-; FORCESC0SC1: ; %bb.0: ; %main_body
-; FORCESC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_buffer:
-; NOSC0SC1: ; %bb.0: ; %main_body
-; NOSC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen
-; NOSC0SC1-NEXT: s_endpgm
-main_body:
- call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-define amdgpu_kernel void @store_global_atomic(ptr addrspace(1) %ptr) {
-; FORCESC0SC1-LABEL: store_global_atomic:
-; FORCESC0SC1: ; %bb.0: ; %entry
-; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0
-; FORCESC0SC1-NEXT: buffer_wbl2 sc1
-; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_global_atomic:
-; NOSC0SC1: ; %bb.0: ; %entry
-; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0
-; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0
-; NOSC0SC1-NEXT: buffer_wbl2 sc1
-; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc1
-; NOSC0SC1-NEXT: s_endpgm
-entry:
- store atomic float 1.000000e+00, ptr addrspace(1) %ptr syncscope("agent-one-as") seq_cst, align 4
- ret void
-}
-
-define amdgpu_kernel void @store_global_atomic_system(ptr addrspace(1) %ptr) {
-; GCN-LABEL: store_global_atomic_system:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
- store atomic float 1.000000e+00, ptr addrspace(1) %ptr monotonic, align 4
- ret void
-}
-
-
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 121fab5..0835b43 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -2,7 +2,6 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
-declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1)
declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg)
@@ -24,89 +23,6 @@ declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
-define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
-; GFX90A-LABEL: buffer_atomic_add_noret_f64:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
-; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: buffer_atomic_add_noret_f64:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
-; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, s8
-; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
-; GFX90A-LABEL: buffer_atomic_add_rtn_f64:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: buffer_atomic_add_rtn_f64:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- store double %ret, ptr undef
- ret void
-}
-
-define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
-; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
-; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
-; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- store double %ret, ptr addrspace(1) %out, align 8
- ret void
-}
-
define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
; GFX90A: ; %bb.0: ; %main_body
@@ -1186,7 +1102,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB42_3
+; GFX90A-NEXT: s_cbranch_execz .LBB39_3
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
@@ -1198,7 +1114,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB42_2: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
@@ -1210,8 +1126,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB42_2
-; GFX90A-NEXT: .LBB42_3:
+; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
+; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
@@ -1221,7 +1137,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB42_2
+; GFX940-NEXT: s_cbranch_execz .LBB39_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1233,7 +1149,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: .LBB42_2:
+; GFX940-NEXT: .LBB39_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
@@ -1248,7 +1164,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB43_2
+; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1259,7 +1175,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: .LBB43_2:
+; GFX90A-NEXT: .LBB40_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent:
@@ -1269,7 +1185,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB43_2
+; GFX940-NEXT: s_cbranch_execz .LBB40_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1281,7 +1197,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: .LBB43_2:
+; GFX940-NEXT: .LBB40_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1296,7 +1212,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB44_3
+; GFX90A-NEXT: s_cbranch_execz .LBB41_3
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
@@ -1308,7 +1224,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB44_2: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
@@ -1320,8 +1236,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB44_2
-; GFX90A-NEXT: .LBB44_3:
+; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
+; GFX90A-NEXT: .LBB41_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system:
@@ -1331,7 +1247,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB44_2
+; GFX940-NEXT: s_cbranch_execz .LBB41_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1343,7 +1259,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: .LBB44_2:
+; GFX940-NEXT: .LBB41_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1358,7 +1274,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB45_2
+; GFX90A-NEXT: s_cbranch_execz .LBB42_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1369,7 +1285,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: .LBB45_2:
+; GFX90A-NEXT: .LBB42_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush:
@@ -1379,7 +1295,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB45_2
+; GFX940-NEXT: s_cbranch_execz .LBB42_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1391,7 +1307,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: .LBB45_2:
+; GFX940-NEXT: .LBB42_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1423,7 +1339,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
@@ -1436,7 +1352,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
@@ -1488,7 +1404,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
@@ -1501,7 +1417,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
@@ -1568,7 +1484,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB52_3
+; GFX90A-NEXT: s_cbranch_execz .LBB49_3
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
@@ -1580,7 +1496,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB52_2: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
@@ -1590,8 +1506,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB52_2
-; GFX90A-NEXT: .LBB52_3:
+; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
+; GFX90A-NEXT: .LBB49_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
@@ -1601,7 +1517,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB52_2
+; GFX940-NEXT: s_cbranch_execz .LBB49_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1613,7 +1529,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: .LBB52_2:
+; GFX940-NEXT: .LBB49_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1629,7 +1545,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
@@ -1642,7 +1558,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
;
@@ -1700,7 +1616,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
@@ -1714,7 +1630,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
;
@@ -1740,7 +1656,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
@@ -1753,7 +1669,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
@@ -1805,7 +1721,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
@@ -1819,7 +1735,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
@@ -1896,7 +1812,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
@@ -1907,7 +1823,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
;
@@ -2123,7 +2039,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB70_2
+; GFX90A-NEXT: s_cbranch_execz .LBB67_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2133,7 +2049,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB70_2:
+; GFX90A-NEXT: .LBB67_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat:
@@ -2143,7 +2059,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB70_2
+; GFX940-NEXT: s_cbranch_execz .LBB67_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2153,7 +2069,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB70_2:
+; GFX940-NEXT: .LBB67_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2168,7 +2084,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB71_2
+; GFX90A-NEXT: s_cbranch_execz .LBB68_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2178,7 +2094,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB71_2:
+; GFX90A-NEXT: .LBB68_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush:
@@ -2188,7 +2104,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB71_2
+; GFX940-NEXT: s_cbranch_execz .LBB68_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2198,7 +2114,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB71_2:
+; GFX940-NEXT: .LBB68_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2213,7 +2129,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB72_2
+; GFX90A-NEXT: s_cbranch_execz .LBB69_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2223,7 +2139,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB72_2:
+; GFX90A-NEXT: .LBB69_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
@@ -2233,7 +2149,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB72_2
+; GFX940-NEXT: s_cbranch_execz .LBB69_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2243,7 +2159,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB72_2:
+; GFX940-NEXT: .LBB69_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
deleted file mode 100644
index eed648f..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ /dev/null
@@ -1,209 +0,0 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
-
-;CHECK-LABEL: {{^}}test1:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc
-;SICI: v_mov_b32_e32 v1, 0x2000
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
-;CHECK-DAG: s_waitcnt vmcnt(0)
-;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
-;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
-define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
-main_body:
- %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
- %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
- %ofs.5 = add i32 %voffset, 42
- %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
- %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
- %unused = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %out = bitcast i32 %o6 to float
- ret float %out
-}
-
-;CHECK-LABEL: {{^}}test11:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0 glc
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], v[1:2], s[0:3], 0 idxen offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen offset:42 glc
-;CHECK-DAG: s_waitcnt vmcnt(0)
-;SICI: buffer_atomic_swap_x2 v[3:4], v0, s[0:3], 0 offen glc
-;VI: buffer_atomic_swap_x2 v[3:4], off, s[0:3], [[SOFS]] offset:4 glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0{{$}}
-define amdgpu_ps float @test11(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
-main_body:
- %o0 = sext i32 %data to i64
- %o1 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o0, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %o2 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %o3 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
- %o4 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
- %ofs.5 = add i32 %voffset, 42
- %o5 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
- %o6 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
- %unused = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %o7 = trunc i64 %o6 to i32
- %out = bitcast i32 %o7 to float
- ret float %out
-}
-
-;CHECK-LABEL: {{^}}test2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc
-define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
-main_body:
- %t1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t7 = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t8 = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %out = bitcast i32 %t9 to float
- ret float %out
-}
-
-;CHECK-LABEL: {{^}}test3:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_add_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_sub_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_smin_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_umin_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_smax_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_umax_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_and_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_or_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_xor_x2 v[0:1], v2, s[0:3], 0 idxen glc
-define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
-main_body:
- %t0 = sext i32 %data to i64
- %t1 = call i64 @llvm.amdgcn.buffer.atomic.add.i64(i64 %t0, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t2 = call i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t3 = call i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t4 = call i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t5 = call i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t6 = call i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t7 = call i64 @llvm.amdgcn.buffer.atomic.and.i64(i64 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t8 = call i64 @llvm.amdgcn.buffer.atomic.or.i64(i64 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t9 = call i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t10 = trunc i64 %t9 to i32
- %out = bitcast i32 %t10 to float
- ret float %out
-}
-
-; Ideally, we would teach tablegen & friends that cmpswap only modifies the
-; first vgpr. Since we don't do that yet, the register allocator will have to
-; create copies which we don't bother to track here.
-;
-;CHECK-LABEL: {{^}}test4:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
-;CHECK: s_waitcnt vmcnt(0)
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:44 glc
-;CHECK-DAG: s_waitcnt vmcnt(0)
-;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc
-;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc
-define amdgpu_ps float @test4(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
-main_body:
- %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
- %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
- %ofs.5 = add i32 %voffset, 44
- %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
- %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
-
-; Detecting the no-return variant doesn't work right now because of how the
-; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
-; Since there probably isn't a reasonable use-case of cmpswap that discards
-; the return value, that seems okay.
-;
-; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %out = bitcast i32 %o6 to float
- ret float %out
-}
-
-;CHECK-LABEL: {{^}}test7:
-;CHECK: buffer_atomic_add v0,
-define amdgpu_ps float @test7() {
-main_body:
- %v = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 0, i32 4, i1 false)
- %v.float = bitcast i32 %v to float
- ret float %v.float
-}
-
-declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.and.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.or.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64, <4 x i32>, i32, i32, i1) #0
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
index fdbe6db..659842a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
@@ -1,26 +1,6 @@
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,SI
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,GCNX3
-;CHECK-LABEL: {{^}}buffer_load_format_immoffs_x3:
-;SI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
-;GCNX3: buffer_load_format_xyz v[0:2], off, s[0:3], 0 offset:42
-;CHECK: s_waitcnt
-define amdgpu_ps <3 x float> @buffer_load_format_immoffs_x3(<4 x i32> inreg) {
-main_body:
- %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret <3 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs_x3:
-;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
-;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40
-;CHECK: s_waitcnt
-define amdgpu_ps <3 x float> @buffer_load_immoffs_x3(<4 x i32> inreg) {
-main_body:
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0)
- ret <3 x float> %data
-}
-
;CHECK-LABEL: {{^}}buffer_raw_load_immoffs_x3:
;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40
@@ -81,8 +61,6 @@ main_body:
ret <3 x float> %data
}
-declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #0
declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) #0
declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #0
declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll
deleted file mode 100644
index 3d67dfd..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-
-; GCN-LABEL: {{^}}buffer_load_format_d16_x:
-; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0
-define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- ret half %data
-}
-
-; GCN-LABEL: {{^}}buffer_load_format_d16_xy:
-; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0
-; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
-define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- %elt = extractelement <2 x half> %data, i32 1
- ret half %elt
-}
-
-; GCN-LABEL: {{^}}buffer_load_format_d16_xyz:
-; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- %elt = extractelement <3 x half> %data, i32 2
- ret half %elt
-}
-
-; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw:
-; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
-define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- %elt = extractelement <4 x half> %data, i32 3
- ret half %elt
-}
-
-declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1)
-declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1)
-declare <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32>, i32, i32, i1, i1)
-declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
deleted file mode 100644
index 6851302..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
+++ /dev/null
@@ -1,133 +0,0 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
-
-;CHECK-LABEL: {{^}}buffer_load:
-;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
-;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc
-;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
- %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
- %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
- %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
- %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
- %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
- ret {<4 x float>, <4 x float>, <4 x float>} %r2
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs:
-;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
-;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x1038
-;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen
-;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
-;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092
-;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
-;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
-main_body:
- %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4152, i1 0, i1 0)
- %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36856, i1 0, i1 0)
- %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0)
- %d.3 = fadd <4 x float> %d.0, %d.1
- %data = fadd <4 x float> %d.2, %d.3
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse:
-;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xffc
-;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:68
-;VI-NOT: s_mov
-;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:84
-;VI: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) {
-main_body:
- %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0)
- %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0)
- %data = fadd <4 x float> %d.0, %d.1
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_idx:
-;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ofs:
-;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
-;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
-main_body:
- %ofs = add i32 %1, 60
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_both:
-;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_both_reversed:
-;CHECK: v_mov_b32_e32 v2, v0
-;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x:
-;CHECK: buffer_load_format_x v0, off, s[0:3], 0
-;CHECK: s_waitcnt
-define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- ret float %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_xy:
-;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0
-;CHECK: s_waitcnt
-define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- ret <2 x float> %data
-}
-
-declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-
-attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
deleted file mode 100644
index a209dcf..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ /dev/null
@@ -1,476 +0,0 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
-
-;CHECK-LABEL: {{^}}buffer_load:
-;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
-;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
- %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
- %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
- %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
- %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
- %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
- ret {<4 x float>, <4 x float>, <4 x float>} %r2
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs:
-;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
-;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen
-;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
-;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_idx:
-;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ofs:
-;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
-;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
-main_body:
- %ofs = add i32 %1, 60
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_both:
-;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_both_reversed:
-;CHECK: v_mov_b32_e32 v2, v0
-;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x1:
-;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
-main_body:
- %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
- ret float %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x2:
-;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
-main_body:
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
- ret <2 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_negative_offset:
-;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
-;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen
-define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
-main_body:
- %ofs.1 = add i32 %ofs, -16
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-; SI won't merge ds memory operations, because of the signed offset bug, so
-; we only have check lines for VI.
-; CHECK-LABEL: buffer_load_mmo:
-; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) {
-entry:
- store float 0.0, ptr addrspace(3) %lds
- %val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
- store float 0.0, ptr addrspace(3) %tmp2
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
-main_body:
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- %a3 = add i32 %a, 12
- %a4 = add i32 %a, 16
- %a5 = add i32 %a, 28
- %a6 = add i32 %a, 32
- %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
- %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0)
- %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0)
- %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
-main_body:
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- %a3 = add i32 %a, 12
- %a4 = add i32 %a, 16
- %a5 = add i32 %a, 28
- %a6 = add i32 %a, 32
- %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0)
- %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0)
- %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1)
- %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
-main_body:
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 12
- %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- %r1 = extractelement <2 x float> %vr1, i32 0
- %r2 = extractelement <2 x float> %vr1, i32 1
- %r3 = extractelement <2 x float> %vr2, i32 0
- %r4 = extractelement <2 x float> %vr2, i32 1
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged:
-;CHECK-NEXT: %bb.
-;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
-main_body:
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 12
- %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- %r1 = extractelement <2 x float> %vr1, i32 0
- %r2 = extractelement <2 x float> %vr1, i32 1
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
-main_body:
- %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0)
- %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) {
-main_body:
- %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- %r1 = extractelement <2 x float> %vr1, i32 0
- %r2 = extractelement <2 x float> %vr1, i32 1
- %r3 = extractelement <2 x float> %vr2, i32 0
- %r4 = extractelement <2 x float> %vr2, i32 1
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged:
-;CHECK-NEXT: %bb.
-;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) {
-main_body:
- %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- %r1 = extractelement <2 x float> %vr1, i32 0
- %r2 = extractelement <2 x float> %vr1, i32 1
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ubyte:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %val = uitofp i8 %tmp to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ushort:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:16
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- %tmp2 = zext i16 %tmp to i32
- %val = uitofp i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sbyte:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = sext i8 %tmp to i32
- %val = sitofp i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sshort:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:16
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- %tmp2 = sext i16 %tmp to i32
- %val = sitofp i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ubyte_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = zext i8 %tmp to i32
- %val = bitcast i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ushort_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = zext i16 %tmp to i32
- %val = bitcast i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sbyte_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = sext i8 %tmp to i32
- %val = bitcast i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sshort_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = sext i16 %tmp to i32
- %val = bitcast i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ubyte_mul_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0)
- %tmp2 = zext i8 %tmp to i32
- %tmp3 = mul i32 %tmp2, 255
- %val = bitcast i32 %tmp3 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ushort_mul_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0)
- %tmp2 = zext i16 %tmp to i32
- %tmp3 = mul i32 %tmp2, 255
- %val = bitcast i32 %tmp3 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sbyte_mul_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0)
- %tmp2 = sext i8 %tmp to i32
- %tmp3 = mul i32 %tmp2, 255
- %val = bitcast i32 %tmp3 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sshort_mul_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0)
- %tmp2 = sext i16 %tmp to i32
- %tmp3 = mul i32 %tmp2, 255
- %val = bitcast i32 %tmp3 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sbyte_type_check:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_bfe_i32 v{{[0-9]}}, v{{[0-9]}}, 0, 5
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = zext i8 %tmp to i32
- %tmp3 = shl i32 %tmp2, 27
- %tmp4 = ashr i32 %tmp3, 27
- %val = bitcast i32 %tmp4 to float
- ret float %val
-}
-
-; Make sure a frame index folding doesn't crash on a MUBUF not used
-; for stack access.
-
-; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
-; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
-define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
- %alloca = alloca i32, addrspace(5)
- %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32
-
- %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false)
- ret float %ret.val
-}
-
-; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
-; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 0{{$}}
-; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
-; CHECK: buffer_load_dword v0, v[[[FI]]:[[HI]]
-define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
- %alloca = alloca i32, addrspace(5)
- %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32
-
- %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false)
- ret float %ret.val
-}
-
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1) #0
-declare i16 @llvm.amdgcn.buffer.load.i16(<4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-
-attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
index 269956f..7723b56 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
@@ -1,23 +1,5 @@
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
-;CHECK-LABEL: {{^}}buffer_store_format_immoffs_x3:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_format_immoffs_x3(<4 x i32> inreg, <3 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_immoffs_x3:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret void
-}
-
;CHECK-LABEL: {{^}}raw_buffer_store_format_immoffs_x3:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42
@@ -72,8 +54,6 @@ main_body:
ret void
}
-declare void @llvm.amdgcn.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
deleted file mode 100644
index a8cabdc..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
-
-; GCN-LABEL: {{^}}buffer_store_format_d16_x:
-; GCN: s_load_dword s[[LO:[0-9]+]]
-; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
-; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_format_d16_xy:
-
-; UNPACKED: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}}
-; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
-; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-
-; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
-
-; UNPACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-
-; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
-; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
-
-; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1)
-declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1)
-declare void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i1, i1)
-declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
deleted file mode 100644
index 41e2b4d..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
+++ /dev/null
@@ -1,104 +0,0 @@
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}buffer_store:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0
-;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc
-define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_immoffs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_idx:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_ofs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_both:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_both_reversed:
-;CHECK: v_mov_b32_e32 v6, v4
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
- ret void
-}
-
-; Ideally, the register allocator would avoid the wait here
-;
-;CHECK-LABEL: {{^}}buffer_store_wait:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
-;VERDE: s_waitcnt expcnt(0)
-;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
deleted file mode 100644
index 8b18848..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ /dev/null
@@ -1,268 +0,0 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}buffer_store:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
-define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_immoffs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_idx:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_ofs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_both:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_both_reversed:
-;CHECK: v_mov_b32_e32 v6, v4
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
- ret void
-}
-
-; Ideally, the register allocator would avoid the wait here
-;
-;CHECK-LABEL: {{^}}buffer_store_wait:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
-;VERDE: s_waitcnt expcnt(0)
-;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
-main_body:
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
-define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- %a3 = add i32 %a, 12
- %a4 = add i32 %a, 16
- %a5 = add i32 %a, 28
- %a6 = add i32 %a, 32
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
-define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- %a3 = add i32 %a, 12
- %a4 = add i32 %a, 16
- %a5 = add i32 %a, 28
- %a6 = add i32 %a, 32
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1)
- call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 12
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
-define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) {
- %a1 = add i32 %a, 28
- %a2 = add i32 %a, 32
- %a3 = add i32 %a, 36
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 12
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
-define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) {
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) {
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) {
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8
-define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) {
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_byte:
-;CHECK-NOT: s_waitcnt
-;CHECK-NEXT: %bb.
-;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8
-define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
-main_body:
- %v2 = fptoui float %v1 to i32
- %v3 = trunc i32 %v2 to i8
- call void @llvm.amdgcn.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_short:
-;CHECK-NOT: s_waitcnt
-;CHECK-NEXT: %bb.
-;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16
-define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
-main_body:
- %v2 = fptoui float %v1 to i32
- %v3 = trunc i32 %v2 to i16
- call void @llvm.amdgcn.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.i8(i8, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
deleted file mode 100644
index bd1888b..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-declare void @llvm.amdgcn.buffer.wbinvl1() #0
-
-; GCN-LABEL: {{^}}test_buffer_wbinvl1:
-; GCN-NEXT: ; %bb.0:
-; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00]
-; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00]
-; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @test_buffer_wbinvl1() #0 {
- call void @llvm.amdgcn.buffer.wbinvl1()
- ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
deleted file mode 100644
index b937c42..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=SI %s
-
-declare void @llvm.amdgcn.buffer.wbinvl1.sc() #0
-
-; SI-LABEL: {{^}}test_buffer_wbinvl1_sc:
-; SI-NEXT: ; %bb.0:
-; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
-; SI-NEXT: s_endpgm
-define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 {
- call void @llvm.amdgcn.buffer.wbinvl1.sc()
- ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
deleted file mode 100644
index 64ab8ec..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-declare void @llvm.amdgcn.buffer.wbinvl1.vol() #0
-
-; GCN-LABEL: {{^}}test_buffer_wbinvl1_vol:
-; GCN-NEXT: ; %bb.0:
-; CI: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
-; VI: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00]
-; GCN: _store_byte
-; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @test_buffer_wbinvl1_vol(ptr addrspace(1) %ptr) #0 {
- call void @llvm.amdgcn.buffer.wbinvl1.vol()
-; This used to crash in hazard recognizer
- store i8 0, ptr addrspace(1) %ptr, align 1
- ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
index 2b0584d..d7dd0ce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
@@ -2,45 +2,8 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX12PLUS
-declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1)
declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
-; GCN-LABEL: {{^}}buffer_atomic_csub_rtn:
-; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc
-; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN
-define amdgpu_ps void @buffer_atomic_csub_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
-main_body:
- %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_csub_no_rtn:
-; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen
-; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen
-define amdgpu_ps void @buffer_atomic_csub_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 {
-main_body:
- %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_rtn:
-; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc
-; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
-define amdgpu_ps void @buffer_atomic_csub_off4_slc_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
-main_body:
- %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_no_rtn:
-; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 slc
-; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT
-define amdgpu_ps void @buffer_atomic_csub_off4_slc_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 {
-main_body:
- %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret void
-}
-
; GCN-LABEL: {{^}}global_atomic_csub_rtn:
; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll
index 1e1bc2b..af84105 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll
@@ -1,45 +1,8 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
-; RUN: not --crash llc < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
-declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
-declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
-; GFX908: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD
-
-; GFX90A-LABEL: {{^}}buffer_atomic_add_f32:
-; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen glc
-define amdgpu_ps float @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_off4_slc:
-; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 glc slc
-define amdgpu_ps float @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16:
-; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen glc
-define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret <2 x half> %ret
-}
-
-; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc:
-; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 glc slc
-define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret <2 x half> %ret
-}
-
; GFX90A-LABEL: {{^}}global_atomic_add_f32:
; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc
define amdgpu_ps float @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll
index bd07dd1..0c3ce33 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll
@@ -1,44 +1,10 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN
-declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
-declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr, float)
-; GCN-LABEL: {{^}}buffer_atomic_add_f32:
-; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_add_f32_off4_slc:
-; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 slc
-define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16:
-; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc:
-; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 slc
-define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret void
-}
-
; GCN-LABEL: {{^}}global_atomic_add_f32:
; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll
index 5c917c9..5d9daae 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll
@@ -1,67 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck -check-prefix=CHECK %s
-define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s11, s5
-; CHECK-NEXT: s_mov_b32 s10, s4
-; CHECK-NEXT: s_mov_b32 s9, s3
-; CHECK-NEXT: s_mov_b32 s8, s2
-; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen
-; CHECK-NEXT: s_endpgm
- %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24)
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 24
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
}
-define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
+define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s11, s5
-; CHECK-NEXT: s_mov_b32 s10, s4
-; CHECK-NEXT: s_mov_b32 s9, s3
-; CHECK-NEXT: s_mov_b32 s8, s2
-; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s6
-; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
-define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s11, s5
-; CHECK-NEXT: s_mov_b32 s10, s4
-; CHECK-NEXT: s_mov_b32 s9, s3
-; CHECK-NEXT: s_mov_b32 s8, s2
-; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s6 offen
-; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
-define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s11, s5
-; CHECK-NEXT: s_mov_b32 s10, s4
-; CHECK-NEXT: s_mov_b32 s9, s3
-; CHECK-NEXT: s_mov_b32 s8, s2
-; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s6 offset:92
-; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
ret void
}
-define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s11, s5
-; CHECK-NEXT: s_mov_b32 s10, s4
-; CHECK-NEXT: s_mov_b32 s9, s3
-; CHECK-NEXT: s_mov_b32 s8, s2
-; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen slc
-; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.xfail.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.xfail.ll
new file mode 100644
index 0000000..c8273b3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.xfail.ll
@@ -0,0 +1,8 @@
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: Do not know how to widen the result of this operator!
+
+define <6 x bfloat> @raw_ptr_buffer_load_v6bf16(ptr addrspace(8) inreg %rsrc) {
+ %val = call <6 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v6bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ ret <6 x bfloat> %val
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.xfail.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.xfail.ll
new file mode 100644
index 0000000..e636c10
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.xfail.ll
@@ -0,0 +1,11 @@
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; FIXME: This should be handled
+
+; CHECK: LLVM ERROR: Do not know how to widen this operator's operand!
+
+
+define void @buffer_store_v6bf16(ptr addrspace(8) inreg %rsrc, <6 x bfloat> %data, i32 %offset) {
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v6bf16(<6 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll
index 0bdb21f..5401de0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll
@@ -1,57 +1,50 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck %s -check-prefix=CHECK
-
-define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s11, s5
-; CHECK-NEXT: s_mov_b32 s10, s4
-; CHECK-NEXT: s_mov_b32 s9, s3
-; CHECK-NEXT: s_mov_b32 s8, s2
-; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen
-; CHECK-NEXT: s_endpgm
- %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 24
+ %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret void
}
; Natural mapping, no voffset
-define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s11, s5
-; CHECK-NEXT: s_mov_b32 s10, s4
-; CHECK-NEXT: s_mov_b32 s9, s3
-; CHECK-NEXT: s_mov_b32 s8, s2
-; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 idxen
-; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
-define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s11, s5
-; CHECK-NEXT: s_mov_b32 s10, s4
-; CHECK-NEXT: s_mov_b32 s9, s3
-; CHECK-NEXT: s_mov_b32 s8, s2
-; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen slc
-; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
-define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+define void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s11, s5
-; CHECK-NEXT: s_mov_b32 s10, s4
-; CHECK-NEXT: s_mov_b32 s9, s3
-; CHECK-NEXT: s_mov_b32 s8, s2
-; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s6 idxen offen
-; CHECK-NEXT: s_endpgm
- %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[4:7], s8 idxen offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %voffset.add = add i32 %voffset, 24
+ %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
index 98ed437..47b7658 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
@@ -313,7 +313,7 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i3
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 63, i32 0)
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0)
+ %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i32 0, i32 0)
%data.i = bitcast <4 x float> %data to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 46, i32 0)
ret void
@@ -620,7 +620,7 @@ declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32
declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
deleted file mode 100644
index 06d66ce..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
-; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-
-; GCN-LABEL: {{^}}tbuffer_load_d16_x:
-; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- ret half %data
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_d16_xy:
-; UNPACKED: tbuffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
-define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- %elt = extractelement <2 x half> %data, i32 1
- ret half %elt
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_d16_xyz:
-; UNPACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- %elt = extractelement <3 x half> %data, i32 2
- ret half %elt
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
-; UNPACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
-define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- %elt = extractelement <4 x half> %data, i32 3
- ret half %elt
-}
-
-declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
index de929c2..c89c5c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
@@ -23,18 +23,5 @@ main_body:
ret <3 x float> %vdata.f
}
-
-; GCN-LABEL: {{^}}tbuffer_load_format_immoffs_x3:
-; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
-; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
-define amdgpu_vs <3 x float> @tbuffer_load_format_immoffs_x3(<4 x i32> inreg) {
-main_body:
- %vdata = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <3 x i32> %vdata to <3 x float>
- ret <3 x float> %vdata.f
-}
-
declare <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32)
declare <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32)
-declare <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
deleted file mode 100644
index eb0dc37..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}tbuffer_load:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT]
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] glc
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] slc
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; GCN: s_waitcnt
-define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0)
- %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1)
- %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
- %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
- %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
- %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
- %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
- %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3
- ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_immoffs:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
-define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1
-; GCN: s_waitcnt
-define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0)
- %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0)
- %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
- %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
- %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
- %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
- %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
- ret {<4 x float>, <4 x float>, <4 x float>} %r2
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_idx:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen
-define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_ofs:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen
-define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen offset:52
-define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_both:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen
-define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-
-; GCN-LABEL: {{^}}buffer_load_xy:
-; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT]
-define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
- %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
- ret <2 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}buffer_load_x:
-; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT]
-define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
- %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
- %vdata.f = bitcast i32 %vdata to float
- ret float %vdata.f
-}
-
-declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
deleted file mode 100644
index 16d7de6..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,UNPACKED %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,PACKED %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,PACKED %s
-
-
-; GCN-LABEL: {{^}}tbuffer_store_d16_x:
-; GCN: s_load_dword s[[S_LO:[0-9]+]]
-; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
-; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) {
-main_body:
- call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_d16_xy:
-; GCN: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}}
-; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
-; UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-
-; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
-; UNPACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-
-; PACKED-DAG: s_and_b32 [[SHR0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
-; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR0]]
-; PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %vindex) {
-main_body:
- call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
-; UNPACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-
-; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
-; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
-; PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
-main_body:
- call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
index 83b5b0c..d5cbadd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
@@ -20,16 +20,6 @@ main_body:
ret void
}
-; GCN-LABEL: {{^}}tbuffer_store_immoffs_x3:
-; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
-define amdgpu_ps void @tbuffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
-main_body:
- %in1 = bitcast <3 x float> %1 to <3 x i32>
- call void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0)
- ret void
-}
-
declare void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
-declare void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
deleted file mode 100644
index b3a135a..0000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
+++ /dev/null
@@ -1,110 +0,0 @@
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}tbuffer_store:
-; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED]
-; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] glc
-; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] slc
-; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT]
-define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
-main_body:
- %in1 = bitcast <4 x float> %1 to <4 x i32>
- %in2 = bitcast <4 x float> %2 to <4 x i32>
- %in3 = bitcast <4 x float> %3 to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0)
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0)
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1)
- call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_immoffs:
-; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
-define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
-main_body:
- %in1 = bitcast <4 x float> %1 to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
-; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
-define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_idx:
-; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_ofs:
-; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen
-define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_both:
-; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen
-define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0)
- ret void
-}
-
-; Ideally, the register allocator would avoid the wait here
-;
-; GCN-LABEL: {{^}}buffer_store_wait:
-; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen
-; VERDE: s_waitcnt expcnt(0)
-; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
-; GCN: s_waitcnt vmcnt(0)
-; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0)
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0)
- %data.i = bitcast <4 x float> %data to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 14, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_x1:
-; GCN: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
-define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %data.i = bitcast float %data to i32
- call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_x2:
-; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) {
-main_body:
- %data.i = bitcast <2 x float> %data to <2 x i32>
- call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readonly }
-
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
index 462ab38..1f7de73 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -12,11 +12,11 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; R600: LDS_READ
; R600: LDS_READ
-; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
-; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1
-; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1
-; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1
+; OPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.y()
+; OPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.z()
+; OPT: call range(i32 0, 256) i32 @llvm.r600.read.tidig.x()
+; OPT: call range(i32 0, 256) i32 @llvm.r600.read.tidig.y()
+; OPT: call range(i32 0, 256) i32 @llvm.r600.read.tidig.z()
define amdgpu_kernel void @mova_same_clause(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
entry:
@@ -276,7 +276,4 @@ define amdgpu_kernel void @ptrtoint(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
ret void
}
-; OPT: !0 = !{i32 0, i32 257}
-; OPT: !1 = !{i32 0, i32 256}
-
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll
index ada1b84..778fe90 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll
@@ -5,9 +5,9 @@
; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
; CHECK: call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK: call i32 @llvm.amdgcn.workitem.id.x(), !range !2
-; CHECK: call i32 @llvm.amdgcn.workitem.id.y(), !range !2
-; CHECK: call i32 @llvm.amdgcn.workitem.id.z(), !range !2
+; CHECK: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
entry:
%tmp = alloca [2 x i32], addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index efc11bf..f566562 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -8,22 +8,22 @@
define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !invariant.load !0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
-; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP4]], 16
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul nuw nsw i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 [[TMP15]]
-; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[A:%.*]]
-; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 [[TMP14]]
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[B:%.*]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], [[PTR1]]
; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
@@ -50,21 +50,21 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(ptr add
define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_null_rhs(
; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !invariant.load !0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4, !range [[RNG1]], !invariant.load !0
-; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP4]], 16
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul nuw nsw i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 [[TMP15]]
-; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 [[TMP14]]
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], null
; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
@@ -89,21 +89,21 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(ptr addrspace(1) %o
define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_null_lhs(
; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !invariant.load !0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4, !range [[RNG1]], !invariant.load !0
-; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP4]], 16
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul nuw nsw i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 [[TMP15]]
-; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 [[TMP14]]
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(3) null, [[PTR0]]
; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll
index 0d6e987..ba04cdb 100644
--- a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll
@@ -14,9 +14,8 @@ define void @phi_vec1half_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: v_cvt_f32_f16_e64 v2, s4
; CHECK-NEXT: ; %bb.1: ; %bb
-; CHECK-NEXT: v_cvt_f16_f32_e64 v2, v2
+; CHECK-NEXT: v_cvt_f16_f32_e64 v2, s4
; CHECK-NEXT: s_mov_b32 s7, 0xf000
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s4, s6
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
index 5386ef4..64d4a0c 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
@@ -17,7 +17,7 @@ define amdgpu_ps float @uniform_phi_with_undef(float inreg %c, float %v, i32 %x,
; GCN-NEXT: s_mov_b32 exec_lo, s2
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: s_mov_b32 s2, 2.0
+; GCN-NEXT: s_mov_b32 s2, 0x40400000
; GCN-NEXT: v_div_scale_f32 v1, s3, s2, s2, v0
; GCN-NEXT: v_rcp_f32_e64 v2, v1
; GCN-NEXT: s_mov_b32 s3, 1.0
@@ -39,7 +39,7 @@ entry:
br i1 %cc, label %if, label %end
if:
- %v.if = fdiv float %v, 2.0
+ %v.if = fdiv float %v, 3.0
br label %end
end:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index 769db21..85c34e0 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -14,12 +14,12 @@ target datalayout = "A5"
; GCN-ALLOCA-COUNT-4: buffer_store_dword
; GCN-ALLOCA: buffer_load_dword
-; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
+; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
-; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
+; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
@@ -294,10 +294,10 @@ entry:
; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
-; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
+; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
-; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
+; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
diff --git a/llvm/test/CodeGen/ARM/arm-half-promote.ll b/llvm/test/CodeGen/ARM/arm-half-promote.ll
index a5fafd4..e1ab75b 100644
--- a/llvm/test/CodeGen/ARM/arm-half-promote.ll
+++ b/llvm/test/CodeGen/ARM/arm-half-promote.ll
@@ -116,9 +116,7 @@ define fastcc { <8 x half>, <8 x half> } @f3() {
define void @extract_insert(ptr %dst) optnone noinline {
; CHECK-LABEL: extract_insert:
-; CHECK: movs r1, #0
-; CHECK: vmov s0, r1
-; CHECK: vcvtb.f32.f16 s0, s0
+; CHECK: vmov.i32 d0, #0x0
; CHECK: vcvtb.f16.f32 s0, s0
; CHECK: vmov r1, s0
; CHECK: strh r1, [r0]
diff --git a/llvm/test/CodeGen/ARM/frem-power2.ll b/llvm/test/CodeGen/ARM/frem-power2.ll
index 71c2c09..63ecd9f 100644
--- a/llvm/test/CodeGen/ARM/frem-power2.ll
+++ b/llvm/test/CodeGen/ARM/frem-power2.ll
@@ -14,26 +14,28 @@ define float @frem4(float %x) {
;
; CHECK-FP-LABEL: frem4:
; CHECK-FP: @ %bb.0: @ %entry
-; CHECK-FP-NEXT: vmov.f32 s0, #4.000000e+00
-; CHECK-FP-NEXT: vmov s2, r0
+; CHECK-FP-NEXT: vmov.f32 s0, #2.500000e-01
+; CHECK-FP-NEXT: vmov.f32 s2, #-4.000000e+00
+; CHECK-FP-NEXT: vmov s4, r0
; CHECK-FP-NEXT: lsrs r0, r0, #31
-; CHECK-FP-NEXT: vdiv.f32 s4, s2, s0
-; CHECK-FP-NEXT: vrintz.f32 s4, s4
-; CHECK-FP-NEXT: vfms.f32 s2, s4, s0
-; CHECK-FP-NEXT: vmov r1, s2
+; CHECK-FP-NEXT: vmul.f32 s0, s4, s0
+; CHECK-FP-NEXT: vrintz.f32 s0, s0
+; CHECK-FP-NEXT: vfma.f32 s4, s0, s2
+; CHECK-FP-NEXT: vmov r1, s4
; CHECK-FP-NEXT: bfi r1, r0, #31, #1
; CHECK-FP-NEXT: mov r0, r1
; CHECK-FP-NEXT: bx lr
;
; CHECK-M33-LABEL: frem4:
; CHECK-M33: @ %bb.0: @ %entry
-; CHECK-M33-NEXT: vmov.f32 s0, #4.000000e+00
-; CHECK-M33-NEXT: vmov s2, r0
+; CHECK-M33-NEXT: vmov.f32 s0, #2.500000e-01
+; CHECK-M33-NEXT: vmov.f32 s2, #-4.000000e+00
+; CHECK-M33-NEXT: vmov s4, r0
; CHECK-M33-NEXT: lsrs r0, r0, #31
-; CHECK-M33-NEXT: vdiv.f32 s4, s2, s0
-; CHECK-M33-NEXT: vrintz.f32 s4, s4
-; CHECK-M33-NEXT: vmls.f32 s2, s4, s0
-; CHECK-M33-NEXT: vmov r1, s2
+; CHECK-M33-NEXT: vmul.f32 s0, s4, s0
+; CHECK-M33-NEXT: vrintz.f32 s0, s0
+; CHECK-M33-NEXT: vmla.f32 s4, s0, s2
+; CHECK-M33-NEXT: vmov r1, s4
; CHECK-M33-NEXT: bfi r1, r0, #31, #1
; CHECK-M33-NEXT: mov r0, r1
; CHECK-M33-NEXT: bx lr
@@ -53,22 +55,24 @@ define float @frem4_nsz(float %x) {
;
; CHECK-FP-LABEL: frem4_nsz:
; CHECK-FP: @ %bb.0: @ %entry
-; CHECK-FP-NEXT: vmov.f32 s0, #4.000000e+00
-; CHECK-FP-NEXT: vmov s2, r0
-; CHECK-FP-NEXT: vdiv.f32 s4, s2, s0
-; CHECK-FP-NEXT: vrintz.f32 s4, s4
-; CHECK-FP-NEXT: vfms.f32 s2, s4, s0
-; CHECK-FP-NEXT: vmov r0, s2
+; CHECK-FP-NEXT: vmov.f32 s0, #2.500000e-01
+; CHECK-FP-NEXT: vmov.f32 s2, #-4.000000e+00
+; CHECK-FP-NEXT: vmov s4, r0
+; CHECK-FP-NEXT: vmul.f32 s0, s4, s0
+; CHECK-FP-NEXT: vrintz.f32 s0, s0
+; CHECK-FP-NEXT: vfma.f32 s4, s0, s2
+; CHECK-FP-NEXT: vmov r0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-M33-LABEL: frem4_nsz:
; CHECK-M33: @ %bb.0: @ %entry
-; CHECK-M33-NEXT: vmov.f32 s0, #4.000000e+00
-; CHECK-M33-NEXT: vmov s2, r0
-; CHECK-M33-NEXT: vdiv.f32 s4, s2, s0
-; CHECK-M33-NEXT: vrintz.f32 s4, s4
-; CHECK-M33-NEXT: vmls.f32 s2, s4, s0
-; CHECK-M33-NEXT: vmov r0, s2
+; CHECK-M33-NEXT: vmov.f32 s0, #2.500000e-01
+; CHECK-M33-NEXT: vmov.f32 s2, #-4.000000e+00
+; CHECK-M33-NEXT: vmov s4, r0
+; CHECK-M33-NEXT: vmul.f32 s0, s4, s0
+; CHECK-M33-NEXT: vrintz.f32 s0, s0
+; CHECK-M33-NEXT: vmla.f32 s4, s0, s2
+; CHECK-M33-NEXT: vmov r0, s4
; CHECK-M33-NEXT: bx lr
entry:
%fmod = frem nsz float %x, 4.0
diff --git a/llvm/test/CodeGen/ARM/vdiv_combine.ll b/llvm/test/CodeGen/ARM/vdiv_combine.ll
index 9888446..899487f 100644
--- a/llvm/test/CodeGen/ARM/vdiv_combine.ll
+++ b/llvm/test/CodeGen/ARM/vdiv_combine.ll
@@ -5,10 +5,7 @@
define arm_aapcs_vfpcc <2 x float> @t1(<2 x i32> %vecinit2.i) nounwind {
; CHECK-LABEL: t1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s2, #8.000000e+00
-; CHECK-NEXT: vcvt.f32.s32 d2, d0
-; CHECK-NEXT: vdiv.f32 s1, s5, s2
-; CHECK-NEXT: vdiv.f32 s0, s4, s2
+; CHECK-NEXT: vcvt.f32.s32 d0, d0, #3
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -20,10 +17,7 @@ entry:
define arm_aapcs_vfpcc <2 x float> @t2(<2 x i32> %vecinit2.i) nounwind {
; CHECK-LABEL: t2:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s2, #8.000000e+00
-; CHECK-NEXT: vcvt.f32.u32 d2, d0
-; CHECK-NEXT: vdiv.f32 s1, s5, s2
-; CHECK-NEXT: vdiv.f32 s0, s4, s2
+; CHECK-NEXT: vcvt.f32.u32 d0, d0, #3
; CHECK-NEXT: bx lr
entry:
%vcvt.i = uitofp <2 x i32> %vecinit2.i to <2 x float>
@@ -56,17 +50,10 @@ entry:
define arm_aapcs_vfpcc <2 x float> @t4(<2 x i32> %vecinit2.i) nounwind {
; CHECK-LABEL: t4:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 d2, d0
-; CHECK-NEXT: vldr s2, LCPI3_0
-; CHECK-NEXT: vdiv.f32 s1, s5, s2
-; CHECK-NEXT: vdiv.f32 s0, s4, s2
+; CHECK-NEXT: vcvt.f32.s32 d16, d0
+; CHECK-NEXT: vmov.i32 d17, #0x2f000000
+; CHECK-NEXT: vmul.f32 d0, d16, d17
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .data_region
-; CHECK-NEXT: LCPI3_0:
-; CHECK-NEXT: .long 0x50000000 @ float 8.58993459E+9
-; CHECK-NEXT: .end_data_region
entry:
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
%div.i = fdiv <2 x float> %vcvt.i, <float 0x4200000000000000, float 0x4200000000000000>
@@ -77,17 +64,8 @@ entry:
define arm_aapcs_vfpcc <2 x float> @t5(<2 x i32> %vecinit2.i) nounwind {
; CHECK-LABEL: t5:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvt.f32.s32 d2, d0
-; CHECK-NEXT: vldr s2, LCPI4_0
-; CHECK-NEXT: vdiv.f32 s1, s5, s2
-; CHECK-NEXT: vdiv.f32 s0, s4, s2
+; CHECK-NEXT: vcvt.f32.s32 d0, d0, #32
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .data_region
-; CHECK-NEXT: LCPI4_0:
-; CHECK-NEXT: .long 0x4f800000 @ float 4.2949673E+9
-; CHECK-NEXT: .end_data_region
entry:
%vcvt.i = sitofp <2 x i32> %vecinit2.i to <2 x float>
%div.i = fdiv <2 x float> %vcvt.i, <float 0x41F0000000000000, float 0x41F0000000000000>
@@ -98,12 +76,7 @@ entry:
define arm_aapcs_vfpcc <4 x float> @t6(<4 x i32> %vecinit6.i) nounwind {
; CHECK-LABEL: t6:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, #8.000000e+00
-; CHECK-NEXT: vcvt.f32.s32 q2, q0
-; CHECK-NEXT: vdiv.f32 s3, s11, s4
-; CHECK-NEXT: vdiv.f32 s2, s10, s4
-; CHECK-NEXT: vdiv.f32 s1, s9, s4
-; CHECK-NEXT: vdiv.f32 s0, s8, s4
+; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <4 x i32> %vecinit6.i to <4 x float>
@@ -115,12 +88,7 @@ define arm_aapcs_vfpcc <4 x float> @fix_unsigned_i16_to_float(<4 x i16> %in) {
; CHECK-LABEL: fix_unsigned_i16_to_float:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmovl.u16 q8, d0
-; CHECK-NEXT: vmov.f32 s4, #2.000000e+00
-; CHECK-NEXT: vcvt.f32.u32 q2, q8
-; CHECK-NEXT: vdiv.f32 s3, s11, s4
-; CHECK-NEXT: vdiv.f32 s2, s10, s4
-; CHECK-NEXT: vdiv.f32 s1, s9, s4
-; CHECK-NEXT: vdiv.f32 s0, s8, s4
+; CHECK-NEXT: vcvt.f32.u32 q0, q8, #1
; CHECK-NEXT: bx lr
%conv = uitofp <4 x i16> %in to <4 x float>
%shift = fdiv <4 x float> %conv, <float 2.0, float 2.0, float 2.0, float 2.0>
@@ -131,12 +99,7 @@ define arm_aapcs_vfpcc <4 x float> @fix_signed_i16_to_float(<4 x i16> %in) {
; CHECK-LABEL: fix_signed_i16_to_float:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmovl.s16 q8, d0
-; CHECK-NEXT: vmov.f32 s4, #2.000000e+00
-; CHECK-NEXT: vcvt.f32.s32 q2, q8
-; CHECK-NEXT: vdiv.f32 s3, s11, s4
-; CHECK-NEXT: vdiv.f32 s2, s10, s4
-; CHECK-NEXT: vdiv.f32 s1, s9, s4
-; CHECK-NEXT: vdiv.f32 s0, s8, s4
+; CHECK-NEXT: vcvt.f32.s32 q0, q8, #1
; CHECK-NEXT: bx lr
%conv = sitofp <4 x i16> %in to <4 x float>
%shift = fdiv <4 x float> %conv, <float 2.0, float 2.0, float 2.0, float 2.0>
@@ -152,13 +115,12 @@ define arm_aapcs_vfpcc <2 x float> @fix_i64_to_float(<2 x i64> %in) {
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl ___floatundisf
; CHECK-NEXT: vmov r2, r1, d8
-; CHECK-NEXT: vmov s18, r0
-; CHECK-NEXT: vmov.f32 s16, #2.000000e+00
+; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov.i32 d8, #0x3f000000
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bl ___floatundisf
-; CHECK-NEXT: vmov s2, r0
-; CHECK-NEXT: vdiv.f32 s1, s18, s16
-; CHECK-NEXT: vdiv.f32 s0, s2, s16
+; CHECK-NEXT: vmov s18, r0
+; CHECK-NEXT: vmul.f32 d0, d9, d8
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {lr}
; CHECK-NEXT: bx lr
@@ -177,13 +139,13 @@ define arm_aapcs_vfpcc <2 x double> @fix_i64_to_double(<2 x i64> %in) {
; CHECK-NEXT: bl ___floatundidf
; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: vmov.f64 d8, #2.000000e+00
+; CHECK-NEXT: vmov.f64 d8, #5.000000e-01
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl ___floatundidf
; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vdiv.f64 d1, d9, d8
-; CHECK-NEXT: vdiv.f64 d0, d16, d8
+; CHECK-NEXT: vmul.f64 d1, d9, d8
+; CHECK-NEXT: vmul.f64 d0, d16, d8
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {lr}
; CHECK-NEXT: bx lr
@@ -196,19 +158,8 @@ define arm_aapcs_vfpcc <2 x double> @fix_i64_to_double(<2 x i64> %in) {
define arm_aapcs_vfpcc <8 x float> @test7(<8 x i32> %in) nounwind {
; CHECK-LABEL: test7:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov.f32 s12, #8.000000e+00
-; CHECK-NEXT: vcvt.f32.s32 q4, q0
-; CHECK-NEXT: vcvt.f32.s32 q2, q1
-; CHECK-NEXT: vdiv.f32 s3, s19, s12
-; CHECK-NEXT: vdiv.f32 s7, s11, s12
-; CHECK-NEXT: vdiv.f32 s2, s18, s12
-; CHECK-NEXT: vdiv.f32 s6, s10, s12
-; CHECK-NEXT: vdiv.f32 s1, s17, s12
-; CHECK-NEXT: vdiv.f32 s5, s9, s12
-; CHECK-NEXT: vdiv.f32 s0, s16, s12
-; CHECK-NEXT: vdiv.f32 s4, s8, s12
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vcvt.f32.s32 q0, q0, #3
+; CHECK-NEXT: vcvt.f32.s32 q1, q1, #3
; CHECK-NEXT: bx lr
entry:
%vcvt.i = sitofp <8 x i32> %in to <8 x float>
@@ -220,19 +171,8 @@ entry:
define arm_aapcs_vfpcc <4 x float> @test8(<4 x i32> %in) {
; CHECK-LABEL: test8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov.f32 s4, #2.000000e+00
-; CHECK-NEXT: vcvt.f32.s32 q2, q0
-; CHECK-NEXT: vdiv.f32 s2, s10, s4
-; CHECK-NEXT: vdiv.f32 s1, s9, s4
-; CHECK-NEXT: vdiv.f32 s0, s8, s4
-; CHECK-NEXT: vldr s3, LCPI11_0
+; CHECK-NEXT: vcvt.f32.s32 q0, q0, #1
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .data_region
-; CHECK-NEXT: LCPI11_0:
-; CHECK-NEXT: .long 0x7fc00000 @ float NaN
-; CHECK-NEXT: .end_data_region
%vcvt.i = sitofp <4 x i32> %in to <4 x float>
%div.i = fdiv <4 x float> %vcvt.i, <float 2.0, float 2.0, float 2.0, float undef>
ret <4 x float> %div.i
@@ -241,19 +181,8 @@ define arm_aapcs_vfpcc <4 x float> @test8(<4 x i32> %in) {
define arm_aapcs_vfpcc <3 x float> @test_illegal_int_to_fp(<3 x i32> %in) {
; CHECK-LABEL: test_illegal_int_to_fp:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov.f32 s4, #4.000000e+00
-; CHECK-NEXT: vcvt.f32.s32 q2, q0
-; CHECK-NEXT: vdiv.f32 s2, s10, s4
-; CHECK-NEXT: vdiv.f32 s1, s9, s4
-; CHECK-NEXT: vdiv.f32 s0, s8, s4
-; CHECK-NEXT: vldr s3, LCPI12_0
+; CHECK-NEXT: vcvt.f32.s32 q0, q0, #2
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .data_region
-; CHECK-NEXT: LCPI12_0:
-; CHECK-NEXT: .long 0x7fc00000 @ float NaN
-; CHECK-NEXT: .end_data_region
%conv = sitofp <3 x i32> %in to <3 x float>
%res = fdiv <3 x float> %conv, <float 4.0, float 4.0, float 4.0>
ret <3 x float> %res
diff --git a/llvm/test/CodeGen/ARM/vector-store.ll b/llvm/test/CodeGen/ARM/vector-store.ll
index a8a1031..a0a801d 100644
--- a/llvm/test/CodeGen/ARM/vector-store.ll
+++ b/llvm/test/CodeGen/ARM/vector-store.ll
@@ -1,8 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s | FileCheck %s
-
-target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
-target triple = "thumbv7-none-eabi"
+; RUN: llc < %s -mtriple=thumbv7-none-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+; RUN: llc < %s -mtriple=thumbebv7-none-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-BE
define void @store_v8i8(ptr %ptr, <8 x i8> %val) {
; CHECK-LABEL: store_v8i8:
@@ -11,24 +9,33 @@ define void @store_v8i8(ptr %ptr, <8 x i8> %val) {
; CHECK-NEXT: str r3, [r0, #4]
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <8 x i8> %val, ptr %A, align 1
- ret void
+ %A = load ptr, ptr %ptr
+ store <8 x i8> %val, ptr %A, align 1
+ ret void
}
define void @store_v8i8_update(ptr %ptr, <8 x i8> %val) {
-; CHECK-LABEL: store_v8i8_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <8 x i8> %val, ptr %A, align 1
- %inc = getelementptr <8 x i8>, ptr %A, i38 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v8i8_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v8i8_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 d16, d16
+; CHECK-BE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <8 x i8> %val, ptr %A, align 1
+ %inc = getelementptr <8 x i8>, ptr %A, i38 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v4i16(ptr %ptr, <4 x i16> %val) {
@@ -38,24 +45,33 @@ define void @store_v4i16(ptr %ptr, <4 x i16> %val) {
; CHECK-NEXT: str r3, [r0, #4]
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <4 x i16> %val, ptr %A, align 1
- ret void
+ %A = load ptr, ptr %ptr
+ store <4 x i16> %val, ptr %A, align 1
+ ret void
}
define void @store_v4i16_update(ptr %ptr, <4 x i16> %val) {
-; CHECK-LABEL: store_v4i16_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <4 x i16> %val, ptr %A, align 1
- %inc = getelementptr <4 x i16>, ptr %A, i34 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v4i16_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v4i16_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 d16, d16
+; CHECK-BE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <4 x i16> %val, ptr %A, align 1
+ %inc = getelementptr <4 x i16>, ptr %A, i34 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v2i32(ptr %ptr, <2 x i32> %val) {
@@ -65,24 +81,33 @@ define void @store_v2i32(ptr %ptr, <2 x i32> %val) {
; CHECK-NEXT: str r3, [r0, #4]
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x i32> %val, ptr %A, align 1
- ret void
+ %A = load ptr, ptr %ptr
+ store <2 x i32> %val, ptr %A, align 1
+ ret void
}
define void @store_v2i32_update(ptr %ptr, <2 x i32> %val) {
-; CHECK-LABEL: store_v2i32_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x i32> %val, ptr %A, align 1
- %inc = getelementptr <2 x i32>, ptr %A, i32 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v2i32_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v2i32_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 d16, d16
+; CHECK-BE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <2 x i32> %val, ptr %A, align 1
+ %inc = getelementptr <2 x i32>, ptr %A, i32 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v2f32(ptr %ptr, <2 x float> %val) {
@@ -92,24 +117,33 @@ define void @store_v2f32(ptr %ptr, <2 x float> %val) {
; CHECK-NEXT: str r3, [r0, #4]
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x float> %val, ptr %A, align 1
- ret void
+ %A = load ptr, ptr %ptr
+ store <2 x float> %val, ptr %A, align 1
+ ret void
}
define void @store_v2f32_update(ptr %ptr, <2 x float> %val) {
-; CHECK-LABEL: store_v2f32_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x float> %val, ptr %A, align 1
- %inc = getelementptr <2 x float>, ptr %A, i32 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v2f32_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v2f32_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 d16, d16
+; CHECK-BE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <2 x float> %val, ptr %A, align 1
+ %inc = getelementptr <2 x float>, ptr %A, i32 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v1i64(ptr %ptr, <1 x i64> %val) {
@@ -119,279 +153,458 @@ define void @store_v1i64(ptr %ptr, <1 x i64> %val) {
; CHECK-NEXT: str r3, [r0, #4]
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <1 x i64> %val, ptr %A, align 1
- ret void
+ %A = load ptr, ptr %ptr
+ store <1 x i64> %val, ptr %A, align 1
+ ret void
}
define void @store_v1i64_update(ptr %ptr, <1 x i64> %val) {
-; CHECK-LABEL: store_v1i64_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <1 x i64> %val, ptr %A, align 1
- %inc = getelementptr <1 x i64>, ptr %A, i31 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v1i64_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v1i64_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 d16, d16
+; CHECK-BE-NEXT: vst1.8 {d16}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <1 x i64> %val, ptr %A, align 1
+ %inc = getelementptr <1 x i64>, ptr %A, i31 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v16i8(ptr %ptr, <16 x i8> %val) {
-; CHECK-LABEL: store_v16i8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16, d17}, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <16 x i8> %val, ptr %A, align 1
- ret void
+; CHECK-LE-LABEL: store_v16i8:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: ldr r0, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v16i8:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r0, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <16 x i8> %val, ptr %A, align 1
+ ret void
}
define void @store_v16i8_update(ptr %ptr, <16 x i8> %val) {
-; CHECK-LABEL: store_v16i8_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vst1.8 {d16, d17}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <16 x i8> %val, ptr %A, align 1
- %inc = getelementptr <16 x i8>, ptr %A, i316 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v16i8_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v16i8_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <16 x i8> %val, ptr %A, align 1
+ %inc = getelementptr <16 x i8>, ptr %A, i316 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v8i16(ptr %ptr, <8 x i16> %val) {
-; CHECK-LABEL: store_v8i16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16, d17}, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <8 x i16> %val, ptr %A, align 1
- ret void
+; CHECK-LE-LABEL: store_v8i16:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: ldr r0, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v8i16:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r0, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <8 x i16> %val, ptr %A, align 1
+ ret void
}
define void @store_v8i16_update(ptr %ptr, <8 x i16> %val) {
-; CHECK-LABEL: store_v8i16_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vst1.8 {d16, d17}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <8 x i16> %val, ptr %A, align 1
- %inc = getelementptr <8 x i16>, ptr %A, i38 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v8i16_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v8i16_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <8 x i16> %val, ptr %A, align 1
+ %inc = getelementptr <8 x i16>, ptr %A, i38 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v4i32(ptr %ptr, <4 x i32> %val) {
-; CHECK-LABEL: store_v4i32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16, d17}, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <4 x i32> %val, ptr %A, align 1
- ret void
+; CHECK-LE-LABEL: store_v4i32:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: ldr r0, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v4i32:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r0, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <4 x i32> %val, ptr %A, align 1
+ ret void
}
define void @store_v4i32_update(ptr %ptr, <4 x i32> %val) {
-; CHECK-LABEL: store_v4i32_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vst1.8 {d16, d17}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <4 x i32> %val, ptr %A, align 1
- %inc = getelementptr <4 x i32>, ptr %A, i34 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v4i32_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v4i32_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <4 x i32> %val, ptr %A, align 1
+ %inc = getelementptr <4 x i32>, ptr %A, i34 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v4f32(ptr %ptr, <4 x float> %val) {
-; CHECK-LABEL: store_v4f32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16, d17}, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <4 x float> %val, ptr %A, align 1
- ret void
+; CHECK-LE-LABEL: store_v4f32:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: ldr r0, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v4f32:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r0, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <4 x float> %val, ptr %A, align 1
+ ret void
}
define void @store_v4f32_update(ptr %ptr, <4 x float> %val) {
-; CHECK-LABEL: store_v4f32_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vst1.8 {d16, d17}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <4 x float> %val, ptr %A, align 1
- %inc = getelementptr <4 x float>, ptr %A, i34 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v4f32_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v4f32_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <4 x float> %val, ptr %A, align 1
+ %inc = getelementptr <4 x float>, ptr %A, i34 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v2i64(ptr %ptr, <2 x i64> %val) {
-; CHECK-LABEL: store_v2i64:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: vst1.8 {d16, d17}, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x i64> %val, ptr %A, align 1
- ret void
+; CHECK-LE-LABEL: store_v2i64:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: ldr r0, [r0]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v2i64:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r0, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <2 x i64> %val, ptr %A, align 1
+ ret void
}
define void @store_v2i64_update(ptr %ptr, <2 x i64> %val) {
-; CHECK-LABEL: store_v2i64_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vst1.8 {d16, d17}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x i64> %val, ptr %A, align 1
- %inc = getelementptr <2 x i64>, ptr %A, i32 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v2i64_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v2i64_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.8 q8, q8
+; CHECK-BE-NEXT: vst1.8 {d16, d17}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <2 x i64> %val, ptr %A, align 1
+ %inc = getelementptr <2 x i64>, ptr %A, i32 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v2i64_update_aligned2(ptr %ptr, <2 x i64> %val) {
-; CHECK-LABEL: store_v2i64_update_aligned2:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vst1.16 {d16, d17}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x i64> %val, ptr %A, align 2
- %inc = getelementptr <2 x i64>, ptr %A, i32 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v2i64_update_aligned2:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vst1.16 {d16, d17}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v2i64_update_aligned2:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.16 q8, q8
+; CHECK-BE-NEXT: vst1.16 {d16, d17}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <2 x i64> %val, ptr %A, align 2
+ %inc = getelementptr <2 x i64>, ptr %A, i32 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v2i64_update_aligned4(ptr %ptr, <2 x i64> %val) {
-; CHECK-LABEL: store_v2i64_update_aligned4:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vst1.32 {d16, d17}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x i64> %val, ptr %A, align 4
- %inc = getelementptr <2 x i64>, ptr %A, i32 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v2i64_update_aligned4:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vst1.32 {d16, d17}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v2i64_update_aligned4:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q8, q8
+; CHECK-BE-NEXT: vst1.32 {d16, d17}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <2 x i64> %val, ptr %A, align 4
+ %inc = getelementptr <2 x i64>, ptr %A, i32 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v2i64_update_aligned8(ptr %ptr, <2 x i64> %val) {
-; CHECK-LABEL: store_v2i64_update_aligned8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vst1.64 {d16, d17}, [r1]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x i64> %val, ptr %A, align 8
- %inc = getelementptr <2 x i64>, ptr %A, i32 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v2i64_update_aligned8:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vst1.64 {d16, d17}, [r1]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v2i64_update_aligned8:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vst1.64 {d16, d17}, [r1]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <2 x i64> %val, ptr %A, align 8
+ %inc = getelementptr <2 x i64>, ptr %A, i32 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @store_v2i64_update_aligned16(ptr %ptr, <2 x i64> %val) {
-; CHECK-LABEL: store_v2i64_update_aligned16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: vst1.64 {d16, d17}, [r1:128]!
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- store <2 x i64> %val, ptr %A, align 16
- %inc = getelementptr <2 x i64>, ptr %A, i32 1
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: store_v2i64_update_aligned16:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: vst1.64 {d16, d17}, [r1:128]!
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_v2i64_update_aligned16:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vst1.64 {d16, d17}, [r1:128]!
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ store <2 x i64> %val, ptr %A, align 16
+ %inc = getelementptr <2 x i64>, ptr %A, i32 1
+ store ptr %inc, ptr %ptr
+ ret void
}
define void @truncstore_v4i32tov4i8(ptr %ptr, <4 x i32> %val) {
-; CHECK-LABEL: truncstore_v4i32tov4i8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: vmovn.i32 d16, q8
-; CHECK-NEXT: vuzp.8 d16, d17
-; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- %trunc = trunc <4 x i32> %val to <4 x i8>
- store <4 x i8> %trunc, ptr %A, align 4
- ret void
+; CHECK-LE-LABEL: truncstore_v4i32tov4i8:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r0, [r0]
+; CHECK-LE-NEXT: vmovn.i32 d16, q8
+; CHECK-LE-NEXT: vuzp.8 d16, d17
+; CHECK-LE-NEXT: vst1.32 {d16[0]}, [r0:32]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: truncstore_v4i32tov4i8:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: vrev64.32 q8, q8
+; CHECK-BE-NEXT: vmovn.i32 d16, q8
+; CHECK-BE-NEXT: vrev16.8 d16, d16
+; CHECK-BE-NEXT: vuzp.8 d16, d17
+; CHECK-BE-NEXT: ldr r0, [r0]
+; CHECK-BE-NEXT: vrev32.8 d16, d17
+; CHECK-BE-NEXT: vst1.32 {d16[0]}, [r0:32]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ %trunc = trunc <4 x i32> %val to <4 x i8>
+ store <4 x i8> %trunc, ptr %A, align 4
+ ret void
}
define void @truncstore_v4i32tov4i8_fake_update(ptr %ptr, <4 x i32> %val) {
-; CHECK-LABEL: truncstore_v4i32tov4i8_fake_update:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d17, [sp]
-; CHECK-NEXT: vmov d16, r2, r3
-; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: movs r2, #16
-; CHECK-NEXT: vmovn.i32 d16, q8
-; CHECK-NEXT: vuzp.8 d16, d17
-; CHECK-NEXT: vst1.32 {d16[0]}, [r1:32], r2
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: bx lr
- %A = load ptr, ptr %ptr
- %trunc = trunc <4 x i32> %val to <4 x i8>
- store <4 x i8> %trunc, ptr %A, align 4
- %inc = getelementptr <4 x i8>, ptr %A, i38 4
- store ptr %inc, ptr %ptr
- ret void
+; CHECK-LE-LABEL: truncstore_v4i32tov4i8_fake_update:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vldr d17, [sp]
+; CHECK-LE-NEXT: vmov d16, r2, r3
+; CHECK-LE-NEXT: ldr r1, [r0]
+; CHECK-LE-NEXT: movs r2, #16
+; CHECK-LE-NEXT: vmovn.i32 d16, q8
+; CHECK-LE-NEXT: vuzp.8 d16, d17
+; CHECK-LE-NEXT: vst1.32 {d16[0]}, [r1:32], r2
+; CHECK-LE-NEXT: str r1, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: truncstore_v4i32tov4i8_fake_update:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d17, [sp]
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: movs r2, #16
+; CHECK-BE-NEXT: vrev64.32 q8, q8
+; CHECK-BE-NEXT: vmovn.i32 d16, q8
+; CHECK-BE-NEXT: vrev16.8 d16, d16
+; CHECK-BE-NEXT: vuzp.8 d16, d17
+; CHECK-BE-NEXT: ldr r1, [r0]
+; CHECK-BE-NEXT: vrev32.8 d16, d17
+; CHECK-BE-NEXT: vst1.32 {d16[0]}, [r1:32], r2
+; CHECK-BE-NEXT: str r1, [r0]
+; CHECK-BE-NEXT: bx lr
+ %A = load ptr, ptr %ptr
+ %trunc = trunc <4 x i32> %val to <4 x i8>
+ store <4 x i8> %trunc, ptr %A, align 4
+ %inc = getelementptr <4 x i8>, ptr %A, i38 4
+ store ptr %inc, ptr %ptr
+ ret void
}
define ptr @test_vst1_1reg(ptr %ptr.in, ptr %ptr.out) {
-; CHECK-LABEL: test_vst1_1reg:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT: movs r0, #32
-; CHECK-NEXT: vst1.32 {d16, d17}, [r1], r0
-; CHECK-NEXT: mov r0, r1
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: test_vst1_1reg:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-LE-NEXT: movs r0, #32
+; CHECK-LE-NEXT: vst1.32 {d16, d17}, [r1], r0
+; CHECK-LE-NEXT: mov r0, r1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: test_vst1_1reg:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-BE-NEXT: movs r0, #32
+; CHECK-BE-NEXT: vrev64.32 q8, q8
+; CHECK-BE-NEXT: vst1.32 {d16, d17}, [r1], r0
+; CHECK-BE-NEXT: mov r0, r1
+; CHECK-BE-NEXT: bx lr
%val = load <4 x i32>, ptr %ptr.in
store <4 x i32> %val, ptr %ptr.out
%next = getelementptr <4 x i32>, ptr %ptr.out, i32 2
@@ -400,37 +613,65 @@ define ptr @test_vst1_1reg(ptr %ptr.in, ptr %ptr.out) {
; PR56970
define void @v3i8store(ptr %p) {
-; CHECK-LABEL: v3i8store:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: vmov.i32 d16, #0xff
-; CHECK-NEXT: mov r1, sp
-; CHECK-NEXT: vmov.i32 d17, #0x0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: vand d16, d17, d16
-; CHECK-NEXT: vst1.32 {d16[0]}, [r1:32]
-; CHECK-NEXT: vld1.32 {d16[0]}, [r1:32]
-; CHECK-NEXT: vmovl.u16 q8, d16
-; CHECK-NEXT: strb r2, [r0, #2]
-; CHECK-NEXT: vmov.32 r1, d16[0]
-; CHECK-NEXT: strh r1, [r0]
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: v3i8store:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: mov r2, sp
+; CHECK-LE-NEXT: str r1, [sp]
+; CHECK-LE-NEXT: vld1.32 {d16[0]}, [r2:32]
+; CHECK-LE-NEXT: strb r1, [r0, #2]
+; CHECK-LE-NEXT: vmovl.u16 q8, d16
+; CHECK-LE-NEXT: vmov.32 r2, d16[0]
+; CHECK-LE-NEXT: strh r2, [r0]
+; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: v3i8store:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: mov r2, sp
+; CHECK-BE-NEXT: str r1, [sp]
+; CHECK-BE-NEXT: vld1.32 {d16[0]}, [r2:32]
+; CHECK-BE-NEXT: strb r1, [r0, #2]
+; CHECK-BE-NEXT: vrev32.16 d16, d16
+; CHECK-BE-NEXT: vmovl.u16 q8, d16
+; CHECK-BE-NEXT: vmov.32 r2, d16[0]
+; CHECK-BE-NEXT: strh r2, [r0]
+; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: bx lr
store <3 x i8> zeroinitializer, ptr %p, align 4
ret void
}
define void @v3i64shuffle(ptr %p, <3 x i64> %a) {
-; CHECK-LABEL: v3i64shuffle:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov.i32 q8, #0x0
-; CHECK-NEXT: ldrd r12, r1, [sp, #8]
-; CHECK-NEXT: vmov d18, r2, r3
-; CHECK-NEXT: vorr d19, d16, d16
-; CHECK-NEXT: str r1, [r0, #20]
-; CHECK-NEXT: vst1.32 {d18, d19}, [r0]!
-; CHECK-NEXT: str.w r12, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: v3i64shuffle:
+; CHECK-LE: @ %bb.0:
+; CHECK-LE-NEXT: vmov.i32 q8, #0x0
+; CHECK-LE-NEXT: ldrd r12, r1, [sp, #8]
+; CHECK-LE-NEXT: vmov d18, r2, r3
+; CHECK-LE-NEXT: vorr d19, d16, d16
+; CHECK-LE-NEXT: str r1, [r0, #20]
+; CHECK-LE-NEXT: vst1.32 {d18, d19}, [r0]!
+; CHECK-LE-NEXT: str.w r12, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: v3i64shuffle:
+; CHECK-BE: @ %bb.0:
+; CHECK-BE-NEXT: vldr d16, [sp, #8]
+; CHECK-BE-NEXT: vmov.i32 q9, #0x0
+; CHECK-BE-NEXT: vrev64.32 q8, q8
+; CHECK-BE-NEXT: vmov r12, r1, d16
+; CHECK-BE-NEXT: vmov d16, r3, r2
+; CHECK-BE-NEXT: vorr d17, d18, d18
+; CHECK-BE-NEXT: vrev64.32 q8, q8
+; CHECK-BE-NEXT: str r1, [r0, #20]
+; CHECK-BE-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-BE-NEXT: str.w r12, [r0]
+; CHECK-BE-NEXT: bx lr
%b = shufflevector <3 x i64> %a, <3 x i64> zeroinitializer, <3 x i32> <i32 0, i32 3, i32 2>
store <3 x i64> %b, ptr %p, align 4
ret void
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll
new file mode 100644
index 0000000..04fa62b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-half.ll
@@ -0,0 +1,1066 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -stop-after=irtranslator -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV32I %s
+; RUN: llc -mtriple=riscv32 -mattr=+f -global-isel -stop-after=irtranslator -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV32IF %s
+; RUN: llc -mtriple=riscv32 -mattr=+zfh -global-isel -stop-after=irtranslator -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV32IZFH %s
+; RUN: llc -mtriple=riscv64 -global-isel -stop-after=irtranslator -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV64I %s
+; RUN: llc -mtriple=riscv64 -mattr=+f -global-isel -stop-after=irtranslator -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV64IF %s
+; RUN: llc -mtriple=riscv64 -mattr=+zfh -global-isel -stop-after=irtranslator -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV64IZFH %s
+
+define half @callee_half_in_regs(half %x) nounwind {
+ ; RV32I-LABEL: name: callee_half_in_regs
+ ; RV32I: bb.1 (%ir-block.0):
+ ; RV32I-NEXT: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32IF-LABEL: name: callee_half_in_regs
+ ; RV32IF: bb.1 (%ir-block.0):
+ ; RV32IF-NEXT: liveins: $f10_f
+ ; RV32IF-NEXT: {{ $}}
+ ; RV32IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV32IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV32IZFH-LABEL: name: callee_half_in_regs
+ ; RV32IZFH: bb.1 (%ir-block.0):
+ ; RV32IZFH-NEXT: liveins: $f10_h
+ ; RV32IZFH-NEXT: {{ $}}
+ ; RV32IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY]](s16)
+ ; RV32IZFH-NEXT: PseudoRET implicit $f10_h
+ ;
+ ; RV64I-LABEL: name: callee_half_in_regs
+ ; RV64I: bb.1 (%ir-block.0):
+ ; RV64I-NEXT: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64IF-LABEL: name: callee_half_in_regs
+ ; RV64IF: bb.1 (%ir-block.0):
+ ; RV64IF-NEXT: liveins: $f10_f
+ ; RV64IF-NEXT: {{ $}}
+ ; RV64IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV64IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV64IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV64IZFH-LABEL: name: callee_half_in_regs
+ ; RV64IZFH: bb.1 (%ir-block.0):
+ ; RV64IZFH-NEXT: liveins: $f10_h
+ ; RV64IZFH-NEXT: {{ $}}
+ ; RV64IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY]](s16)
+ ; RV64IZFH-NEXT: PseudoRET implicit $f10_h
+ ret half %x
+}
+
+define half @caller_half_in_regs(half %x) nounwind {
+ ; RV32I-LABEL: name: caller_half_in_regs
+ ; RV32I: bb.1 (%ir-block.0):
+ ; RV32I-NEXT: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT]](s32)
+ ; RV32I-NEXT: PseudoCALL target-flags(riscv-call) @caller_half_in_regs, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit-def $x10
+ ; RV32I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32I-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT1]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32IF-LABEL: name: caller_half_in_regs
+ ; RV32IF: bb.1 (%ir-block.0):
+ ; RV32IF-NEXT: liveins: $f10_f
+ ; RV32IF-NEXT: {{ $}}
+ ; RV32IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32IF-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV32IF-NEXT: PseudoCALL target-flags(riscv-call) @caller_half_in_regs, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_f, implicit-def $f10_f
+ ; RV32IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32IF-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT1]](s32)
+ ; RV32IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV32IZFH-LABEL: name: caller_half_in_regs
+ ; RV32IZFH: bb.1 (%ir-block.0):
+ ; RV32IZFH-NEXT: liveins: $f10_h
+ ; RV32IZFH-NEXT: {{ $}}
+ ; RV32IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY]](s16)
+ ; RV32IZFH-NEXT: PseudoCALL target-flags(riscv-call) @caller_half_in_regs, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_h, implicit-def $f10_h
+ ; RV32IZFH-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY1]](s16)
+ ; RV32IZFH-NEXT: PseudoRET implicit $f10_h
+ ;
+ ; RV64I-LABEL: name: caller_half_in_regs
+ ; RV64I: bb.1 (%ir-block.0):
+ ; RV64I-NEXT: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; RV64I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64I-NEXT: PseudoCALL target-flags(riscv-call) @caller_half_in_regs, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit-def $x10
+ ; RV64I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; RV64I-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT1]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64IF-LABEL: name: caller_half_in_regs
+ ; RV64IF: bb.1 (%ir-block.0):
+ ; RV64IF-NEXT: liveins: $f10_f
+ ; RV64IF-NEXT: {{ $}}
+ ; RV64IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV64IF-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV64IF-NEXT: PseudoCALL target-flags(riscv-call) @caller_half_in_regs, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_f, implicit-def $f10_f
+ ; RV64IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV64IF-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT1]](s32)
+ ; RV64IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV64IZFH-LABEL: name: caller_half_in_regs
+ ; RV64IZFH: bb.1 (%ir-block.0):
+ ; RV64IZFH-NEXT: liveins: $f10_h
+ ; RV64IZFH-NEXT: {{ $}}
+ ; RV64IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY]](s16)
+ ; RV64IZFH-NEXT: PseudoCALL target-flags(riscv-call) @caller_half_in_regs, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_h, implicit-def $f10_h
+ ; RV64IZFH-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY1]](s16)
+ ; RV64IZFH-NEXT: PseudoRET implicit $f10_h
+ %y = call half @caller_half_in_regs(half %x)
+ ret half %y
+}
+
+define half @callee_half_mixed_with_int(i32 %x0, half %x) nounwind {
+ ; RV32I-LABEL: name: callee_half_mixed_with_int
+ ; RV32I: bb.1 (%ir-block.0):
+ ; RV32I-NEXT: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32IF-LABEL: name: callee_half_mixed_with_int
+ ; RV32IF: bb.1 (%ir-block.0):
+ ; RV32IF-NEXT: liveins: $x10, $f10_f
+ ; RV32IF-NEXT: {{ $}}
+ ; RV32IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV32IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV32IZFH-LABEL: name: callee_half_mixed_with_int
+ ; RV32IZFH: bb.1 (%ir-block.0):
+ ; RV32IZFH-NEXT: liveins: $x10, $f10_h
+ ; RV32IZFH-NEXT: {{ $}}
+ ; RV32IZFH-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY1]](s16)
+ ; RV32IZFH-NEXT: PseudoRET implicit $f10_h
+ ;
+ ; RV64I-LABEL: name: callee_half_mixed_with_int
+ ; RV64I: bb.1 (%ir-block.0):
+ ; RV64I-NEXT: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64IF-LABEL: name: callee_half_mixed_with_int
+ ; RV64IF: bb.1 (%ir-block.0):
+ ; RV64IF-NEXT: liveins: $x10, $f10_f
+ ; RV64IF-NEXT: {{ $}}
+ ; RV64IF-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IF-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV64IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV64IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV64IZFH-LABEL: name: callee_half_mixed_with_int
+ ; RV64IZFH: bb.1 (%ir-block.0):
+ ; RV64IZFH-NEXT: liveins: $x10, $f10_h
+ ; RV64IZFH-NEXT: {{ $}}
+ ; RV64IZFH-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IZFH-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY1]](s16)
+ ; RV64IZFH-NEXT: PseudoRET implicit $f10_h
+ ret half %x
+}
+
+define half @caller_half_mixed_with_int(half %x, i32 %x0) nounwind {
+ ; RV32I-LABEL: name: caller_half_mixed_with_int
+ ; RV32I: bb.1 (%ir-block.0):
+ ; RV32I-NEXT: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[COPY1]](s32)
+ ; RV32I-NEXT: $x11 = COPY [[ANYEXT]](s32)
+ ; RV32I-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_mixed_with_int, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; RV32I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV32I-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT1]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32IF-LABEL: name: caller_half_mixed_with_int
+ ; RV32IF: bb.1 (%ir-block.0):
+ ; RV32IF-NEXT: liveins: $x10, $f10_f
+ ; RV32IF-NEXT: {{ $}}
+ ; RV32IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IF-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: $x10 = COPY [[COPY1]](s32)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV32IF-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_mixed_with_int, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10, implicit $f10_f, implicit-def $f10_f
+ ; RV32IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV32IF-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT1]](s32)
+ ; RV32IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV32IZFH-LABEL: name: caller_half_mixed_with_int
+ ; RV32IZFH: bb.1 (%ir-block.0):
+ ; RV32IZFH-NEXT: liveins: $x10, $f10_h
+ ; RV32IZFH-NEXT: {{ $}}
+ ; RV32IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IZFH-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IZFH-NEXT: $x10 = COPY [[COPY1]](s32)
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY]](s16)
+ ; RV32IZFH-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_mixed_with_int, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10, implicit $f10_h, implicit-def $f10_h
+ ; RV32IZFH-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY2]](s16)
+ ; RV32IZFH-NEXT: PseudoRET implicit $f10_h
+ ;
+ ; RV64I-LABEL: name: caller_half_mixed_with_int
+ ; RV64I: bb.1 (%ir-block.0):
+ ; RV64I-NEXT: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; RV64I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC1]](s32)
+ ; RV64I-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64I-NEXT: $x11 = COPY [[ANYEXT1]](s64)
+ ; RV64I-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_mixed_with_int, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; RV64I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s64)
+ ; RV64I-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT2]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64IF-LABEL: name: caller_half_mixed_with_int
+ ; RV64IF: bb.1 (%ir-block.0):
+ ; RV64IF-NEXT: liveins: $x10, $f10_f
+ ; RV64IF-NEXT: {{ $}}
+ ; RV64IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV64IF-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; RV64IF-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC1]](s32)
+ ; RV64IF-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64IF-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT1]](s32)
+ ; RV64IF-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_mixed_with_int, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10, implicit $f10_f, implicit-def $f10_f
+ ; RV64IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV64IF-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT2]](s32)
+ ; RV64IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV64IZFH-LABEL: name: caller_half_mixed_with_int
+ ; RV64IZFH: bb.1 (%ir-block.0):
+ ; RV64IZFH-NEXT: liveins: $x10, $f10_h
+ ; RV64IZFH-NEXT: {{ $}}
+ ; RV64IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IZFH-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; RV64IZFH-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IZFH-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s32)
+ ; RV64IZFH-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY]](s16)
+ ; RV64IZFH-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_mixed_with_int, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10, implicit $f10_h, implicit-def $f10_h
+ ; RV64IZFH-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY2]](s16)
+ ; RV64IZFH-NEXT: PseudoRET implicit $f10_h
+ %y = call half @callee_half_mixed_with_int(i32 %x0, half %x)
+ ret half %y
+}
+
+define half @callee_half_return_stack1(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, half %x) nounwind {
+ ; RV32I-LABEL: name: callee_half_return_stack1
+ ; RV32I: bb.1 (%ir-block.0):
+ ; RV32I-NEXT: liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; RV32I-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; RV32I-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $x14
+ ; RV32I-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $x15
+ ; RV32I-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $x16
+ ; RV32I-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $x17
+ ; RV32I-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %fixed-stack.0, align 16)
+ ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32IF-LABEL: name: callee_half_return_stack1
+ ; RV32IF: bb.1 (%ir-block.0):
+ ; RV32IF-NEXT: liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $f10_f
+ ; RV32IF-NEXT: {{ $}}
+ ; RV32IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; RV32IF-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; RV32IF-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $x14
+ ; RV32IF-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $x15
+ ; RV32IF-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $x16
+ ; RV32IF-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $x17
+ ; RV32IF-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY8]](s32)
+ ; RV32IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV32IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV32IZFH-LABEL: name: callee_half_return_stack1
+ ; RV32IZFH: bb.1 (%ir-block.0):
+ ; RV32IZFH-NEXT: liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $f10_h
+ ; RV32IZFH-NEXT: {{ $}}
+ ; RV32IZFH-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; RV32IZFH-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; RV32IZFH-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $x14
+ ; RV32IZFH-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $x15
+ ; RV32IZFH-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $x16
+ ; RV32IZFH-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $x17
+ ; RV32IZFH-NEXT: [[COPY8:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY8]](s16)
+ ; RV32IZFH-NEXT: PseudoRET implicit $f10_h
+ ;
+ ; RV64I-LABEL: name: callee_half_return_stack1
+ ; RV64I: bb.1 (%ir-block.0):
+ ; RV64I-NEXT: liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x12
+ ; RV64I-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; RV64I-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x13
+ ; RV64I-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY3]](s64)
+ ; RV64I-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x14
+ ; RV64I-NEXT: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[COPY4]](s64)
+ ; RV64I-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $x15
+ ; RV64I-NEXT: [[TRUNC5:%[0-9]+]]:_(s32) = G_TRUNC [[COPY5]](s64)
+ ; RV64I-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $x16
+ ; RV64I-NEXT: [[TRUNC6:%[0-9]+]]:_(s32) = G_TRUNC [[COPY6]](s64)
+ ; RV64I-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $x17
+ ; RV64I-NEXT: [[TRUNC7:%[0-9]+]]:_(s32) = G_TRUNC [[COPY7]](s64)
+ ; RV64I-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s64) from %fixed-stack.0, align 16)
+ ; RV64I-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s64)
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC8]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64IF-LABEL: name: callee_half_return_stack1
+ ; RV64IF: bb.1 (%ir-block.0):
+ ; RV64IF-NEXT: liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $f10_f
+ ; RV64IF-NEXT: {{ $}}
+ ; RV64IF-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IF-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64IF-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; RV64IF-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x12
+ ; RV64IF-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; RV64IF-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x13
+ ; RV64IF-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY3]](s64)
+ ; RV64IF-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x14
+ ; RV64IF-NEXT: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[COPY4]](s64)
+ ; RV64IF-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $x15
+ ; RV64IF-NEXT: [[TRUNC5:%[0-9]+]]:_(s32) = G_TRUNC [[COPY5]](s64)
+ ; RV64IF-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $x16
+ ; RV64IF-NEXT: [[TRUNC6:%[0-9]+]]:_(s32) = G_TRUNC [[COPY6]](s64)
+ ; RV64IF-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $x17
+ ; RV64IF-NEXT: [[TRUNC7:%[0-9]+]]:_(s32) = G_TRUNC [[COPY7]](s64)
+ ; RV64IF-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY8]](s32)
+ ; RV64IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC8]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV64IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV64IZFH-LABEL: name: callee_half_return_stack1
+ ; RV64IZFH: bb.1 (%ir-block.0):
+ ; RV64IZFH-NEXT: liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $f10_h
+ ; RV64IZFH-NEXT: {{ $}}
+ ; RV64IZFH-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IZFH-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64IZFH-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; RV64IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x12
+ ; RV64IZFH-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; RV64IZFH-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x13
+ ; RV64IZFH-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY3]](s64)
+ ; RV64IZFH-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x14
+ ; RV64IZFH-NEXT: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[COPY4]](s64)
+ ; RV64IZFH-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $x15
+ ; RV64IZFH-NEXT: [[TRUNC5:%[0-9]+]]:_(s32) = G_TRUNC [[COPY5]](s64)
+ ; RV64IZFH-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $x16
+ ; RV64IZFH-NEXT: [[TRUNC6:%[0-9]+]]:_(s32) = G_TRUNC [[COPY6]](s64)
+ ; RV64IZFH-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $x17
+ ; RV64IZFH-NEXT: [[TRUNC7:%[0-9]+]]:_(s32) = G_TRUNC [[COPY7]](s64)
+ ; RV64IZFH-NEXT: [[COPY8:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY8]](s16)
+ ; RV64IZFH-NEXT: PseudoRET implicit $f10_h
+ ret half %x
+}
+
+define half @caller_half_return_stack1(i32 %v1, half %x) nounwind {
+ ; RV32I-LABEL: name: caller_half_return_stack1
+ ; RV32I: bb.1 (%ir-block.0):
+ ; RV32I-NEXT: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32I-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32I-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV32I-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; RV32I-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+ ; RV32I-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+ ; RV32I-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; RV32I-NEXT: ADJCALLSTACKDOWN 4, 0, implicit-def $x2, implicit $x2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
+ ; RV32I-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32I-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY2]], [[C7]](s32)
+ ; RV32I-NEXT: G_STORE [[ANYEXT]](s32), [[PTR_ADD]](p0) :: (store (s32) into stack, align 16)
+ ; RV32I-NEXT: $x10 = COPY [[C]](s32)
+ ; RV32I-NEXT: $x11 = COPY [[C1]](s32)
+ ; RV32I-NEXT: $x12 = COPY [[C2]](s32)
+ ; RV32I-NEXT: $x13 = COPY [[COPY]](s32)
+ ; RV32I-NEXT: $x14 = COPY [[C3]](s32)
+ ; RV32I-NEXT: $x15 = COPY [[C4]](s32)
+ ; RV32I-NEXT: $x16 = COPY [[C5]](s32)
+ ; RV32I-NEXT: $x17 = COPY [[C6]](s32)
+ ; RV32I-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack1, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit-def $x10
+ ; RV32I-NEXT: ADJCALLSTACKUP 4, 0, implicit-def $x2, implicit $x2
+ ; RV32I-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; RV32I-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT1]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32IF-LABEL: name: caller_half_return_stack1
+ ; RV32IF: bb.1 (%ir-block.0):
+ ; RV32IF-NEXT: liveins: $x10, $f10_f
+ ; RV32IF-NEXT: {{ $}}
+ ; RV32IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32IF-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32IF-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32IF-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV32IF-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; RV32IF-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+ ; RV32IF-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+ ; RV32IF-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; RV32IF-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: $x10 = COPY [[C]](s32)
+ ; RV32IF-NEXT: $x11 = COPY [[C1]](s32)
+ ; RV32IF-NEXT: $x12 = COPY [[C2]](s32)
+ ; RV32IF-NEXT: $x13 = COPY [[COPY]](s32)
+ ; RV32IF-NEXT: $x14 = COPY [[C3]](s32)
+ ; RV32IF-NEXT: $x15 = COPY [[C4]](s32)
+ ; RV32IF-NEXT: $x16 = COPY [[C5]](s32)
+ ; RV32IF-NEXT: $x17 = COPY [[C6]](s32)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV32IF-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack1, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $f10_f, implicit-def $f10_f
+ ; RV32IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV32IF-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT1]](s32)
+ ; RV32IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV32IZFH-LABEL: name: caller_half_return_stack1
+ ; RV32IZFH: bb.1 (%ir-block.0):
+ ; RV32IZFH-NEXT: liveins: $x10, $f10_h
+ ; RV32IZFH-NEXT: {{ $}}
+ ; RV32IZFH-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32IZFH-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32IZFH-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV32IZFH-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; RV32IZFH-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+ ; RV32IZFH-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+ ; RV32IZFH-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; RV32IZFH-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IZFH-NEXT: $x10 = COPY [[C]](s32)
+ ; RV32IZFH-NEXT: $x11 = COPY [[C1]](s32)
+ ; RV32IZFH-NEXT: $x12 = COPY [[C2]](s32)
+ ; RV32IZFH-NEXT: $x13 = COPY [[COPY]](s32)
+ ; RV32IZFH-NEXT: $x14 = COPY [[C3]](s32)
+ ; RV32IZFH-NEXT: $x15 = COPY [[C4]](s32)
+ ; RV32IZFH-NEXT: $x16 = COPY [[C5]](s32)
+ ; RV32IZFH-NEXT: $x17 = COPY [[C6]](s32)
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY1]](s16)
+ ; RV32IZFH-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack1, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $f10_h, implicit-def $f10_h
+ ; RV32IZFH-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY2]](s16)
+ ; RV32IZFH-NEXT: PseudoRET implicit $f10_h
+ ;
+ ; RV64I-LABEL: name: caller_half_return_stack1
+ ; RV64I: bb.1 (%ir-block.0):
+ ; RV64I-NEXT: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+ ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+ ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; RV64I-NEXT: ADJCALLSTACKDOWN 8, 0, implicit-def $x2, implicit $x2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64I-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64I-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64I-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s32)
+ ; RV64I-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64I-NEXT: [[ANYEXT5:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s32)
+ ; RV64I-NEXT: [[ANYEXT6:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32)
+ ; RV64I-NEXT: [[ANYEXT7:%[0-9]+]]:_(s64) = G_ANYEXT [[C6]](s32)
+ ; RV64I-NEXT: [[ANYEXT8:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
+ ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64I-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY2]], [[C7]](s64)
+ ; RV64I-NEXT: G_STORE [[ANYEXT8]](s64), [[PTR_ADD]](p0) :: (store (s64) into stack, align 16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64I-NEXT: $x11 = COPY [[ANYEXT1]](s64)
+ ; RV64I-NEXT: $x12 = COPY [[ANYEXT2]](s64)
+ ; RV64I-NEXT: $x13 = COPY [[ANYEXT3]](s64)
+ ; RV64I-NEXT: $x14 = COPY [[ANYEXT4]](s64)
+ ; RV64I-NEXT: $x15 = COPY [[ANYEXT5]](s64)
+ ; RV64I-NEXT: $x16 = COPY [[ANYEXT6]](s64)
+ ; RV64I-NEXT: $x17 = COPY [[ANYEXT7]](s64)
+ ; RV64I-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack1, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit-def $x10
+ ; RV64I-NEXT: ADJCALLSTACKUP 8, 0, implicit-def $x2, implicit $x2
+ ; RV64I-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s64)
+ ; RV64I-NEXT: [[ANYEXT9:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT9]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64IF-LABEL: name: caller_half_return_stack1
+ ; RV64IF: bb.1 (%ir-block.0):
+ ; RV64IF-NEXT: liveins: $x10, $f10_f
+ ; RV64IF-NEXT: {{ $}}
+ ; RV64IF-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IF-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV64IF-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64IF-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64IF-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64IF-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; RV64IF-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+ ; RV64IF-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+ ; RV64IF-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; RV64IF-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64IF-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64IF-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64IF-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s32)
+ ; RV64IF-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64IF-NEXT: [[ANYEXT5:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s32)
+ ; RV64IF-NEXT: [[ANYEXT6:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32)
+ ; RV64IF-NEXT: [[ANYEXT7:%[0-9]+]]:_(s64) = G_ANYEXT [[C6]](s32)
+ ; RV64IF-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64IF-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64IF-NEXT: $x11 = COPY [[ANYEXT1]](s64)
+ ; RV64IF-NEXT: $x12 = COPY [[ANYEXT2]](s64)
+ ; RV64IF-NEXT: $x13 = COPY [[ANYEXT3]](s64)
+ ; RV64IF-NEXT: $x14 = COPY [[ANYEXT4]](s64)
+ ; RV64IF-NEXT: $x15 = COPY [[ANYEXT5]](s64)
+ ; RV64IF-NEXT: $x16 = COPY [[ANYEXT6]](s64)
+ ; RV64IF-NEXT: $x17 = COPY [[ANYEXT7]](s64)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT8]](s32)
+ ; RV64IF-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack1, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $f10_f, implicit-def $f10_f
+ ; RV64IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV64IF-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT9]](s32)
+ ; RV64IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV64IZFH-LABEL: name: caller_half_return_stack1
+ ; RV64IZFH: bb.1 (%ir-block.0):
+ ; RV64IZFH-NEXT: liveins: $x10, $f10_h
+ ; RV64IZFH-NEXT: {{ $}}
+ ; RV64IZFH-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IZFH-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64IZFH-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64IZFH-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64IZFH-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; RV64IZFH-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+ ; RV64IZFH-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+ ; RV64IZFH-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; RV64IZFH-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IZFH-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64IZFH-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64IZFH-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64IZFH-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s32)
+ ; RV64IZFH-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64IZFH-NEXT: [[ANYEXT5:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s32)
+ ; RV64IZFH-NEXT: [[ANYEXT6:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32)
+ ; RV64IZFH-NEXT: [[ANYEXT7:%[0-9]+]]:_(s64) = G_ANYEXT [[C6]](s32)
+ ; RV64IZFH-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64IZFH-NEXT: $x11 = COPY [[ANYEXT1]](s64)
+ ; RV64IZFH-NEXT: $x12 = COPY [[ANYEXT2]](s64)
+ ; RV64IZFH-NEXT: $x13 = COPY [[ANYEXT3]](s64)
+ ; RV64IZFH-NEXT: $x14 = COPY [[ANYEXT4]](s64)
+ ; RV64IZFH-NEXT: $x15 = COPY [[ANYEXT5]](s64)
+ ; RV64IZFH-NEXT: $x16 = COPY [[ANYEXT6]](s64)
+ ; RV64IZFH-NEXT: $x17 = COPY [[ANYEXT7]](s64)
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY1]](s16)
+ ; RV64IZFH-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack1, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $f10_h, implicit-def $f10_h
+ ; RV64IZFH-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY2]](s16)
+ ; RV64IZFH-NEXT: PseudoRET implicit $f10_h
+ %y = call half @callee_half_return_stack1(i32 0, i32 1, i32 2, i32 %v1, i32 5, i32 6, i32 7, i32 8, half %x)
+ ret half %y
+}
+
+define half @callee_half_return_stack2(half %v1, half %v2, half %v3, half %v4, half %v5, half %v6, half %v7, half %v8, half %x) nounwind {
+ ; RV32I-LABEL: name: callee_half_return_stack2
+ ; RV32I: bb.1 (%ir-block.0):
+ ; RV32I-NEXT: liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; RV32I-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV32I-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; RV32I-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; RV32I-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $x14
+ ; RV32I-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32)
+ ; RV32I-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $x15
+ ; RV32I-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32)
+ ; RV32I-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $x16
+ ; RV32I-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY6]](s32)
+ ; RV32I-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $x17
+ ; RV32I-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[COPY7]](s32)
+ ; RV32I-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %fixed-stack.0, align 16)
+ ; RV32I-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC8]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32IF-LABEL: name: callee_half_return_stack2
+ ; RV32IF: bb.1 (%ir-block.0):
+ ; RV32IF-NEXT: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f, $f14_f, $f15_f, $f16_f, $f17_f
+ ; RV32IF-NEXT: {{ $}}
+ ; RV32IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f11_f
+ ; RV32IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f12_f
+ ; RV32IF-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV32IF-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $f13_f
+ ; RV32IF-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; RV32IF-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $f14_f
+ ; RV32IF-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32)
+ ; RV32IF-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $f15_f
+ ; RV32IF-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32)
+ ; RV32IF-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $f16_f
+ ; RV32IF-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY6]](s32)
+ ; RV32IF-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $f17_f
+ ; RV32IF-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[COPY7]](s32)
+ ; RV32IF-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IF-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY8]](s32)
+ ; RV32IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC8]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV32IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV32IZFH-LABEL: name: callee_half_return_stack2
+ ; RV32IZFH: bb.1 (%ir-block.0):
+ ; RV32IZFH-NEXT: liveins: $x10, $f10_h, $f11_h, $f12_h, $f13_h, $f14_h, $f15_h, $f16_h, $f17_h
+ ; RV32IZFH-NEXT: {{ $}}
+ ; RV32IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f11_h
+ ; RV32IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY $f12_h
+ ; RV32IZFH-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY $f13_h
+ ; RV32IZFH-NEXT: [[COPY4:%[0-9]+]]:_(s16) = COPY $f14_h
+ ; RV32IZFH-NEXT: [[COPY5:%[0-9]+]]:_(s16) = COPY $f15_h
+ ; RV32IZFH-NEXT: [[COPY6:%[0-9]+]]:_(s16) = COPY $f16_h
+ ; RV32IZFH-NEXT: [[COPY7:%[0-9]+]]:_(s16) = COPY $f17_h
+ ; RV32IZFH-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32IZFH-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY8]](s32)
+ ; RV32IZFH-NEXT: $f10_h = COPY [[TRUNC]](s16)
+ ; RV32IZFH-NEXT: PseudoRET implicit $f10_h
+ ;
+ ; RV64I-LABEL: name: callee_half_return_stack2
+ ; RV64I: bb.1 (%ir-block.0):
+ ; RV64I-NEXT: liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x12
+ ; RV64I-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s64)
+ ; RV64I-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x13
+ ; RV64I-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s64)
+ ; RV64I-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x14
+ ; RV64I-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s64)
+ ; RV64I-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $x15
+ ; RV64I-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s64)
+ ; RV64I-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $x16
+ ; RV64I-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY6]](s64)
+ ; RV64I-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $x17
+ ; RV64I-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[COPY7]](s64)
+ ; RV64I-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s64) from %fixed-stack.0, align 16)
+ ; RV64I-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s64)
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC8]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64IF-LABEL: name: callee_half_return_stack2
+ ; RV64IF: bb.1 (%ir-block.0):
+ ; RV64IF-NEXT: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f, $f14_f, $f15_f, $f16_f, $f17_f
+ ; RV64IF-NEXT: {{ $}}
+ ; RV64IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV64IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f11_f
+ ; RV64IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV64IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f12_f
+ ; RV64IF-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV64IF-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $f13_f
+ ; RV64IF-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; RV64IF-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $f14_f
+ ; RV64IF-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32)
+ ; RV64IF-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $f15_f
+ ; RV64IF-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32)
+ ; RV64IF-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $f16_f
+ ; RV64IF-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY6]](s32)
+ ; RV64IF-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $f17_f
+ ; RV64IF-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[COPY7]](s32)
+ ; RV64IF-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IF-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY8]](s64)
+ ; RV64IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC8]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV64IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV64IZFH-LABEL: name: callee_half_return_stack2
+ ; RV64IZFH: bb.1 (%ir-block.0):
+ ; RV64IZFH-NEXT: liveins: $x10, $f10_h, $f11_h, $f12_h, $f13_h, $f14_h, $f15_h, $f16_h, $f17_h
+ ; RV64IZFH-NEXT: {{ $}}
+ ; RV64IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f11_h
+ ; RV64IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY $f12_h
+ ; RV64IZFH-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY $f13_h
+ ; RV64IZFH-NEXT: [[COPY4:%[0-9]+]]:_(s16) = COPY $f14_h
+ ; RV64IZFH-NEXT: [[COPY5:%[0-9]+]]:_(s16) = COPY $f15_h
+ ; RV64IZFH-NEXT: [[COPY6:%[0-9]+]]:_(s16) = COPY $f16_h
+ ; RV64IZFH-NEXT: [[COPY7:%[0-9]+]]:_(s16) = COPY $f17_h
+ ; RV64IZFH-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64IZFH-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY8]](s64)
+ ; RV64IZFH-NEXT: $f10_h = COPY [[TRUNC]](s16)
+ ; RV64IZFH-NEXT: PseudoRET implicit $f10_h
+ ret half %x
+}
+
+define half @caller_half_return_stack2(half %x, half %y) nounwind {
+ ; RV32I-LABEL: name: caller_half_return_stack2
+ ; RV32I: bb.1 (%ir-block.0):
+ ; RV32I-NEXT: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+ ; RV32I-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4200
+ ; RV32I-NEXT: ADJCALLSTACKDOWN 4, 0, implicit-def $x2, implicit $x2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16)
+ ; RV32I-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[C1]](s16)
+ ; RV32I-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32I-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32I-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32I-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
+ ; RV32I-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32I-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY2]], [[C2]](s32)
+ ; RV32I-NEXT: G_STORE [[ANYEXT8]](s32), [[PTR_ADD]](p0) :: (store (s32) into stack, align 16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT]](s32)
+ ; RV32I-NEXT: $x11 = COPY [[ANYEXT1]](s32)
+ ; RV32I-NEXT: $x12 = COPY [[ANYEXT2]](s32)
+ ; RV32I-NEXT: $x13 = COPY [[ANYEXT3]](s32)
+ ; RV32I-NEXT: $x14 = COPY [[ANYEXT4]](s32)
+ ; RV32I-NEXT: $x15 = COPY [[ANYEXT5]](s32)
+ ; RV32I-NEXT: $x16 = COPY [[ANYEXT6]](s32)
+ ; RV32I-NEXT: $x17 = COPY [[ANYEXT7]](s32)
+ ; RV32I-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack2, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit-def $x10
+ ; RV32I-NEXT: ADJCALLSTACKUP 4, 0, implicit-def $x2, implicit $x2
+ ; RV32I-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; RV32I-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16)
+ ; RV32I-NEXT: $x10 = COPY [[ANYEXT9]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32IF-LABEL: name: caller_half_return_stack2
+ ; RV32IF: bb.1 (%ir-block.0):
+ ; RV32IF-NEXT: liveins: $f10_f, $f11_f
+ ; RV32IF-NEXT: {{ $}}
+ ; RV32IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV32IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f11_f
+ ; RV32IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV32IF-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+ ; RV32IF-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4200
+ ; RV32IF-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16)
+ ; RV32IF-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[C1]](s16)
+ ; RV32IF-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32IF-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32IF-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV32IF-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV32IF-NEXT: $f11_f = COPY [[ANYEXT1]](s32)
+ ; RV32IF-NEXT: $f12_f = COPY [[ANYEXT2]](s32)
+ ; RV32IF-NEXT: $f13_f = COPY [[ANYEXT3]](s32)
+ ; RV32IF-NEXT: $f14_f = COPY [[ANYEXT4]](s32)
+ ; RV32IF-NEXT: $f15_f = COPY [[ANYEXT5]](s32)
+ ; RV32IF-NEXT: $f16_f = COPY [[ANYEXT6]](s32)
+ ; RV32IF-NEXT: $f17_f = COPY [[ANYEXT7]](s32)
+ ; RV32IF-NEXT: $x10 = COPY [[ANYEXT8]](s32)
+ ; RV32IF-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack2, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_f, implicit $f11_f, implicit $f12_f, implicit $f13_f, implicit $f14_f, implicit $f15_f, implicit $f16_f, implicit $f17_f, implicit $x10, implicit-def $f10_f
+ ; RV32IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV32IF-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV32IF-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16)
+ ; RV32IF-NEXT: $f10_f = COPY [[ANYEXT9]](s32)
+ ; RV32IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV32IZFH-LABEL: name: caller_half_return_stack2
+ ; RV32IZFH: bb.1 (%ir-block.0):
+ ; RV32IZFH-NEXT: liveins: $f10_h, $f11_h
+ ; RV32IZFH-NEXT: {{ $}}
+ ; RV32IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f11_h
+ ; RV32IZFH-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+ ; RV32IZFH-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4200
+ ; RV32IZFH-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY]](s16)
+ ; RV32IZFH-NEXT: $f11_h = COPY [[C]](s16)
+ ; RV32IZFH-NEXT: $f12_h = COPY [[COPY]](s16)
+ ; RV32IZFH-NEXT: $f13_h = COPY [[C1]](s16)
+ ; RV32IZFH-NEXT: $f14_h = COPY [[COPY]](s16)
+ ; RV32IZFH-NEXT: $f15_h = COPY [[COPY1]](s16)
+ ; RV32IZFH-NEXT: $f16_h = COPY [[COPY1]](s16)
+ ; RV32IZFH-NEXT: $f17_h = COPY [[COPY1]](s16)
+ ; RV32IZFH-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16)
+ ; RV32IZFH-NEXT: $x10 = COPY [[ANYEXT]](s32)
+ ; RV32IZFH-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack2, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_h, implicit $f11_h, implicit $f12_h, implicit $f13_h, implicit $f14_h, implicit $f15_h, implicit $f16_h, implicit $f17_h, implicit $x10, implicit-def $f10_h
+ ; RV32IZFH-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV32IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV32IZFH-NEXT: $f10_h = COPY [[COPY2]](s16)
+ ; RV32IZFH-NEXT: PseudoRET implicit $f10_h
+ ;
+ ; RV64I-LABEL: name: caller_half_return_stack2
+ ; RV64I: bb.1 (%ir-block.0):
+ ; RV64I-NEXT: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+ ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4200
+ ; RV64I-NEXT: ADJCALLSTACKDOWN 8, 0, implicit-def $x2, implicit $x2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64I-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s16)
+ ; RV64I-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64I-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s16)
+ ; RV64I-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64I-NEXT: [[ANYEXT5:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64I-NEXT: [[ANYEXT6:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64I-NEXT: [[ANYEXT7:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64I-NEXT: [[ANYEXT8:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
+ ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64I-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY2]], [[C2]](s64)
+ ; RV64I-NEXT: G_STORE [[ANYEXT8]](s64), [[PTR_ADD]](p0) :: (store (s64) into stack, align 16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64I-NEXT: $x11 = COPY [[ANYEXT1]](s64)
+ ; RV64I-NEXT: $x12 = COPY [[ANYEXT2]](s64)
+ ; RV64I-NEXT: $x13 = COPY [[ANYEXT3]](s64)
+ ; RV64I-NEXT: $x14 = COPY [[ANYEXT4]](s64)
+ ; RV64I-NEXT: $x15 = COPY [[ANYEXT5]](s64)
+ ; RV64I-NEXT: $x16 = COPY [[ANYEXT6]](s64)
+ ; RV64I-NEXT: $x17 = COPY [[ANYEXT7]](s64)
+ ; RV64I-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack2, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit-def $x10
+ ; RV64I-NEXT: ADJCALLSTACKUP 8, 0, implicit-def $x2, implicit $x2
+ ; RV64I-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s64)
+ ; RV64I-NEXT: [[ANYEXT9:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC2]](s16)
+ ; RV64I-NEXT: $x10 = COPY [[ANYEXT9]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64IF-LABEL: name: caller_half_return_stack2
+ ; RV64IF: bb.1 (%ir-block.0):
+ ; RV64IF-NEXT: liveins: $f10_f, $f11_f
+ ; RV64IF-NEXT: {{ $}}
+ ; RV64IF-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; RV64IF-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f11_f
+ ; RV64IF-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; RV64IF-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+ ; RV64IF-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4200
+ ; RV64IF-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IF-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64IF-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16)
+ ; RV64IF-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64IF-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[C1]](s16)
+ ; RV64IF-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64IF-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64IF-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64IF-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC1]](s16)
+ ; RV64IF-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT]](s32)
+ ; RV64IF-NEXT: $f11_f = COPY [[ANYEXT1]](s32)
+ ; RV64IF-NEXT: $f12_f = COPY [[ANYEXT2]](s32)
+ ; RV64IF-NEXT: $f13_f = COPY [[ANYEXT3]](s32)
+ ; RV64IF-NEXT: $f14_f = COPY [[ANYEXT4]](s32)
+ ; RV64IF-NEXT: $f15_f = COPY [[ANYEXT5]](s32)
+ ; RV64IF-NEXT: $f16_f = COPY [[ANYEXT6]](s32)
+ ; RV64IF-NEXT: $f17_f = COPY [[ANYEXT7]](s32)
+ ; RV64IF-NEXT: [[ANYEXT9:%[0-9]+]]:_(s64) = G_ANYEXT [[ANYEXT8]](s32)
+ ; RV64IF-NEXT: $x10 = COPY [[ANYEXT9]](s64)
+ ; RV64IF-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack2, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_f, implicit $f11_f, implicit $f12_f, implicit $f13_f, implicit $f14_f, implicit $f15_f, implicit $f16_f, implicit $f17_f, implicit $x10, implicit-def $f10_f
+ ; RV64IF-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IF-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $f10_f
+ ; RV64IF-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; RV64IF-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC2]](s16)
+ ; RV64IF-NEXT: $f10_f = COPY [[ANYEXT10]](s32)
+ ; RV64IF-NEXT: PseudoRET implicit $f10_f
+ ;
+ ; RV64IZFH-LABEL: name: caller_half_return_stack2
+ ; RV64IZFH: bb.1 (%ir-block.0):
+ ; RV64IZFH-NEXT: liveins: $f10_h, $f11_h
+ ; RV64IZFH-NEXT: {{ $}}
+ ; RV64IZFH-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY $f11_h
+ ; RV64IZFH-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+ ; RV64IZFH-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4200
+ ; RV64IZFH-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY]](s16)
+ ; RV64IZFH-NEXT: $f11_h = COPY [[C]](s16)
+ ; RV64IZFH-NEXT: $f12_h = COPY [[COPY]](s16)
+ ; RV64IZFH-NEXT: $f13_h = COPY [[C1]](s16)
+ ; RV64IZFH-NEXT: $f14_h = COPY [[COPY]](s16)
+ ; RV64IZFH-NEXT: $f15_h = COPY [[COPY1]](s16)
+ ; RV64IZFH-NEXT: $f16_h = COPY [[COPY1]](s16)
+ ; RV64IZFH-NEXT: $f17_h = COPY [[COPY1]](s16)
+ ; RV64IZFH-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s16)
+ ; RV64IZFH-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64IZFH-NEXT: PseudoCALL target-flags(riscv-call) @callee_half_return_stack2, csr_ilp32f_lp64f, implicit-def $x1, implicit $f10_h, implicit $f11_h, implicit $f12_h, implicit $f13_h, implicit $f14_h, implicit $f15_h, implicit $f16_h, implicit $f17_h, implicit $x10, implicit-def $f10_h
+ ; RV64IZFH-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; RV64IZFH-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY $f10_h
+ ; RV64IZFH-NEXT: $f10_h = COPY [[COPY2]](s16)
+ ; RV64IZFH-NEXT: PseudoRET implicit $f10_h
+ %z = call half @callee_half_return_stack2(half %x, half 1.0, half %x, half 3.0, half %x, half %y, half %y, half %y, half %x)
+ ret half %z
+}
diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll
index 5969aae..7642c0c 100644
--- a/llvm/test/CodeGen/X86/abs.ll
+++ b/llvm/test/CodeGen/X86/abs.ll
@@ -709,3 +709,146 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind {
%res = call i128 @llvm.abs.i128(i128 %ashr, i1 true)
ret i128 %res
}
+
+define i8 @test_minsigned_i8(i8 %a0, i8 %a1) nounwind {
+; X64-LABEL: test_minsigned_i8:
+; X64: # %bb.0:
+; X64-NEXT: cmpb $-128, %dil
+; X64-NEXT: jne .LBB17_1
+; X64-NEXT: # %bb.2: # %select.end
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB17_1: # %select.false.sink
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: sarb $7, %al
+; X64-NEXT: xorb %al, %dil
+; X64-NEXT: subb %al, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_minsigned_i8:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpb $-128, %al
+; X86-NEXT: jne .LBB17_1
+; X86-NEXT: # %bb.2: # %select.end
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB17_1: # %select.false.sink
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: sarb $7, %cl
+; X86-NEXT: xorb %cl, %al
+; X86-NEXT: subb %cl, %al
+; X86-NEXT: retl
+ %lim = icmp eq i8 %a0, -128
+ %abs = tail call i8 @llvm.abs.i8(i8 %a0, i1 false)
+ %res = select i1 %lim, i8 %a1, i8 %abs
+ ret i8 %res
+}
+
+define i16 @test_minsigned_i16(i16 %a0, i16 %a1) nounwind {
+; X64-LABEL: test_minsigned_i16:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: cmpl $32768, %eax # imm = 0x8000
+; X64-NEXT: jne .LBB18_1
+; X64-NEXT: # %bb.2: # %select.end
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB18_1: # %select.false.sink
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: negw %ax
+; X64-NEXT: cmovsw %di, %ax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_minsigned_i16:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl $32768, %ecx # imm = 0x8000
+; X86-NEXT: jne .LBB18_1
+; X86-NEXT: # %bb.2: # %select.end
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB18_1: # %select.false.sink
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: negw %ax
+; X86-NEXT: cmovsw %cx, %ax
+; X86-NEXT: retl
+ %lim = icmp eq i16 %a0, -32768
+ %abs = tail call i16 @llvm.abs.i16(i16 %a0, i1 false)
+ %res = select i1 %lim, i16 %a1, i16 %abs
+ ret i16 %res
+}
+
+define i32 @test_minsigned_i32(i32 %a0, i32 %a1) nounwind {
+; X64-LABEL: test_minsigned_i32:
+; X64: # %bb.0:
+; X64-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000
+; X64-NEXT: jne .LBB19_1
+; X64-NEXT: # %bb.2: # %select.end
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB19_1: # %select.false.sink
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: negl %eax
+; X64-NEXT: cmovsl %edi, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_minsigned_i32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT: jne .LBB19_1
+; X86-NEXT: # %bb.2: # %select.end
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB19_1: # %select.false.sink
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: cmovsl %ecx, %eax
+; X86-NEXT: retl
+ %lim = icmp eq i32 %a0, -2147483648
+ %abs = tail call i32 @llvm.abs.i32(i32 %a0, i1 false)
+ %res = select i1 %lim, i32 %a1, i32 %abs
+ ret i32 %res
+}
+
+define i64 @test_minsigned_i64(i64 %a0, i64 %a1) nounwind {
+; X64-LABEL: test_minsigned_i64:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-NEXT: cmpq %rax, %rdi
+; X64-NEXT: jne .LBB20_1
+; X64-NEXT: # %bb.2: # %select.end
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: retq
+; X64-NEXT: .LBB20_1: # %select.false.sink
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: negq %rax
+; X64-NEXT: cmovsq %rdi, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_minsigned_i64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal -2147483648(%edx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: jne .LBB20_1
+; X86-NEXT: # %bb.2: # %select.end
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB20_1: # %select.false.sink
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: retl
+ %lim = icmp eq i64 %a0, -9223372036854775808
+ %abs = tail call i64 @llvm.abs.i64(i64 %a0, i1 false)
+ %res = select i1 %lim, i64 %a1, i64 %abs
+ ret i64 %res
+}
diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll
index 1094edd..fe79dfe 100644
--- a/llvm/test/CodeGen/X86/atomic-fp.ll
+++ b/llvm/test/CodeGen/X86/atomic-fp.ll
@@ -777,3 +777,2326 @@ bb:
store atomic i64 %tmp9, ptr %tmp4 monotonic, align 8
ret void
}
+
+; ----- FSUB -----
+
+define dso_local void @fsub_32r(ptr %loc, float %val) nounwind {
+; X86-NOSSE-LABEL: fsub_32r:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl (%eax), %ecx
+; X86-NOSSE-NEXT: movl %ecx, (%esp)
+; X86-NOSSE-NEXT: flds (%esp)
+; X86-NOSSE-NEXT: fsubs {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, (%eax)
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fsub_32r:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl (%eax), %ecx
+; X86-SSE1-NEXT: movl %ecx, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: subss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT: movl %ecx, (%eax)
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fsub_32r:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: subss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fsub_32r:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, (%eax)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fsub_32r:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: subss %xmm0, %xmm1
+; X64-SSE-NEXT: movss %xmm1, (%rdi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fsub_32r:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, (%rdi)
+; X64-AVX-NEXT: retq
+ %1 = load atomic i32, ptr %loc seq_cst, align 4
+ %2 = bitcast i32 %1 to float
+ %sub = fsub float %2, %val
+ %3 = bitcast float %sub to i32
+ store atomic i32 %3, ptr %loc release, align 4
+ ret void
+}
+
+define dso_local void @fsub_64r(ptr %loc, double %val) nounwind {
+; X86-NOSSE-LABEL: fsub_64r:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $32, %esp
+; X86-NOSSE-NEXT: movl 8(%ebp), %eax
+; X86-NOSSE-NEXT: fildll (%eax)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fsubl 12(%ebp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl %ecx, (%esp)
+; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll (%eax)
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fsub_64r:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: movl 8(%ebp), %eax
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fsubl 12(%ebp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, (%eax)
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fsub_64r:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movl 8(%ebp), %eax
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: subsd 12(%ebp), %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, (%eax)
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fsub_64r:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: movl 8(%ebp), %eax
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vsubsd 12(%ebp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, (%eax)
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fsub_64r:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT: subsd %xmm0, %xmm1
+; X64-SSE-NEXT: movsd %xmm1, (%rdi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fsub_64r:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, (%rdi)
+; X64-AVX-NEXT: retq
+ %1 = load atomic i64, ptr %loc seq_cst, align 8
+ %2 = bitcast i64 %1 to double
+ %sub = fsub double %2, %val
+ %3 = bitcast double %sub to i64
+ store atomic i64 %3, ptr %loc release, align 8
+ ret void
+}
+
+; Floating-point sub to a global using an immediate.
+define dso_local void @fsub_32g() nounwind {
+; X86-NOSSE-LABEL: fsub_32g:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: movl glob32, %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: fld1
+; X86-NOSSE-NEXT: fchs
+; X86-NOSSE-NEXT: fadds (%esp)
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, glob32
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fsub_32g:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: movl glob32, %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, glob32
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fsub_32g:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE2-NEXT: addss glob32, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, glob32
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fsub_32g:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-AVX-NEXT: vaddss glob32, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, glob32
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fsub_32g:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-SSE-NEXT: addss glob32(%rip), %xmm0
+; X64-SSE-NEXT: movss %xmm0, glob32(%rip)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fsub_32g:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-AVX-NEXT: vaddss glob32(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, glob32(%rip)
+; X64-AVX-NEXT: retq
+ %i = load atomic i32, ptr @glob32 monotonic, align 4
+ %f = bitcast i32 %i to float
+ %sub = fsub float %f, 1.000000e+00
+ %s = bitcast float %sub to i32
+ store atomic i32 %s, ptr @glob32 monotonic, align 4
+ ret void
+}
+
+define dso_local void @fsub_64g() nounwind {
+; X86-NOSSE-LABEL: fsub_64g:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $32, %esp
+; X86-NOSSE-NEXT: fildll glob64
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fld1
+; X86-NOSSE-NEXT: fchs
+; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll glob64
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fsub_64g:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fld1
+; X86-SSE1-NEXT: fchs
+; X86-SSE1-NEXT: faddl (%esp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, glob64
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fsub_64g:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, glob64
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fsub_64g:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, glob64
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fsub_64g:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0]
+; X64-SSE-NEXT: addsd glob64(%rip), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, glob64(%rip)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fsub_64g:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0]
+; X64-AVX-NEXT: vaddsd glob64(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, glob64(%rip)
+; X64-AVX-NEXT: retq
+ %i = load atomic i64, ptr @glob64 monotonic, align 8
+ %f = bitcast i64 %i to double
+ %sub = fsub double %f, 1.000000e+00
+ %s = bitcast double %sub to i64
+ store atomic i64 %s, ptr @glob64 monotonic, align 8
+ ret void
+}
+
+; Floating-point sub to a hard-coded immediate location using an immediate.
+define dso_local void @fsub_32imm() nounwind {
+; X86-NOSSE-LABEL: fsub_32imm:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: movl -559038737, %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: fld1
+; X86-NOSSE-NEXT: fchs
+; X86-NOSSE-NEXT: fadds (%esp)
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, -559038737
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fsub_32imm:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: movl -559038737, %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, -559038737
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fsub_32imm:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE2-NEXT: addss -559038737, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, -559038737
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fsub_32imm:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-AVX-NEXT: vaddss -559038737, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, -559038737
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fsub_32imm:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-SSE-NEXT: addss (%rax), %xmm0
+; X64-SSE-NEXT: movss %xmm0, (%rax)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fsub_32imm:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-AVX-NEXT: vaddss (%rax), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, (%rax)
+; X64-AVX-NEXT: retq
+ %i = load atomic i32, ptr inttoptr (i32 3735928559 to ptr) monotonic, align 4
+ %f = bitcast i32 %i to float
+ %sub = fsub float %f, 1.000000e+00
+ %s = bitcast float %sub to i32
+ store atomic i32 %s, ptr inttoptr (i32 3735928559 to ptr) monotonic, align 4
+ ret void
+}
+
+define dso_local void @fsub_64imm() nounwind {
+; X86-NOSSE-LABEL: fsub_64imm:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $32, %esp
+; X86-NOSSE-NEXT: fildll -559038737
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fld1
+; X86-NOSSE-NEXT: fchs
+; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll -559038737
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fsub_64imm:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fld1
+; X86-SSE1-NEXT: fchs
+; X86-SSE1-NEXT: faddl (%esp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, -559038737
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fsub_64imm:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, -559038737
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fsub_64imm:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, -559038737
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fsub_64imm:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0]
+; X64-SSE-NEXT: addsd (%rax), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, (%rax)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fsub_64imm:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0]
+; X64-AVX-NEXT: vaddsd (%rax), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, (%rax)
+; X64-AVX-NEXT: retq
+ %i = load atomic i64, ptr inttoptr (i64 3735928559 to ptr) monotonic, align 8
+ %f = bitcast i64 %i to double
+ %sub = fsub double %f, 1.000000e+00
+ %s = bitcast double %sub to i64
+ store atomic i64 %s, ptr inttoptr (i64 3735928559 to ptr) monotonic, align 8
+ ret void
+}
+
+; Floating-point sub to a stack location.
+define dso_local void @fsub_32stack() nounwind {
+; X86-NOSSE-LABEL: fsub_32stack:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $12, %esp
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: fld1
+; X86-NOSSE-NEXT: fsubs (%esp)
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: addl $12, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fsub_32stack:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $12, %esp
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE1-NEXT: subss (%esp), %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: addl $12, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fsub_32stack:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %eax
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE2-NEXT: subss (%esp), %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%esp)
+; X86-SSE2-NEXT: popl %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fsub_32stack:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-AVX-NEXT: vsubss (%esp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, (%esp)
+; X86-AVX-NEXT: popl %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fsub_32stack:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-SSE-NEXT: subss -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fsub_32stack:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-AVX-NEXT: vsubss -{{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: retq
+ %ptr = alloca i32, align 4
+ %load = load atomic i32, ptr %ptr acquire, align 4
+ %bc0 = bitcast i32 %load to float
+ %fsub = fsub float 1.000000e+00, %bc0
+ %bc1 = bitcast float %fsub to i32
+ store atomic i32 %bc1, ptr %ptr release, align 4
+ ret void
+}
+
+define dso_local void @fsub_64stack() nounwind {
+; X86-NOSSE-LABEL: fsub_64stack:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $40, %esp
+; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fld1
+; X86-NOSSE-NEXT: fsubl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fsub_64stack:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $24, %esp
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fld1
+; X86-SSE1-NEXT: fsubl (%esp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fsub_64stack:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
+; X86-SSE2-NEXT: subsd %xmm0, %xmm1
+; X86-SSE2-NEXT: movsd %xmm1, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fsub_64stack:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $16, %esp
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
+; X86-AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fsub_64stack:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; X64-SSE-NEXT: subsd -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fsub_64stack:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; X64-AVX-NEXT: vsubsd -{{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: retq
+ %ptr = alloca i64, align 8
+ %load = load atomic i64, ptr %ptr acquire, align 8
+ %bc0 = bitcast i64 %load to double
+ %fsub = fsub double 1.000000e+00, %bc0
+ %bc1 = bitcast double %fsub to i64
+ store atomic i64 %bc1, ptr %ptr release, align 8
+ ret void
+}
+
+define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind {
+; X86-NOSSE-LABEL: fsub_array:
+; X86-NOSSE: # %bb.0: # %bb
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: pushl %esi
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $40, %esp
+; X86-NOSSE-NEXT: movl 20(%ebp), %eax
+; X86-NOSSE-NEXT: movl 8(%ebp), %ecx
+; X86-NOSSE-NEXT: fildll (%ecx,%eax,8)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fsubl 12(%ebp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT: movl %edx, (%esp)
+; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8)
+; X86-NOSSE-NEXT: leal -4(%ebp), %esp
+; X86-NOSSE-NEXT: popl %esi
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fsub_array:
+; X86-SSE1: # %bb.0: # %bb
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: movl 20(%ebp), %eax
+; X86-SSE1-NEXT: movl 8(%ebp), %ecx
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fsubl 12(%ebp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8)
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fsub_array:
+; X86-SSE2: # %bb.0: # %bb
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movl 20(%ebp), %eax
+; X86-SSE2-NEXT: movl 8(%ebp), %ecx
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: subsd 12(%ebp), %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8)
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fsub_array:
+; X86-AVX: # %bb.0: # %bb
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: movl 20(%ebp), %eax
+; X86-AVX-NEXT: movl 8(%ebp), %ecx
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vsubsd 12(%ebp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8)
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fsub_array:
+; X64-SSE: # %bb.0: # %bb
+; X64-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT: subsd %xmm0, %xmm1
+; X64-SSE-NEXT: movsd %xmm1, (%rdi,%rsi,8)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fsub_array:
+; X64-AVX: # %bb.0: # %bb
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, (%rdi,%rsi,8)
+; X64-AVX-NEXT: retq
+bb:
+ %tmp4 = getelementptr inbounds i64, ptr %arg, i64 %arg2
+ %tmp6 = load atomic i64, ptr %tmp4 monotonic, align 8
+ %tmp7 = bitcast i64 %tmp6 to double
+ %tmp8 = fsub double %tmp7, %arg1
+ %tmp9 = bitcast double %tmp8 to i64
+ store atomic i64 %tmp9, ptr %tmp4 monotonic, align 8
+ ret void
+}
+
+; ----- FMUL -----
+
+define dso_local void @fmul_32r(ptr %loc, float %val) nounwind {
+; X86-NOSSE-LABEL: fmul_32r:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl (%eax), %ecx
+; X86-NOSSE-NEXT: movl %ecx, (%esp)
+; X86-NOSSE-NEXT: flds (%esp)
+; X86-NOSSE-NEXT: fmuls {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, (%eax)
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fmul_32r:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl (%eax), %ecx
+; X86-SSE1-NEXT: movl %ecx, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: mulss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT: movl %ecx, (%eax)
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fmul_32r:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: mulss (%eax), %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fmul_32r:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vmulss (%eax), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, (%eax)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fmul_32r:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: mulss (%rdi), %xmm0
+; X64-SSE-NEXT: movss %xmm0, (%rdi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fmul_32r:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmulss (%rdi), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, (%rdi)
+; X64-AVX-NEXT: retq
+ %1 = load atomic i32, ptr %loc seq_cst, align 4
+ %2 = bitcast i32 %1 to float
+ %mul = fmul float %2, %val
+ %3 = bitcast float %mul to i32
+ store atomic i32 %3, ptr %loc release, align 4
+ ret void
+}
+
+define dso_local void @fmul_64r(ptr %loc, double %val) nounwind {
+; X86-NOSSE-LABEL: fmul_64r:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $32, %esp
+; X86-NOSSE-NEXT: movl 8(%ebp), %eax
+; X86-NOSSE-NEXT: fildll (%eax)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fmull 12(%ebp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl %ecx, (%esp)
+; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll (%eax)
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fmul_64r:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: movl 8(%ebp), %eax
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fmull 12(%ebp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, (%eax)
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fmul_64r:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movl 8(%ebp), %eax
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: mulsd 12(%ebp), %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, (%eax)
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fmul_64r:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: movl 8(%ebp), %eax
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmulsd 12(%ebp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, (%eax)
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fmul_64r:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: mulsd (%rdi), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, (%rdi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fmul_64r:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmulsd (%rdi), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, (%rdi)
+; X64-AVX-NEXT: retq
+ %1 = load atomic i64, ptr %loc seq_cst, align 8
+ %2 = bitcast i64 %1 to double
+ %mul = fmul double %2, %val
+ %3 = bitcast double %mul to i64
+ store atomic i64 %3, ptr %loc release, align 8
+ ret void
+}
+
+; Floating-point mul to a global using an immediate.
+define dso_local void @fmul_32g() nounwind {
+; X86-NOSSE-LABEL: fmul_32g:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: movl glob32, %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: flds (%esp)
+; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, glob32
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fmul_32g:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: movl glob32, %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, glob32
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fmul_32g:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE2-NEXT: mulss glob32, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, glob32
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fmul_32g:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-AVX-NEXT: vmulss glob32, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, glob32
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fmul_32g:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-SSE-NEXT: mulss glob32(%rip), %xmm0
+; X64-SSE-NEXT: movss %xmm0, glob32(%rip)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fmul_32g:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-AVX-NEXT: vmulss glob32(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, glob32(%rip)
+; X64-AVX-NEXT: retq
+ %i = load atomic i32, ptr @glob32 monotonic, align 4
+ %f = bitcast i32 %i to float
+ %mul = fmul float %f, 0x400921FA00000000
+ %s = bitcast float %mul to i32
+ store atomic i32 %s, ptr @glob32 monotonic, align 4
+ ret void
+}
+
+define dso_local void @fmul_64g() nounwind {
+; X86-NOSSE-LABEL: fmul_64g:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $32, %esp
+; X86-NOSSE-NEXT: fildll glob64
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll glob64
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fmul_64g:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, glob64
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fmul_64g:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, glob64
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fmul_64g:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, glob64
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fmul_64g:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [3.1415901184082031E+0,0.0E+0]
+; X64-SSE-NEXT: mulsd glob64(%rip), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, glob64(%rip)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fmul_64g:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [3.1415901184082031E+0,0.0E+0]
+; X64-AVX-NEXT: vmulsd glob64(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, glob64(%rip)
+; X64-AVX-NEXT: retq
+ %i = load atomic i64, ptr @glob64 monotonic, align 8
+ %f = bitcast i64 %i to double
+ %mul = fmul double %f, 0x400921FA00000000
+ %s = bitcast double %mul to i64
+ store atomic i64 %s, ptr @glob64 monotonic, align 8
+ ret void
+}
+
+; Floating-point mul to a hard-coded immediate location using an immediate.
+define dso_local void @fmul_32imm() nounwind {
+; X86-NOSSE-LABEL: fmul_32imm:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: movl -559038737, %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: flds (%esp)
+; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, -559038737
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fmul_32imm:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: movl -559038737, %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, -559038737
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fmul_32imm:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE2-NEXT: mulss -559038737, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, -559038737
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fmul_32imm:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-AVX-NEXT: vmulss -559038737, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, -559038737
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fmul_32imm:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-SSE-NEXT: mulss (%rax), %xmm0
+; X64-SSE-NEXT: movss %xmm0, (%rax)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fmul_32imm:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-AVX-NEXT: vmulss (%rax), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, (%rax)
+; X64-AVX-NEXT: retq
+ %i = load atomic i32, ptr inttoptr (i32 3735928559 to ptr) monotonic, align 4
+ %f = bitcast i32 %i to float
+ %mul = fmul float %f, 0x400921FA00000000
+ %s = bitcast float %mul to i32
+ store atomic i32 %s, ptr inttoptr (i32 3735928559 to ptr) monotonic, align 4
+ ret void
+}
+
+define dso_local void @fmul_64imm() nounwind {
+; X86-NOSSE-LABEL: fmul_64imm:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $32, %esp
+; X86-NOSSE-NEXT: fildll -559038737
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll -559038737
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fmul_64imm:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, -559038737
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fmul_64imm:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, -559038737
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fmul_64imm:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, -559038737
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fmul_64imm:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [3.1415901184082031E+0,0.0E+0]
+; X64-SSE-NEXT: mulsd (%rax), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, (%rax)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fmul_64imm:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [3.1415901184082031E+0,0.0E+0]
+; X64-AVX-NEXT: vmulsd (%rax), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, (%rax)
+; X64-AVX-NEXT: retq
+ %i = load atomic i64, ptr inttoptr (i64 3735928559 to ptr) monotonic, align 8
+ %f = bitcast i64 %i to double
+ %mul = fmul double %f, 0x400921FA00000000
+ %s = bitcast double %mul to i64
+ store atomic i64 %s, ptr inttoptr (i64 3735928559 to ptr) monotonic, align 8
+ ret void
+}
+
+; Floating-point mul to a stack location.
+define dso_local void @fmul_32stack() nounwind {
+; X86-NOSSE-LABEL: fmul_32stack:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $12, %esp
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: flds (%esp)
+; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: addl $12, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fmul_32stack:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $12, %esp
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: addl $12, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fmul_32stack:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %eax
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE2-NEXT: mulss (%esp), %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%esp)
+; X86-SSE2-NEXT: popl %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fmul_32stack:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-AVX-NEXT: vmulss (%esp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, (%esp)
+; X86-AVX-NEXT: popl %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fmul_32stack:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-SSE-NEXT: mulss -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fmul_32stack:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.14159012E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-AVX-NEXT: vmulss -{{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: retq
+ %ptr = alloca i32, align 4
+ %load = load atomic i32, ptr %ptr acquire, align 4
+ %bc0 = bitcast i32 %load to float
+ %fmul = fmul float 0x400921FA00000000, %bc0
+ %bc1 = bitcast float %fmul to i32
+ store atomic i32 %bc1, ptr %ptr release, align 4
+ ret void
+}
+
+define dso_local void @fmul_64stack() nounwind {
+; X86-NOSSE-LABEL: fmul_64stack:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $40, %esp
+; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fmul_64stack:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $24, %esp
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fmul_64stack:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fmul_64stack:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $16, %esp
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fmul_64stack:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [3.1415901184082031E+0,0.0E+0]
+; X64-SSE-NEXT: mulsd -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fmul_64stack:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [3.1415901184082031E+0,0.0E+0]
+; X64-AVX-NEXT: vmulsd -{{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: retq
+ %ptr = alloca i64, align 8
+ %load = load atomic i64, ptr %ptr acquire, align 8
+ %bc0 = bitcast i64 %load to double
+ %fmul = fmul double 0x400921FA00000000, %bc0
+ %bc1 = bitcast double %fmul to i64
+ store atomic i64 %bc1, ptr %ptr release, align 8
+ ret void
+}
+
+define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind {
+; X86-NOSSE-LABEL: fmul_array:
+; X86-NOSSE: # %bb.0: # %bb
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: pushl %esi
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $40, %esp
+; X86-NOSSE-NEXT: movl 20(%ebp), %eax
+; X86-NOSSE-NEXT: movl 8(%ebp), %ecx
+; X86-NOSSE-NEXT: fildll (%ecx,%eax,8)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fmull 12(%ebp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT: movl %edx, (%esp)
+; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8)
+; X86-NOSSE-NEXT: leal -4(%ebp), %esp
+; X86-NOSSE-NEXT: popl %esi
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fmul_array:
+; X86-SSE1: # %bb.0: # %bb
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: movl 20(%ebp), %eax
+; X86-SSE1-NEXT: movl 8(%ebp), %ecx
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fmull 12(%ebp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8)
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fmul_array:
+; X86-SSE2: # %bb.0: # %bb
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movl 20(%ebp), %eax
+; X86-SSE2-NEXT: movl 8(%ebp), %ecx
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: mulsd 12(%ebp), %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8)
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fmul_array:
+; X86-AVX: # %bb.0: # %bb
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: movl 20(%ebp), %eax
+; X86-AVX-NEXT: movl 8(%ebp), %ecx
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmulsd 12(%ebp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8)
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fmul_array:
+; X64-SSE: # %bb.0: # %bb
+; X64-SSE-NEXT: mulsd (%rdi,%rsi,8), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, (%rdi,%rsi,8)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fmul_array:
+; X64-AVX: # %bb.0: # %bb
+; X64-AVX-NEXT: vmulsd (%rdi,%rsi,8), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, (%rdi,%rsi,8)
+; X64-AVX-NEXT: retq
+bb:
+ %tmp4 = getelementptr inbounds i64, ptr %arg, i64 %arg2
+ %tmp6 = load atomic i64, ptr %tmp4 monotonic, align 8
+ %tmp7 = bitcast i64 %tmp6 to double
+ %tmp8 = fmul double %tmp7, %arg1
+ %tmp9 = bitcast double %tmp8 to i64
+ store atomic i64 %tmp9, ptr %tmp4 monotonic, align 8
+ ret void
+}
+
+; ----- FDIV -----
+
+define dso_local void @fdiv_32r(ptr %loc, float %val) nounwind {
+; X86-NOSSE-LABEL: fdiv_32r:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl (%eax), %ecx
+; X86-NOSSE-NEXT: movl %ecx, (%esp)
+; X86-NOSSE-NEXT: flds (%esp)
+; X86-NOSSE-NEXT: fdivs {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, (%eax)
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fdiv_32r:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl (%eax), %ecx
+; X86-SSE1-NEXT: movl %ecx, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: divss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT: movl %ecx, (%eax)
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fdiv_32r:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: divss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fdiv_32r:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vdivss {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, (%eax)
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fdiv_32r:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: divss %xmm0, %xmm1
+; X64-SSE-NEXT: movss %xmm1, (%rdi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fdiv_32r:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, (%rdi)
+; X64-AVX-NEXT: retq
+ %1 = load atomic i32, ptr %loc seq_cst, align 4
+ %2 = bitcast i32 %1 to float
+ %div = fdiv float %2, %val
+ %3 = bitcast float %div to i32
+ store atomic i32 %3, ptr %loc release, align 4
+ ret void
+}
+
+define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind {
+; X86-NOSSE-LABEL: fdiv_64r:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $32, %esp
+; X86-NOSSE-NEXT: movl 8(%ebp), %eax
+; X86-NOSSE-NEXT: fildll (%eax)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fdivl 12(%ebp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl %ecx, (%esp)
+; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll (%eax)
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fdiv_64r:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: movl 8(%ebp), %eax
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fdivl 12(%ebp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, (%eax)
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fdiv_64r:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movl 8(%ebp), %eax
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: divsd 12(%ebp), %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, (%eax)
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fdiv_64r:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: movl 8(%ebp), %eax
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vdivsd 12(%ebp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, (%eax)
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fdiv_64r:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT: divsd %xmm0, %xmm1
+; X64-SSE-NEXT: movsd %xmm1, (%rdi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fdiv_64r:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, (%rdi)
+; X64-AVX-NEXT: retq
+ %1 = load atomic i64, ptr %loc seq_cst, align 8
+ %2 = bitcast i64 %1 to double
+ %div = fdiv double %2, %val
+ %3 = bitcast double %div to i64
+ store atomic i64 %3, ptr %loc release, align 8
+ ret void
+}
+
+; Floating-point div to a global using an immediate.
+define dso_local void @fdiv_32g() nounwind {
+; X86-NOSSE-LABEL: fdiv_32g:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: movl glob32, %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: flds (%esp)
+; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, glob32
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fdiv_32g:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: movl glob32, %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: divss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, glob32
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fdiv_32g:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: divss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, glob32
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fdiv_32g:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vdivss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, glob32
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fdiv_32g:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: divss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT: movss %xmm0, glob32(%rip)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fdiv_32g:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vdivss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, glob32(%rip)
+; X64-AVX-NEXT: retq
+ %i = load atomic i32, ptr @glob32 monotonic, align 4
+ %f = bitcast i32 %i to float
+ %div = fdiv float %f, 0x400921FA00000000
+ %s = bitcast float %div to i32
+ store atomic i32 %s, ptr @glob32 monotonic, align 4
+ ret void
+}
+
+define dso_local void @fdiv_64g() nounwind {
+; X86-NOSSE-LABEL: fdiv_64g:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $32, %esp
+; X86-NOSSE-NEXT: fildll glob64
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll glob64
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fdiv_64g:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, glob64
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fdiv_64g:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, glob64
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fdiv_64g:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, glob64
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fdiv_64g:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, glob64(%rip)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fdiv_64g:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, glob64(%rip)
+; X64-AVX-NEXT: retq
+ %i = load atomic i64, ptr @glob64 monotonic, align 8
+ %f = bitcast i64 %i to double
+ %div = fdiv double %f, 0x400921FA00000000
+ %s = bitcast double %div to i64
+ store atomic i64 %s, ptr @glob64 monotonic, align 8
+ ret void
+}
+
+; Floating-point div to a hard-coded immediate location using an immediate.
+define dso_local void @fdiv_32imm() nounwind {
+; X86-NOSSE-LABEL: fdiv_32imm:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $8, %esp
+; X86-NOSSE-NEXT: movl -559038737, %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: flds (%esp)
+; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, -559038737
+; X86-NOSSE-NEXT: addl $8, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fdiv_32imm:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $8, %esp
+; X86-SSE1-NEXT: movl -559038737, %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: divss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, -559038737
+; X86-SSE1-NEXT: addl $8, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fdiv_32imm:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: divss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, -559038737
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fdiv_32imm:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vdivss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, -559038737
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fdiv_32imm:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: divss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT: movss %xmm0, (%rax)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fdiv_32imm:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vdivss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, (%rax)
+; X64-AVX-NEXT: retq
+ %i = load atomic i32, ptr inttoptr (i32 3735928559 to ptr) monotonic, align 4
+ %f = bitcast i32 %i to float
+ %div = fdiv float %f, 0x400921FA00000000
+ %s = bitcast float %div to i32
+ store atomic i32 %s, ptr inttoptr (i32 3735928559 to ptr) monotonic, align 4
+ ret void
+}
+
+define dso_local void @fdiv_64imm() nounwind {
+; X86-NOSSE-LABEL: fdiv_64imm:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $32, %esp
+; X86-NOSSE-NEXT: fildll -559038737
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll -559038737
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fdiv_64imm:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, -559038737
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fdiv_64imm:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, -559038737
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fdiv_64imm:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, -559038737
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fdiv_64imm:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, (%rax)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fdiv_64imm:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, (%rax)
+; X64-AVX-NEXT: retq
+ %i = load atomic i64, ptr inttoptr (i64 3735928559 to ptr) monotonic, align 8
+ %f = bitcast i64 %i to double
+ %div = fdiv double %f, 0x400921FA00000000
+ %s = bitcast double %div to i64
+ store atomic i64 %s, ptr inttoptr (i64 3735928559 to ptr) monotonic, align 8
+ ret void
+}
+
+; Floating-point div to a stack location.
+define dso_local void @fdiv_32stack() nounwind {
+; X86-NOSSE-LABEL: fdiv_32stack:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: subl $12, %esp
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: fld1
+; X86-NOSSE-NEXT: fdivs (%esp)
+; X86-NOSSE-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: addl $12, %esp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fdiv_32stack:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: subl $12, %esp
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE1-NEXT: divss (%esp), %xmm0
+; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: addl $12, %esp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fdiv_32stack:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %eax
+; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE2-NEXT: divss (%esp), %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%esp)
+; X86-SSE2-NEXT: popl %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fdiv_32stack:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-AVX-NEXT: vdivss (%esp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovss %xmm0, (%esp)
+; X86-AVX-NEXT: popl %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fdiv_32stack:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-SSE-NEXT: divss -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fdiv_32stack:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-AVX-NEXT: vdivss -{{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: retq
+ %ptr = alloca i32, align 4
+ %load = load atomic i32, ptr %ptr acquire, align 4
+ %bc0 = bitcast i32 %load to float
+ %fdiv = fdiv float 1.000000e+00, %bc0
+ %bc1 = bitcast float %fdiv to i32
+ store atomic i32 %bc1, ptr %ptr release, align 4
+ ret void
+}
+
+define dso_local void @fdiv_64stack() nounwind {
+; X86-NOSSE-LABEL: fdiv_64stack:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $40, %esp
+; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fld1
+; X86-NOSSE-NEXT: fdivl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fdiv_64stack:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $24, %esp
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fld1
+; X86-SSE1-NEXT: fdivl (%esp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fdiv_64stack:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
+; X86-SSE2-NEXT: divsd %xmm0, %xmm1
+; X86-SSE2-NEXT: movsd %xmm1, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fdiv_64stack:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $16, %esp
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
+; X86-AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fdiv_64stack:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; X64-SSE-NEXT: divsd -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fdiv_64stack:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; X64-AVX-NEXT: vdivsd -{{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: retq
+ %ptr = alloca i64, align 8
+ %load = load atomic i64, ptr %ptr acquire, align 8
+ %bc0 = bitcast i64 %load to double
+ %fdiv = fdiv double 1.000000e+00, %bc0
+ %bc1 = bitcast double %fdiv to i64
+ store atomic i64 %bc1, ptr %ptr release, align 8
+ ret void
+}
+
+define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind {
+; X86-NOSSE-LABEL: fdiv_array:
+; X86-NOSSE: # %bb.0: # %bb
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: pushl %esi
+; X86-NOSSE-NEXT: andl $-8, %esp
+; X86-NOSSE-NEXT: subl $40, %esp
+; X86-NOSSE-NEXT: movl 20(%ebp), %eax
+; X86-NOSSE-NEXT: movl 8(%ebp), %ecx
+; X86-NOSSE-NEXT: fildll (%ecx,%eax,8)
+; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fdivl 12(%ebp)
+; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT: movl %edx, (%esp)
+; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fildll (%esp)
+; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8)
+; X86-NOSSE-NEXT: leal -4(%ebp), %esp
+; X86-NOSSE-NEXT: popl %esi
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: fdiv_array:
+; X86-SSE1: # %bb.0: # %bb
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: movl %esp, %ebp
+; X86-SSE1-NEXT: andl $-8, %esp
+; X86-SSE1-NEXT: subl $16, %esp
+; X86-SSE1-NEXT: movl 20(%ebp), %eax
+; X86-SSE1-NEXT: movl 8(%ebp), %ecx
+; X86-SSE1-NEXT: xorps %xmm0, %xmm0
+; X86-SSE1-NEXT: xorps %xmm1, %xmm1
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-SSE1-NEXT: movss %xmm1, (%esp)
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: fldl (%esp)
+; X86-SSE1-NEXT: fdivl 12(%ebp)
+; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8)
+; X86-SSE1-NEXT: movl %ebp, %esp
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: fdiv_array:
+; X86-SSE2: # %bb.0: # %bb
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-8, %esp
+; X86-SSE2-NEXT: subl $8, %esp
+; X86-SSE2-NEXT: movl 20(%ebp), %eax
+; X86-SSE2-NEXT: movl 8(%ebp), %ecx
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: divsd 12(%ebp), %xmm0
+; X86-SSE2-NEXT: movsd %xmm0, (%esp)
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8)
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: fdiv_array:
+; X86-AVX: # %bb.0: # %bb
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-8, %esp
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: movl 20(%ebp), %eax
+; X86-AVX-NEXT: movl 8(%ebp), %ecx
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vdivsd 12(%ebp), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8)
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fdiv_array:
+; X64-SSE: # %bb.0: # %bb
+; X64-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT: divsd %xmm0, %xmm1
+; X64-SSE-NEXT: movsd %xmm1, (%rdi,%rsi,8)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fdiv_array:
+; X64-AVX: # %bb.0: # %bb
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovsd %xmm0, (%rdi,%rsi,8)
+; X64-AVX-NEXT: retq
+bb:
+ %tmp4 = getelementptr inbounds i64, ptr %arg, i64 %arg2
+ %tmp6 = load atomic i64, ptr %tmp4 monotonic, align 8
+ %tmp7 = bitcast i64 %tmp6 to double
+ %tmp8 = fdiv double %tmp7, %arg1
+ %tmp9 = bitcast double %tmp8 to i64
+ store atomic i64 %tmp9, ptr %tmp4 monotonic, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
index 47ac884..ac65a11 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
@@ -1367,7 +1367,7 @@ define void @bcast_unfold_fdiv_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
-; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB42_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1386,7 +1386,7 @@ bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
%tmp4 = load <16 x float>, ptr %tmp2, align 4
- %tmp5 = fdiv <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+ %tmp5 = fdiv <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
store <16 x float> %tmp5, ptr %tmp2, align 4
%tmp7 = add i64 %tmp, 16
%tmp8 = icmp eq i64 %tmp7, 1024
@@ -1400,7 +1400,7 @@ define void @bcast_unfold_fdiv_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
-; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB43_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1419,7 +1419,7 @@ bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
%tmp4 = load <8 x float>, ptr %tmp2, align 4
- %tmp5 = fdiv <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+ %tmp5 = fdiv <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
store <8 x float> %tmp5, ptr %tmp2, align 4
%tmp7 = add i64 %tmp, 8
%tmp8 = icmp eq i64 %tmp7, 1024
@@ -1433,7 +1433,7 @@ define void @bcast_unfold_fdiv_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
-; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB44_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1451,7 +1451,7 @@ bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
%tmp4 = load <4 x float>, ptr %tmp2, align 4
- %tmp5 = fdiv <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+ %tmp5 = fdiv <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
store <4 x float> %tmp5, ptr %tmp2, align 4
%tmp7 = add i64 %tmp, 4
%tmp8 = icmp eq i64 %tmp7, 1024
@@ -1465,7 +1465,7 @@ define void @bcast_unfold_fdiv_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB45_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1484,7 +1484,7 @@ bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
%tmp4 = load <8 x double>, ptr %tmp2, align 8
- %tmp5 = fdiv <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
+ %tmp5 = fdiv <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
store <8 x double> %tmp5, ptr %tmp2, align 8
%tmp7 = add i64 %tmp, 8
%tmp8 = icmp eq i64 %tmp7, 1024
@@ -1498,7 +1498,7 @@ define void @bcast_unfold_fdiv_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB46_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1517,7 +1517,7 @@ bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
%tmp4 = load <4 x double>, ptr %tmp2, align 8
- %tmp5 = fdiv <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
+ %tmp5 = fdiv <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
store <4 x double> %tmp5, ptr %tmp2, align 8
%tmp7 = add i64 %tmp, 4
%tmp8 = icmp eq i64 %tmp7, 1024
@@ -1531,7 +1531,7 @@ define void @bcast_unfold_fdiv_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB47_1: # %bb1
@@ -1550,7 +1550,7 @@ bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
%tmp4 = load <2 x double>, ptr %tmp2, align 8
- %tmp5 = fdiv <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
+ %tmp5 = fdiv <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00>
store <2 x double> %tmp5, ptr %tmp2, align 8
%tmp7 = add i64 %tmp, 2
%tmp8 = icmp eq i64 %tmp7, 1024
diff --git a/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll
index 33a7ec9..ba09ba8 100644
--- a/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll
+++ b/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll
@@ -14,7 +14,7 @@
define double @unsafe_fp_math_default0(double %x) {
; SAFE: divsd
; UNSAFE: mulsd
- %div = fdiv double %x, 2.0
+ %div = fdiv double %x, 3.0
ret double %div
}
@@ -22,7 +22,7 @@ define double @unsafe_fp_math_default0(double %x) {
define double @unsafe_fp_math_off(double %x) #0 {
; SAFE: divsd
; UNSAFE: divsd
- %div = fdiv double %x, 2.0
+ %div = fdiv double %x, 3.0
ret double %div
}
@@ -31,7 +31,7 @@ define double @unsafe_fp_math_default1(double %x) {
; With unsafe math enabled, can change this div to a mul.
; SAFE: divsd
; UNSAFE: mulsd
- %div = fdiv double %x, 2.0
+ %div = fdiv double %x, 3.0
ret double %div
}
@@ -39,7 +39,7 @@ define double @unsafe_fp_math_default1(double %x) {
define double @unsafe_fp_math_on(double %x) #1 {
; SAFE: mulsd
; UNSAFE: mulsd
- %div = fdiv double %x, 2.0
+ %div = fdiv double %x, 3.0
ret double %div
}
@@ -48,7 +48,7 @@ define double @unsafe_fp_math_default2(double %x) {
; With unsafe math enabled, can change this div to a mul.
; SAFE: divsd
; UNSAFE: mulsd
- %div = fdiv double %x, 2.0
+ %div = fdiv double %x, 3.0
ret double %div
}
diff --git a/llvm/test/CodeGen/X86/pr94824.ll b/llvm/test/CodeGen/X86/pr94824.ll
new file mode 100644
index 0000000..7744d00
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr94824.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define i16 @pr94824(i8 %x1) {
+; CHECK-LABEL: pr94824:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orl $256, %edi # imm = 0x100
+; CHECK-NEXT: rep bsfl %edi, %ecx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: shll %cl, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+entry:
+ %cttz = call i8 @llvm.cttz.i8(i8 %x1, i1 false)
+ %ext = zext i8 %cttz to i16
+ %shl = shl i16 1, %ext
+ ret i16 %shl
+}
diff --git a/llvm/test/CodeGen/X86/vshift-6.ll b/llvm/test/CodeGen/X86/vshift-6.ll
index 21c8e8d..912ff75 100644
--- a/llvm/test/CodeGen/X86/vshift-6.ll
+++ b/llvm/test/CodeGen/X86/vshift-6.ll
@@ -32,9 +32,9 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) {
; X86-NEXT: movb %al, (%ecx)
; X86-NEXT: movd %eax, %xmm1
; X86-NEXT: psllq $56, %xmm1
-; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pcmpeqd %xmm3, %xmm3
; X86-NEXT: psllw $5, %xmm1
+; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pxor %xmm2, %xmm2
; X86-NEXT: pxor %xmm0, %xmm0
; X86-NEXT: pcmpgtb %xmm1, %xmm0
@@ -64,9 +64,9 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) {
; X64-NEXT: movb %r9b, (%rdi)
; X64-NEXT: movd %r9d, %xmm1
; X64-NEXT: psllq $56, %xmm1
-; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: pcmpeqd %xmm2, %xmm2
; X64-NEXT: psllw $5, %xmm1
+; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: pxor %xmm3, %xmm3
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtb %xmm1, %xmm0
diff --git a/llvm/test/DebugInfo/debug_frame_symbol.ll b/llvm/test/DebugInfo/debug_frame_symbol.ll
index fed080c..56ac55e 100644
--- a/llvm/test/DebugInfo/debug_frame_symbol.ll
+++ b/llvm/test/DebugInfo/debug_frame_symbol.ll
@@ -22,7 +22,7 @@ entry:
ret void, !dbg !12
}
-attributes #0 = { noinline nounwind optnone ssp "frame-pointer"="non-leaf" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+zcm,+zcz" }
+attributes #0 = { noinline nounwind optnone ssp "frame-pointer"="non-leaf" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3, !4, !5, !6}
diff --git a/llvm/test/DebugInfo/symbolize-gnu-debuglink-no-realpath.test b/llvm/test/DebugInfo/symbolize-gnu-debuglink-no-realpath.test
index 9e46570..5141ff6 100644
--- a/llvm/test/DebugInfo/symbolize-gnu-debuglink-no-realpath.test
+++ b/llvm/test/DebugInfo/symbolize-gnu-debuglink-no-realpath.test
@@ -1,3 +1,4 @@
+# REQUIRES: shell
# Ensure that no realpath assumptions are made about .gnu_debuglink paths.
# Copy inputs to some other location with arbitrary names, with the original
diff --git a/llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll b/llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll
index 3d67778..64fcfdc 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll
@@ -24,5 +24,5 @@ entry:
store i8 2, ptr %arrayidx1, align 1
ret void
}
-attributes #0 = { noinline nounwind optnone sanitize_address ssp uwtable(sync) "frame-pointer"="non-leaf" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+zcm,+zcz" }
+attributes #0 = { noinline nounwind optnone sanitize_address ssp uwtable(sync) "frame-pointer"="non-leaf" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" }
diff --git a/llvm/test/Instrumentation/AddressSanitizer/calls-only.ll b/llvm/test/Instrumentation/AddressSanitizer/calls-only.ll
index fa49110..90e1ab3 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/calls-only.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/calls-only.ll
@@ -51,5 +51,5 @@ entry:
; CHECK-NOT: store i64 -723401728380766731, ptr %126, align 1
ret void
}
-attributes #0 = { noinline nounwind optnone sanitize_address ssp uwtable(sync) "frame-pointer"="non-leaf" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+zcm,+zcz" }
+attributes #0 = { noinline nounwind optnone sanitize_address ssp uwtable(sync) "frame-pointer"="non-leaf" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" }
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
index b33d484..d031e04 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes
; RUN: opt %s -S -passes=msan 2>&1 | FileCheck %s
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -287,7 +287,7 @@ define <4 x float> @test_x86_avx_cvt_pd2_ps_256(<4 x double> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8:[0-9]+]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> [[A0:%.*]])
@@ -308,7 +308,7 @@ define <4 x i32> @test_x86_avx_cvt_pd2dq_256(<4 x double> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> [[A0:%.*]])
@@ -329,7 +329,7 @@ define <8 x i32> @test_x86_avx_cvt_ps2dq_256(<8 x float> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> [[A0:%.*]])
@@ -350,7 +350,7 @@ define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> [[A0:%.*]])
@@ -371,7 +371,7 @@ define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> [[A0:%.*]])
@@ -485,7 +485,7 @@ define <32 x i8> @test_x86_avx_ldu_dq_256(ptr %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: [[RES:%.*]] = call <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr [[A0]])
@@ -509,7 +509,7 @@ define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 {
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 5:
; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]])
@@ -533,7 +533,7 @@ define <4 x double> @test_x86_avx_maskload_pd_256(ptr %a0, <4 x i64> %mask) #0 {
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 5:
; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]])
@@ -557,7 +557,7 @@ define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 {
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 5:
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]])
@@ -581,7 +581,7 @@ define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 {
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 5:
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]])
@@ -609,7 +609,7 @@ define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a
; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]], <2 x double> [[A2:%.*]])
@@ -636,7 +636,7 @@ define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double
; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]], <4 x double> [[A2:%.*]])
@@ -663,7 +663,7 @@ define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2
; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[A2:%.*]])
@@ -690,7 +690,7 @@ define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float>
; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[A2:%.*]])
@@ -774,7 +774,7 @@ define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> [[A0:%.*]])
@@ -795,7 +795,7 @@ define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> [[A0:%.*]])
@@ -887,7 +887,7 @@ define <4 x double> @test_x86_avx_round_pd_256(<4 x double> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> [[A0:%.*]], i32 7)
@@ -908,7 +908,7 @@ define <8 x float> @test_x86_avx_round_ps_256(<8 x float> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> [[A0:%.*]], i32 7)
@@ -946,7 +946,7 @@ define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1)
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A0:%.*]], <2 x i64> [[A1:%.*]])
@@ -971,7 +971,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64>
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0:%.*]], <4 x i64> [[A1:%.*]])
@@ -991,7 +991,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0:%.*]], <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
@@ -1014,7 +1014,7 @@ define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0:%.*]], <4 x i32> [[A1:%.*]])
@@ -1032,7 +1032,7 @@ define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, ptr %a1) #0
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[A2:%.*]] = load <4 x i32>, ptr [[A1:%.*]], align 16
@@ -1047,7 +1047,7 @@ define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, ptr %a1) #0
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 11:
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0:%.*]], <4 x i32> [[A2]])
@@ -1073,7 +1073,7 @@ define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
@@ -1349,7 +1349,7 @@ define void @movnt_dq(ptr %p, <2 x i64> %a1) nounwind #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P:%.*]] to i64
@@ -1374,7 +1374,7 @@ define void @movnt_ps(ptr %p, <8 x float> %a) nounwind #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P:%.*]] to i64
@@ -1400,7 +1400,7 @@ define void @movnt_pd(ptr %p, <4 x double> %a1) nounwind #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P:%.*]] to i64
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse41-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse41-intrinsics-x86.ll
index 863a2c8..e6775cf 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse41-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse41-intrinsics-x86.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes
; RUN: opt %s -S -passes=msan 2>&1 | FileCheck %s
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -94,7 +94,7 @@ define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) #0
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0:![0-9]+]]
; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 17)
@@ -120,7 +120,7 @@ define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) #0 {
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: [[RES:%.*]] = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]], i8 7)
@@ -140,7 +140,7 @@ define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(ptr %ptr, <16 x i8> %a1) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[A0:%.*]] = load <16 x i8>, ptr [[PTR:%.*]], align 16
@@ -155,7 +155,7 @@ define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(ptr %ptr, <16 x i8> %a1) #0 {
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 11:
; CHECK-NEXT: [[RES:%.*]] = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> [[A0]], <16 x i8> [[A1:%.*]], i8 7)
@@ -297,7 +297,7 @@ define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> [[A0:%.*]], i32 7)
@@ -318,7 +318,7 @@ define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[A0:%.*]], i32 7)
@@ -355,7 +355,7 @@ define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, ptr %a1) #0
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[A1B:%.*]] = load <2 x double>, ptr [[A1:%.*]], align 16
@@ -382,7 +382,7 @@ define <4 x float> @test_x86_sse41_round_ss_load(<4 x float> %a0, ptr %a1) #0 {
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
; CHECK: 4:
; CHECK-NEXT: [[A1B:%.*]] = load <4 x float>, ptr [[A1:%.*]], align 16
diff --git a/llvm/test/LTO/X86/triple-init2.ll b/llvm/test/LTO/X86/triple-init2.ll
index 2638180..bc5ecf9 100644
--- a/llvm/test/LTO/X86/triple-init2.ll
+++ b/llvm/test/LTO/X86/triple-init2.ll
@@ -11,21 +11,20 @@
; RUN: llvm-lto2 run -r %t1,main,plx -o %t2 %t1
; RUN: llvm-nm %t2.1 | FileCheck %s
-; We check that LTO will be aware of target triple and prevent exp2 to ldexpf
+; We check that LTO will be aware of target triple and prevent pow to exp10
; transformation on Windows.
-; CHECK: U exp2f
+; CHECK: U powf
target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc19.11.0"
+declare float @llvm.pow.f32(float, float)
+
define dso_local i32 @main(i32 %argc, ptr nocapture readnone %argv) local_unnamed_addr {
entry:
%conv = sitofp i32 %argc to float
- %exp2 = tail call float @llvm.exp2.f32(float %conv)
+ %exp2 = tail call float @llvm.pow.f32(float 10.0, float %conv)
%conv1 = fptosi float %exp2 to i32
ret i32 %conv1
}
-; Function Attrs: nounwind readnone speculatable
-declare float @llvm.exp2.f32(float)
-
diff --git a/llvm/test/MC/AsmParser/layout-interdependency.s b/llvm/test/MC/AsmParser/layout-interdependency.s
deleted file mode 100644
index d275614..0000000
--- a/llvm/test/MC/AsmParser/layout-interdependency.s
+++ /dev/null
@@ -1,12 +0,0 @@
-# RUN: not llvm-mc --filetype=obj %s -o /dev/null 2>&1 | FileCheck %s
-# REQUIRES: object-emission
-# UNSUPPORTED: target={{.*}}-zos{{.*}}
-
-fct_end:
-
-# CHECK: layout-interdependency.s:[[#@LINE+1]]:7: error: expected assembly-time absolute expression
-.fill (data_start - fct_end), 1, 42
-# CHECK: layout-interdependency.s:[[#@LINE+1]]:7: error: expected assembly-time absolute expression
-.fill (fct_end - data_start), 1, 42
-
-data_start:
diff --git a/llvm/test/MC/Disassembler/Sparc/sparc-v9-asi.txt b/llvm/test/MC/Disassembler/Sparc/sparc-v9-asi.txt
index 9286814..1d1fdb6 100644
--- a/llvm/test/MC/Disassembler/Sparc/sparc-v9-asi.txt
+++ b/llvm/test/MC/Disassembler/Sparc/sparc-v9-asi.txt
@@ -32,3 +32,8 @@
0xd5 0xf6 0x11 0x56
# V9: casxa [%i0] #ASI_SNF_L, %l6, %o2
0xd5 0xf6 0x11 0x76
+
+# V9: prefetcha [%i1+3968] %asi, #one_read
+0xc3 0xee 0x6f 0x80
+# V9: prefetcha [%i1+%i2] #ASI_SNF, #one_read
+0xc3 0xee 0x50 0x7a
diff --git a/llvm/test/MC/Disassembler/Sparc/sparc-v9.txt b/llvm/test/MC/Disassembler/Sparc/sparc-v9.txt
index da278e1..d561216 100644
--- a/llvm/test/MC/Disassembler/Sparc/sparc-v9.txt
+++ b/llvm/test/MC/Disassembler/Sparc/sparc-v9.txt
@@ -132,11 +132,65 @@
# CHECK: membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore | #Lookaside | #MemIssue | #Sync
0x81 0x43 0xe0 0x7f
-# CHECK: prefetch [%i1+3968], 1
-0xc3,0x6e,0x6f,0x80
+# CHECK: prefetch [%i1+3968], #n_reads
+0xc1 0x6e 0x6f 0x80
-# CHECK: prefetch [%i1+%i2], 1
-0xc3,0x6e,0x40,0x1a
+# CHECK: prefetch [%i1+3968], #one_read
+0xc3 0x6e 0x6f 0x80
+
+# CHECK: prefetch [%i1+3968], #n_writes
+0xc5 0x6e 0x6f 0x80
+
+# CHECK: prefetch [%i1+3968], #one_write
+0xc7 0x6e 0x6f 0x80
+
+# CHECK: prefetch [%i1+3968], #page
+0xc9 0x6e 0x6f 0x80
+
+# CHECK: prefetch [%i1+3968], #unified
+0xe3 0x6e 0x6f 0x80
+
+# CHECK: prefetch [%i1+3968], #n_reads_strong
+0xe9 0x6e 0x6f 0x80
+
+# CHECK: prefetch [%i1+3968], #one_read_strong
+0xeb 0x6e 0x6f 0x80
+
+# CHECK: prefetch [%i1+3968], #n_writes_strong
+0xed 0x6e 0x6f 0x80
+
+# CHECK: prefetch [%i1+3968], #one_write_strong
+0xef 0x6e 0x6f 0x80
+
+# CHECK: prefetch [%i1+%i2], #n_reads
+0xc1 0x6e 0x40 0x1a
+
+# CHECK: prefetch [%i1+%i2], #one_read
+0xc3 0x6e 0x40 0x1a
+
+# CHECK: prefetch [%i1+%i2], #n_writes
+0xc5 0x6e 0x40 0x1a
+
+# CHECK: prefetch [%i1+%i2], #one_write
+0xc7 0x6e 0x40 0x1a
+
+# CHECK: prefetch [%i1+%i2], #page
+0xc9 0x6e 0x40 0x1a
+
+# CHECK: prefetch [%i1+%i2], #unified
+0xe3 0x6e 0x40 0x1a
+
+# CHECK: prefetch [%i1+%i2], #n_reads_strong
+0xe9 0x6e 0x40 0x1a
+
+# CHECK: prefetch [%i1+%i2], #one_read_strong
+0xeb 0x6e 0x40 0x1a
+
+# CHECK: prefetch [%i1+%i2], #n_writes_strong
+0xed 0x6e 0x40 0x1a
+
+# CHECK: prefetch [%i1+%i2], #one_write_strong
+0xef 0x6e 0x40 0x1a
# CHECK: done
0x81,0xf0,0x00,0x00
diff --git a/llvm/test/MC/ELF/layout-interdependency.s b/llvm/test/MC/ELF/layout-interdependency.s
new file mode 100644
index 0000000..5ebcdfa
--- /dev/null
+++ b/llvm/test/MC/ELF/layout-interdependency.s
@@ -0,0 +1,10 @@
+# RUN: not llvm-mc -filetype=obj -triple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s
+
+fct_end:
+
+# CHECK: layout-interdependency.s:[[#@LINE+1]]:7: error: invalid number of bytes
+.fill (data_start - fct_end), 1, 42
+# CHECK: layout-interdependency.s:[[#@LINE+1]]:7: error: invalid number of bytes
+.fill (fct_end - data_start), 1, 42
+
+data_start:
diff --git a/llvm/test/MC/ELF/relax-recompute-align.s b/llvm/test/MC/ELF/relax-recompute-align.s
new file mode 100644
index 0000000..44e1f1f
--- /dev/null
+++ b/llvm/test/MC/ELF/relax-recompute-align.s
@@ -0,0 +1,23 @@
+// RUN: llvm-mc -filetype=obj -triple i386 %s -o - | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
+
+/// This is a case where the computed layout is not optimal. The
+// issue is that after the first jmp slides, the .align size must be
+// recomputed -- otherwise the second jump will appear to be out-of-range for a
+// 1-byte jump.
+
+// CHECK: int3
+// CHECK-NEXT: d2: int3
+// CHECK: e0: pushal
+// CHECK: 140: jl 0xe0
+
+L0:
+ .space 0x8a, 0x90
+ jmp L0
+ .space (0xb3 - 0x8f), 0x90
+ jle L2
+ .space (0xcd - 0xb5), 0x90
+ .p2align 4, 0xcc
+L1:
+ .space (0x130 - 0xd0),0x60
+ jl L1
+L2:
diff --git a/llvm/test/MC/ELF/subsection-if.s b/llvm/test/MC/ELF/subsection-if.s
index 7f2cba6..905cb5a 100644
--- a/llvm/test/MC/ELF/subsection-if.s
+++ b/llvm/test/MC/ELF/subsection-if.s
@@ -1,8 +1,10 @@
# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t
# RUN: llvm-readelf -x .text %t | FileCheck %s
-# RUN: not llvm-mc -filetype=obj -triple=x86_64 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR
+# RUN: llvm-mc -filetype=obj -triple=x86_64 --defsym ERR=1 %s -o %t1
+# RUN: llvm-readelf -x .text %t1 | FileCheck %s --check-prefix=CHECK1
# CHECK: 0x00000000 9090
+# CHECK1: 0x00000000 90909090 90
.subsection 1
661:
diff --git a/llvm/test/MC/MachO/relax-recompute-align.s b/llvm/test/MC/MachO/relax-recompute-align.s
deleted file mode 100644
index bfb17b71..0000000
--- a/llvm/test/MC/MachO/relax-recompute-align.s
+++ /dev/null
@@ -1,42 +0,0 @@
-// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | llvm-readobj -S - | FileCheck %s
-
-// FIXME: This is a horrible way of checking the output, we need an llvm-mc
-// based 'otool'.
-
-// This is a case where llvm-mc computes a better layout than Darwin 'as'. This
-// issue is that after the first jmp slides, the .align size must be
-// recomputed -- otherwise the second jump will appear to be out-of-range for a
-// 1-byte jump.
-
-L0:
- .space 0x8a, 0x90
- jmp L0
- .space (0xb3 - 0x8f), 0x90
- jle L2
- .space (0xcd - 0xb5), 0x90
- .align 4, 0x90
-L1:
- .space (0x130 - 0xd0),0x90
- jl L1
-L2:
-
-.zerofill __DATA,__bss,_sym,4,2
-
-// CHECK: Section {
-// CHECK-NEXT: Index: 0
-// CHECK-NEXT: Name: __text (5F 5F 74 65 78 74 00 00 00 00 00 00 00 00 00 00)
-// CHECK-NEXT: Segment: __TEXT (5F 5F 54 45 58 54 00 00 00 00 00 00 00 00 00 00)
-// CHECK-NEXT: Address: 0x0
-// CHECK-NEXT: Size: 0x132
-// CHECK-NEXT: Offset: 340
-// CHECK-NEXT: Alignment: 4
-// CHECK-NEXT: RelocationOffset: 0x0
-// CHECK-NEXT: RelocationCount: 0
-// CHECK-NEXT: Type: Regular (0x0)
-// CHECK-NEXT: Attributes [ (0x800004)
-// CHECK-NEXT: PureInstructions (0x800000)
-// CHECK-NEXT: SomeInstructions (0x4)
-// CHECK-NEXT: ]
-// CHECK-NEXT: Reserved1: 0x0
-// CHECK-NEXT: Reserved2: 0x0
-// CHECK-NEXT: }
diff --git a/llvm/test/MC/RISCV/rvi-aliases-valid.s b/llvm/test/MC/RISCV/rvi-aliases-valid.s
index 098d5c1..9ac6a8a 100644
--- a/llvm/test/MC/RISCV/rvi-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvi-aliases-valid.s
@@ -190,6 +190,16 @@ jalr x25, x26, 11
# CHECK-S-OBJ-NOALIAS: jalr zero, 0(ra)
# CHECK-S-OBJ: ret
ret
+# CHECK-S-OBJ-NOALIAS: jalr zero, 0(s11)
+# CHECK-S-OBJ: jr s11
+jr (x27)
+# CHECK-S-OBJ-NOALIAS: jalr ra, 0(t3)
+# CHECK-S-OBJ: jalr t3
+jalr (x28)
+# CHECK-S-OBJ-NOALIAS: jalr t4, 0(t5)
+# CHECK-S-OBJ: jalr t4, t5
+jalr x29, (x30)
+
# TODO call
# TODO tail
diff --git a/llvm/test/MC/Sparc/sparc-relocations.s b/llvm/test/MC/Sparc/sparc-relocations.s
index d99ddb7..82314e4 100644
--- a/llvm/test/MC/Sparc/sparc-relocations.s
+++ b/llvm/test/MC/Sparc/sparc-relocations.s
@@ -10,6 +10,8 @@
! CHECK-OBJ-NEXT: 0x{{[0-9,A-F]+}} R_SPARC_M44 sym
! CHECK-OBJ-NEXT: 0x{{[0-9,A-F]+}} R_SPARC_L44 sym
! CHECK-OBJ-NEXT: 0x{{[0-9,A-F]+}} R_SPARC_HH22 sym
+ ! CHECK-OBJ-NEXT: 0x{{[0-9,A-F]+}} R_SPARC_HH22 sym
+ ! CHECK-OBJ-NEXT: 0x{{[0-9,A-F]+}} R_SPARC_HM10 sym
! CHECK-OBJ-NEXT: 0x{{[0-9,A-F]+}} R_SPARC_HM10 sym
! CHECK-OBJ-NEXT: 0x{{[0-9,A-F]+}} R_SPARC_LM22 sym
! CHECK-OBJ-NEXT: 0x{{[0-9,A-F]+}} R_SPARC_13 sym
@@ -49,10 +51,18 @@
! CHECK-NEXT: ! fixup A - offset: 0, value: %hh(sym), kind: fixup_sparc_hh
sethi %hh(sym), %l0
+ ! CHECK: sethi %hh(sym), %l0 ! encoding: [0x21,0b00AAAAAA,A,A]
+ ! CHECK-NEXT: ! fixup A - offset: 0, value: %hh(sym), kind: fixup_sparc_hh
+ sethi %uhi(sym), %l0
+
! CHECK: or %g1, %hm(sym), %g3 ! encoding: [0x86,0x10,0b011000AA,A]
! CHECK-NEXT: ! fixup A - offset: 0, value: %hm(sym), kind: fixup_sparc_hm
or %g1, %hm(sym), %g3
+ ! CHECK: or %g1, %hm(sym), %g3 ! encoding: [0x86,0x10,0b011000AA,A]
+ ! CHECK-NEXT: ! fixup A - offset: 0, value: %hm(sym), kind: fixup_sparc_hm
+ or %g1, %ulo(sym), %g3
+
! CHECK: sethi %lm(sym), %l0 ! encoding: [0x21,0b00AAAAAA,A,A]
! CHECK-NEXT: ! fixup A - offset: 0, value: %lm(sym), kind: fixup_sparc_lm
sethi %lm(sym), %l0
diff --git a/llvm/test/MC/Sparc/sparc64-ctrl-instructions.s b/llvm/test/MC/Sparc/sparc64-ctrl-instructions.s
index a21b175..1889473 100644
--- a/llvm/test/MC/Sparc/sparc64-ctrl-instructions.s
+++ b/llvm/test/MC/Sparc/sparc64-ctrl-instructions.s
@@ -1191,28 +1191,36 @@
brz,a,pn %g1, .BB0
! CHECK: movrz %g1, %g2, %g3 ! encoding: [0x87,0x78,0x44,0x02]
+ ! CHECK: movrz %g1, %g2, %g3 ! encoding: [0x87,0x78,0x44,0x02]
! CHECK: movrlez %g1, %g2, %g3 ! encoding: [0x87,0x78,0x48,0x02]
! CHECK: movrlz %g1, %g2, %g3 ! encoding: [0x87,0x78,0x4c,0x02]
! CHECK: movrnz %g1, %g2, %g3 ! encoding: [0x87,0x78,0x54,0x02]
+ ! CHECK: movrnz %g1, %g2, %g3 ! encoding: [0x87,0x78,0x54,0x02]
! CHECK: movrgz %g1, %g2, %g3 ! encoding: [0x87,0x78,0x58,0x02]
! CHECK: movrgez %g1, %g2, %g3 ! encoding: [0x87,0x78,0x5c,0x02]
movrz %g1, %g2, %g3
+ movre %g1, %g2, %g3
movrlez %g1, %g2, %g3
movrlz %g1, %g2, %g3
movrnz %g1, %g2, %g3
+ movrne %g1, %g2, %g3
movrgz %g1, %g2, %g3
movrgez %g1, %g2, %g3
! CHECK: movrz %g1, 2, %g3 ! encoding: [0x87,0x78,0x64,0x02]
+ ! CHECK: movrz %g1, 2, %g3 ! encoding: [0x87,0x78,0x64,0x02]
! CHECK: movrlez %g1, 2, %g3 ! encoding: [0x87,0x78,0x68,0x02]
! CHECK: movrlz %g1, 2, %g3 ! encoding: [0x87,0x78,0x6c,0x02]
! CHECK: movrnz %g1, 2, %g3 ! encoding: [0x87,0x78,0x74,0x02]
+ ! CHECK: movrnz %g1, 2, %g3 ! encoding: [0x87,0x78,0x74,0x02]
! CHECK: movrgz %g1, 2, %g3 ! encoding: [0x87,0x78,0x78,0x02]
! CHECK: movrgez %g1, 2, %g3 ! encoding: [0x87,0x78,0x7c,0x02]
movrz %g1, 2, %g3
+ movre %g1, 2, %g3
movrlez %g1, 2, %g3
movrlz %g1, 2, %g3
movrnz %g1, 2, %g3
+ movrne %g1, 2, %g3
movrgz %g1, 2, %g3
movrgez %g1, 2, %g3
diff --git a/llvm/test/MC/Sparc/sparcv9-instructions.s b/llvm/test/MC/Sparc/sparcv9-instructions.s
index 0ca2e50..1b11171 100644
--- a/llvm/test/MC/Sparc/sparcv9-instructions.s
+++ b/llvm/test/MC/Sparc/sparcv9-instructions.s
@@ -7,6 +7,16 @@
addc %g2, %g1, %g3
! V8: error: invalid instruction mnemonic
+ ! V8-NEXT: addc %g2, 1, %g3
+ ! V9: addx %g2, 1, %g3 ! encoding: [0x86,0x40,0xa0,0x01]
+ addc %g2, 1, %g3
+
+ ! V8: error: invalid instruction mnemonic
+ ! V8-NEXT: addc 1, %g2, %g3
+ ! V9: addx %g2, 1, %g3 ! encoding: [0x86,0x40,0xa0,0x01]
+ addc 1, %g2, %g3
+
+ ! V8: error: invalid instruction mnemonic
! V8-NEXT: addccc %g1, %g2, %g3
! V9: addxcc %g1, %g2, %g3 ! encoding: [0x86,0xc0,0x40,0x02]
addccc %g1, %g2, %g3
@@ -492,6 +502,10 @@
wr %i0, %i1, %ccr
! V9: wr %i0, 1, %ccr ! encoding: [0x85,0x86,0x20,0x01]
wr %i0, 1, %ccr
+ ! V9: wr %i0, 1, %asr20 ! encoding: [0xa9,0x86,0x20,0x01]
+ wr %i0, 1, %set_softint
+ ! V9: wr %i0, 1, %asr21 ! encoding: [0xab,0x86,0x20,0x01]
+ wr %i0, 1, %clear_softint
! V9: st %o1, [%o0] ! encoding: [0xd2,0x22,0x00,0x00]
stw %o1, [%o0]
@@ -523,16 +537,146 @@
! V9: stxa %g0, [%g2+%i5] #ASI_SNF ! encoding: [0xc0,0xf0,0x90,0x7d]
stxa %g0, [%g2 + %i5] #ASI_SNF
- ! V8: error: instruction requires a CPU feature not currently enabled
+ ! V8: error: invalid operand for instruction
! V8-NEXT: prefetch [ %i1 + 0xf80 ], 1
- ! V9: prefetch [%i1+3968], 1 ! encoding: [0xc3,0x6e,0x6f,0x80]
+ ! V9: prefetch [%i1+3968], #one_read ! encoding: [0xc3,0x6e,0x6f,0x80]
prefetch [ %i1 + 0xf80 ], 1
- ! V8: error: instruction requires a CPU feature not currently enabled
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #n_reads
+ ! V9: prefetch [%i1+3968], #n_reads ! encoding: [0xc1,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #n_reads
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #one_read
+ ! V9: prefetch [%i1+3968], #one_read ! encoding: [0xc3,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #one_read
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #n_writes
+ ! V9: prefetch [%i1+3968], #n_writes ! encoding: [0xc5,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #n_writes
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #one_write
+ ! V9: prefetch [%i1+3968], #one_write ! encoding: [0xc7,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #one_write
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #page
+ ! V9: prefetch [%i1+3968], #page ! encoding: [0xc9,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #page
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #unified
+ ! V9: prefetch [%i1+3968], #unified ! encoding: [0xe3,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #unified
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #n_reads_strong
+ ! V9: prefetch [%i1+3968], #n_reads_strong ! encoding: [0xe9,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #n_reads_strong
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #one_read_strong
+ ! V9: prefetch [%i1+3968], #one_read_strong ! encoding: [0xeb,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #one_read_strong
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #n_writes_strong
+ ! V9: prefetch [%i1+3968], #n_writes_strong ! encoding: [0xed,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #n_writes_strong
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + 0xf80 ], #one_write_strong
+ ! V9: prefetch [%i1+3968], #one_write_strong ! encoding: [0xef,0x6e,0x6f,0x80]
+ prefetch [ %i1 + 0xf80 ], #one_write_strong
+
+ ! V8: error: invalid operand for instruction
! V8-NEXT: prefetch [ %i1 + %i2 ], 1
- ! V9: prefetch [%i1+%i2], 1 ! encoding: [0xc3,0x6e,0x40,0x1a]
+ ! V9: prefetch [%i1+%i2], #one_read ! encoding: [0xc3,0x6e,0x40,0x1a]
prefetch [ %i1 + %i2 ], 1
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #n_reads
+ ! V9: prefetch [%i1+%i2], #n_reads ! encoding: [0xc1,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #n_reads
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #one_read
+ ! V9: prefetch [%i1+%i2], #one_read ! encoding: [0xc3,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #one_read
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #n_writes
+ ! V9: prefetch [%i1+%i2], #n_writes ! encoding: [0xc5,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #n_writes
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #one_write
+ ! V9: prefetch [%i1+%i2], #one_write ! encoding: [0xc7,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #one_write
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #page
+ ! V9: prefetch [%i1+%i2], #page ! encoding: [0xc9,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #page
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #unified
+ ! V9: prefetch [%i1+%i2], #unified ! encoding: [0xe3,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #unified
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #n_reads_strong
+ ! V9: prefetch [%i1+%i2], #n_reads_strong ! encoding: [0xe9,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #n_reads_strong
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #one_read_strong
+ ! V9: prefetch [%i1+%i2], #one_read_strong ! encoding: [0xeb,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #one_read_strong
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #n_writes_strong
+ ! V9: prefetch [%i1+%i2], #n_writes_strong ! encoding: [0xed,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #n_writes_strong
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetch [ %i1 + %i2 ], #one_write_strong
+ ! V9: prefetch [%i1+%i2], #one_write_strong ! encoding: [0xef,0x6e,0x40,0x1a]
+ prefetch [ %i1 + %i2 ], #one_write_strong
+
+ ! V8: error: malformed ASI tag, must be a constant integer expression
+ ! V8-NEXT: prefetcha [ %i1 + 0xf80 ] %asi, 1
+ ! V9: prefetcha [%i1+3968] %asi, #one_read ! encoding: [0xc3,0xee,0x6f,0x80]
+ prefetcha [ %i1 + 0xf80 ] %asi, 1
+
+ ! V8: error: malformed ASI tag, must be a constant integer expression
+ ! V8-NEXT: prefetcha [ %i1 + 0xf80 ] %asi, #one_read
+ ! V9: prefetcha [%i1+3968] %asi, #one_read ! encoding: [0xc3,0xee,0x6f,0x80]
+ prefetcha [ %i1 + 0xf80 ] %asi, #one_read
+
+ ! V8: error: malformed ASI tag, must be a constant integer expression
+ ! V8-NEXT: prefetcha [ %i1 + %i2 ] #ASI_SNF, 1
+ ! V9: prefetcha [%i1+%i2] #ASI_SNF, #one_read ! encoding: [0xc3,0xee,0x50,0x7a]
+ prefetcha [ %i1 + %i2 ] #ASI_SNF, 1
+
+ ! V8: error: malformed ASI tag, must be a constant integer expression
+ ! V8-NEXT: prefetcha [ %i1 + %i2 ] #ASI_SNF, #one_read
+ ! V9: prefetcha [%i1+%i2] #ASI_SNF, #one_read ! encoding: [0xc3,0xee,0x50,0x7a]
+ prefetcha [ %i1 + %i2 ] #ASI_SNF, #one_read
+
+ ! V8: error: invalid operand for instruction
+ ! V8-NEXT: prefetcha [ %i1 + %i2 ] 131, 1
+ ! V9: prefetcha [%i1+%i2] #ASI_SNF, #one_read ! encoding: [0xc3,0xee,0x50,0x7a]
+ prefetcha [ %i1 + %i2 ] 131, 1
+
+ ! V8: error: unexpected token
+ ! V8-NEXT: prefetcha [ %i1 + %i2 ] 131, #one_read
+ ! V9: prefetcha [%i1+%i2] #ASI_SNF, #one_read ! encoding: [0xc3,0xee,0x50,0x7a]
+ prefetcha [ %i1 + %i2 ] 131, #one_read
+
! V8: error: instruction requires a CPU feature not currently enabled
! V8-NEXT: done
! V9: done ! encoding: [0x81,0xf0,0x00,0x00]
diff --git a/llvm/test/Other/constant-fold-gep.ll b/llvm/test/Other/constant-fold-gep.ll
index d4bd24d..e0d535e 100644
--- a/llvm/test/Other/constant-fold-gep.ll
+++ b/llvm/test/Other/constant-fold-gep.ll
@@ -104,7 +104,7 @@
; Fold GEP of a GEP. Very simple cases are folded without targetdata.
-; PLAIN: @Y = global ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 2)
+; PLAIN: @Y = global ptr getelementptr inbounds ([3 x { i32, i32 }], ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 1), i64 1)
; PLAIN: @Z = global ptr getelementptr inbounds (i32, ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 0, i64 1, i32 0), i64 1)
; OPT: @Y = local_unnamed_addr global ptr getelementptr inbounds (i8, ptr @ext, i64 48)
; OPT: @Z = local_unnamed_addr global ptr getelementptr inbounds (i8, ptr @ext, i64 12)
diff --git a/llvm/test/Other/lit-unicode.txt b/llvm/test/Other/lit-unicode.txt
index b375fc50..2f400014 100644
--- a/llvm/test/Other/lit-unicode.txt
+++ b/llvm/test/Other/lit-unicode.txt
@@ -1,4 +1,5 @@
FIXME: See if we can fix this in lit by using Unicode strings.
+REQUIRES: shell
RUN: echo "ようこそ" | FileCheck %s
CHECK: {{^}}ようこそ{{$}}
diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td
index 56bce5b..3af3b04 100644
--- a/llvm/test/TableGen/directive1.td
+++ b/llvm/test/TableGen/directive1.td
@@ -45,6 +45,7 @@ def TDL_DirA : Directive<"dira"> {
];
let isDefault = 1;
let association = AS_None;
+ let category = CA_Executable;
}
// CHECK: #ifndef LLVM_Tdl_INC
@@ -71,6 +72,17 @@ def TDL_DirA : Directive<"dira"> {
// CHECK-EMPTY:
// CHECK-NEXT: static constexpr std::size_t Association_enumSize = 6;
// CHECK-EMPTY:
+// CHECK-NEXT: enum class Category {
+// CHECK-NEXT: Declarative,
+// CHECK-NEXT: Executable,
+// CHECK-NEXT: Informational,
+// CHECK-NEXT: Meta,
+// CHECK-NEXT: Subsidiary,
+// CHECK-NEXT: Utility,
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: static constexpr std::size_t Category_enumSize = 6;
+// CHECK-EMPTY:
// CHECK-NEXT: enum class Directive {
// CHECK-NEXT: TDLD_dira,
// CHECK-NEXT: };
@@ -115,6 +127,7 @@ def TDL_DirA : Directive<"dira"> {
// CHECK-EMPTY:
// CHECK-NEXT: constexpr std::size_t getMaxLeafCount() { return 0; }
// CHECK-NEXT: Association getDirectiveAssociation(Directive D);
+// CHECK-NEXT: Category getDirectiveCategory(Directive D);
// CHECK-NEXT: AKind getAKind(StringRef);
// CHECK-NEXT: llvm::StringRef getTdlAKindName(AKind);
// CHECK-EMPTY:
@@ -364,7 +377,15 @@ def TDL_DirA : Directive<"dira"> {
// IMPL-NEXT: switch (Dir) {
// IMPL-NEXT: case llvm::tdl::Directive::TDLD_dira:
// IMPL-NEXT: return llvm::tdl::Association::None;
-// IMPL-NEXT: } // switch(Dir)
+// IMPL-NEXT: } // switch (Dir)
+// IMPL-NEXT: llvm_unreachable("Unexpected directive");
+// IMPL-NEXT: }
+// IMPL-EMPTY:
+// IMPL-NEXT: llvm::tdl::Category llvm::tdl::getDirectiveCategory(llvm::tdl::Directive Dir) {
+// IMPL-NEXT: switch (Dir) {
+// IMPL-NEXT: case llvm::tdl::TDLD_dira:
+// IMPL-NEXT: return llvm::tdl::Category::Executable;
+// IMPL-NEXT: } // switch (Dir)
// IMPL-NEXT: llvm_unreachable("Unexpected directive");
// IMPL-NEXT: }
// IMPL-EMPTY:
diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td
index 3c09211..209901b 100644
--- a/llvm/test/TableGen/directive2.td
+++ b/llvm/test/TableGen/directive2.td
@@ -39,6 +39,7 @@ def TDL_DirA : Directive<"dira"> {
];
let isDefault = 1;
let association = AS_Block;
+ let category = CA_Declarative;
}
// CHECK: #ifndef LLVM_Tdl_INC
@@ -62,6 +63,17 @@ def TDL_DirA : Directive<"dira"> {
// CHECK-EMPTY:
// CHECK-NEXT: static constexpr std::size_t Association_enumSize = 6;
// CHECK-EMPTY:
+// CHECK-NEXT: enum class Category {
+// CHECK-NEXT: Declarative,
+// CHECK-NEXT: Executable,
+// CHECK-NEXT: Informational,
+// CHECK-NEXT: Meta,
+// CHECK-NEXT: Subsidiary,
+// CHECK-NEXT: Utility,
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: static constexpr std::size_t Category_enumSize = 6;
+// CHECK-EMPTY:
// CHECK-NEXT: enum class Directive {
// CHECK-NEXT: TDLD_dira,
// CHECK-NEXT: };
@@ -91,6 +103,7 @@ def TDL_DirA : Directive<"dira"> {
// CHECK-EMPTY:
// CHECK-NEXT: constexpr std::size_t getMaxLeafCount() { return 0; }
// CHECK-NEXT: Association getDirectiveAssociation(Directive D);
+// CHECK-NEXT: Category getDirectiveCategory(Directive D);
// CHECK-NEXT: } // namespace tdl
// CHECK-NEXT: } // namespace llvm
// CHECK-NEXT: #endif // LLVM_Tdl_INC
@@ -295,7 +308,15 @@ def TDL_DirA : Directive<"dira"> {
// IMPL-NEXT: switch (Dir) {
// IMPL-NEXT: case llvm::tdl::Directive::TDLD_dira:
// IMPL-NEXT: return llvm::tdl::Association::Block;
-// IMPL-NEXT: } // switch(Dir)
+// IMPL-NEXT: } // switch (Dir)
+// IMPL-NEXT: llvm_unreachable("Unexpected directive");
+// IMPL-NEXT: }
+// IMPL-EMPTY:
+// IMPL-NEXT: llvm::tdl::Category llvm::tdl::getDirectiveCategory(llvm::tdl::Directive Dir) {
+// IMPL-NEXT: switch (Dir) {
+// IMPL-NEXT: case llvm::tdl::TDLD_dira:
+// IMPL-NEXT: return llvm::tdl::Category::Declarative;
+// IMPL-NEXT: } // switch (Dir)
// IMPL-NEXT: llvm_unreachable("Unexpected directive");
// IMPL-NEXT: }
// IMPL-EMPTY:
diff --git a/llvm/test/Transforms/InstCombine/2008-05-31-AddBool.ll b/llvm/test/Transforms/InstCombine/2008-05-31-AddBool.ll
index 9bc0260..707c34b 100644
--- a/llvm/test/Transforms/InstCombine/2008-05-31-AddBool.ll
+++ b/llvm/test/Transforms/InstCombine/2008-05-31-AddBool.ll
@@ -1,9 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
; PR2389
-; CHECK: xor
-
define i1 @test(i1 %a, i1 %b) {
- %A = add i1 %a, %b
- ret i1 %A
+; CHECK-LABEL: define i1 @test(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = xor i1 [[A]], [[B]]
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %r = add i1 %a, %b
+ ret i1 %r
}
diff --git a/llvm/test/Transforms/InstCombine/2008-05-31-Bools.ll b/llvm/test/Transforms/InstCombine/2008-05-31-Bools.ll
index e7dd74b..fa7c542 100644
--- a/llvm/test/Transforms/InstCombine/2008-05-31-Bools.ll
+++ b/llvm/test/Transforms/InstCombine/2008-05-31-Bools.ll
@@ -1,24 +1,40 @@
-; RUN: opt < %s -passes=instcombine -S > %t
-; RUN: grep "xor" %t
-; RUN: grep "and" %t
-; RUN: not grep "div" %t
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
define i1 @foo1(i1 %a, i1 %b) {
- %A = sub i1 %a, %b
- ret i1 %A
+; CHECK-LABEL: define i1 @foo1(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = xor i1 [[B]], [[A]]
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %r = sub i1 %a, %b
+ ret i1 %r
}
define i1 @foo2(i1 %a, i1 %b) {
- %A = mul i1 %a, %b
- ret i1 %A
+; CHECK-LABEL: define i1 @foo2(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = and i1 [[A]], [[B]]
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %r = mul i1 %a, %b
+ ret i1 %r
}
define i1 @foo3(i1 %a, i1 %b) {
- %A = udiv i1 %a, %b
- ret i1 %A
+; CHECK-LABEL: define i1 @foo3(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT: ret i1 [[A]]
+;
+ %r = udiv i1 %a, %b
+ ret i1 %r
}
define i1 @foo4(i1 %a, i1 %b) {
- %A = sdiv i1 %a, %b
- ret i1 %A
+; CHECK-LABEL: define i1 @foo4(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT: ret i1 [[A]]
+;
+ %r = sdiv i1 %a, %b
+ ret i1 %r
}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
index 4566865..7e9d122 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
@@ -2,423 +2,6 @@
; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s
; --------------------------------------------------------------------
-; llvm.amdgcn.buffer.load
-; --------------------------------------------------------------------
-
-define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret float %data
-}
-
-define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v1f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <1 x float> [[DATA]]
-;
- %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <1 x float> %data
-}
-
-define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <2 x float> %data
-}
-
-define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <4 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <2 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 3
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0
-; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1
-; CHECK-NEXT: ret { float, float } [[INS1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt1 = extractelement <4 x float> %data, i32 1
- %ins0 = insertvalue { float, float } undef, float %elt0, 0
- %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
- ret { float, float } %ins1
-}
-
-define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 1
-; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0
-; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1
-; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2
-; CHECK-NEXT: ret { float, float, float } [[INS2]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt1 = extractelement <4 x float> %data, i32 1
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertvalue { float, float, float } undef, float %elt0, 0
- %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1
- %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2
- ret { float, float, float } %ins2
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 poison, i32 1>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> poison, float %elt0, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 4, i32 1>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> poison, float %elt0, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> poison, <4 x float> %data, <2 x i32> <i32 5, i32 1>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 2, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> poison, float %elt2, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> %data, <4 x float> %data, <2 x i32> <i32 0, i32 5>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <3 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath [[META0:![0-9]+]]
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-; --------------------------------------------------------------------
-; llvm.amdgcn.buffer.load.format
-; --------------------------------------------------------------------
-
-define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_format_v1f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true)
-; CHECK-NEXT: ret <1 x float> [[DATA]]
-;
- %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
- ret <1 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-; The initial insertion point is at the extractelement
-define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double>
-; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i64 0
-; CHECK-NEXT: ret double [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <2 x double>
- %var2 = extractelement <2 x double> %var1, i32 0
- ret double %var2
-}
-
-define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <4 x i32>
- %var2 = extractelement <4 x i32> %var1, i32 0
- ret i32 %var2
-}
-
-define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16
-; CHECK-NEXT: ret i16 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <8 x i16>
- %var2 = extractelement <8 x i16> %var1, i32 0
- ret i16 %var2
-}
-
-declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-; --------------------------------------------------------------------
; llvm.amdgcn.raw.buffer.load
; --------------------------------------------------------------------
@@ -665,7 +248,7 @@ define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32
define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0]]
+; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0:![0-9]+]]
; CHECK-NEXT: ret float [[DATA]]
;
%data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
@@ -4497,248 +4080,6 @@ declare <4 x float> @llvm.amdgcn.struct.ptr.tbuffer.load.v4f32(ptr addrspace(8),
declare <4 x i32> @llvm.amdgcn.struct.ptr.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32) #1
; --------------------------------------------------------------------
-; llvm.amdgcn.tbuffer.load
-; --------------------------------------------------------------------
-
-define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret float %data
-}
-
-define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret <2 x float> %data
-}
-
-define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret <4 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <2 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 3
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <3 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %var1 = bitcast <4 x float> %var to <4 x i32>
- %var2 = extractelement <4 x i32> %var1, i32 0
- ret i32 %var2
-}
-
-define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath [[META0]]
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-declare float @llvm.amdgcn.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-
-declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-
-; --------------------------------------------------------------------
; llvm.amdgcn.image.sample
; --------------------------------------------------------------------
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
index 598175b..af12367 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -2,423 +2,6 @@
; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s
; --------------------------------------------------------------------
-; llvm.amdgcn.buffer.load
-; --------------------------------------------------------------------
-
-define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret float %data
-}
-
-define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v1f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <1 x float> [[DATA]]
-;
- %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <1 x float> %data
-}
-
-define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <2 x float> %data
-}
-
-define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <4 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <2 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 3
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0
-; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1
-; CHECK-NEXT: ret { float, float } [[INS1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt1 = extractelement <4 x float> %data, i32 1
- %ins0 = insertvalue { float, float } undef, float %elt0, 0
- %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
- ret { float, float } %ins1
-}
-
-define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 1
-; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0
-; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1
-; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2
-; CHECK-NEXT: ret { float, float, float } [[INS2]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt1 = extractelement <4 x float> %data, i32 1
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertvalue { float, float, float } undef, float %elt0, 0
- %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1
- %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2
- ret { float, float, float } %ins2
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 poison, i32 1>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> undef, float %elt0, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 4, i32 1>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> undef, float %elt0, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> poison, <4 x float> %data, <2 x i32> <i32 5, i32 1>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 2, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> undef, float %elt2, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> %data, <4 x float> %data, <2 x i32> <i32 0, i32 5>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <3 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath [[META0:![0-9]+]]
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-; --------------------------------------------------------------------
-; llvm.amdgcn.buffer.load.format
-; --------------------------------------------------------------------
-
-define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_format_v1f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true)
-; CHECK-NEXT: ret <1 x float> [[DATA]]
-;
- %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
- ret <1 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-; The initial insertion point is at the extractelement
-define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double>
-; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i64 0
-; CHECK-NEXT: ret double [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <2 x double>
- %var2 = extractelement <2 x double> %var1, i32 0
- ret double %var2
-}
-
-define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <4 x i32>
- %var2 = extractelement <4 x i32> %var1, i32 0
- ret i32 %var2
-}
-
-define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16
-; CHECK-NEXT: ret i16 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <8 x i16>
- %var2 = extractelement <8 x i16> %var1, i32 0
- ret i16 %var2
-}
-
-declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-; --------------------------------------------------------------------
; llvm.amdgcn.raw.buffer.load
; --------------------------------------------------------------------
@@ -665,7 +248,7 @@ define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32
define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0]]
+; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0:![0-9]+]]
; CHECK-NEXT: ret float [[DATA]]
;
%data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
@@ -4496,248 +4079,6 @@ declare <4 x float> @llvm.amdgcn.struct.ptr.tbuffer.load.v4f32(ptr addrspace(8),
declare <4 x i32> @llvm.amdgcn.struct.ptr.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32) #1
; --------------------------------------------------------------------
-; llvm.amdgcn.tbuffer.load
-; --------------------------------------------------------------------
-
-define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret float %data
-}
-
-define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret <2 x float> %data
-}
-
-define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret <4 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <2 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 3
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <3 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %var1 = bitcast <4 x float> %var to <4 x i32>
- %var2 = extractelement <4 x i32> %var1, i32 0
- ret i32 %var2
-}
-
-define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath [[META0]]
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-declare float @llvm.amdgcn.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-
-declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-
-; --------------------------------------------------------------------
; llvm.amdgcn.image.sample
; --------------------------------------------------------------------
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
index 9cef4a3..65c8961 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -72,33 +72,6 @@ define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> in
ret void
}
-define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %a, float %vdata1, i32 %b) {
-; GCN-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GCN-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[VDATA1:%.*]], i64 0
-; GCN-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
-; GCN-NEXT: call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
-; GCN-NEXT: ret void
-;
-; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GFX12-NEXT: [[TMP1:%.*]] = insertelement <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1
-; GFX12-NEXT: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
-; GFX12-NEXT: ret void
-;
-; GFXUNKNOWN-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GFXUNKNOWN-NEXT: [[TMP1:%.*]] = insertelement <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
-; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1
-; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
-; GFXUNKNOWN-NEXT: ret void
-;
- %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
- %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
- %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
- %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i1 0, i1 0)
- ret void
-}
-
define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg %a, float %vdata1, i32 %b) {
; GCN-LABEL: @struct_buffer_store_format_insert_zeros(
; GCN-NEXT: [[TMP1:%.*]] = insertelement <3 x float> <float poison, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 0
@@ -304,11 +277,9 @@ define amdgpu_ps void @struct_tbuffer_store_argument_insert_first(<4 x i32> inre
}
declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2
-declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2
declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
-declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 25087fe..239e146 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -1510,6 +1510,46 @@ define i8 @add_like_or_t2_extrause(i8 %x) {
%r = add i8 %i1, 42
ret i8 %r
}
+define i8 @fold_add_constant_preserve_nsw(i8 %x) {
+; CHECK-LABEL: @fold_add_constant_preserve_nsw(
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i8 [[X:%.*]], -120
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %or = or disjoint i8 %x, -128
+ %add = add nsw i8 %or, 8
+ ret i8 %add
+}
+define i8 @fold_add_constant_no_nsw(i8 %x) {
+; CHECK-LABEL: @fold_add_constant_no_nsw(
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[X:%.*]], 120
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %or = or disjoint i8 %x, -128
+ %add = add nsw i8 %or, -8
+ ret i8 %add
+}
+define i8 @fold_add_constant_preserve_nuw(i8 %x) {
+; CHECK-LABEL: @fold_add_constant_preserve_nuw(
+; CHECK-NEXT: [[ADD:%.*]] = add nuw i8 [[X:%.*]], -116
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %or = or disjoint i8 %x, 128
+ %add = add nuw i8 %or, 12
+ ret i8 %add
+}
+define i32 @sdiv_to_udiv(i32 %arg0, i32 %arg1) {
+; CHECK-LABEL: @sdiv_to_udiv(
+; CHECK-NEXT: [[T0:%.*]] = shl nuw nsw i32 [[ARG0:%.*]], 8
+; CHECK-NEXT: [[T2:%.*]] = add nuw nsw i32 [[T0]], 6242049
+; CHECK-NEXT: [[T3:%.*]] = udiv i32 [[T2]], 192
+; CHECK-NEXT: ret i32 [[T3]]
+;
+ %t0 = shl nuw nsw i32 %arg0, 8
+ %t1 = or disjoint i32 %t0, 1
+ %t2 = add nuw nsw i32 %t1, 6242048
+ %t3 = sdiv i32 %t2, 192
+ ret i32 %t3
+}
define i8 @add_like_or_disjoint(i8 %x) {
; CHECK-LABEL: @add_like_or_disjoint(
diff --git a/llvm/test/Transforms/InstCombine/and-compare.ll b/llvm/test/Transforms/InstCombine/and-compare.ll
index 14379eb..5a9767a 100644
--- a/llvm/test/Transforms/InstCombine/and-compare.ll
+++ b/llvm/test/Transforms/InstCombine/and-compare.ll
@@ -4,6 +4,8 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+declare void @use.i8(i8)
+
; Should be optimized to one and.
define i1 @test1(i32 %a, i32 %b) {
; CHECK-LABEL: @test1(
@@ -75,3 +77,98 @@ define <2 x i1> @test3vec(<2 x i64> %A) {
ret <2 x i1> %cmp
}
+define i1 @test_ne_cp2(i8 %x, i8 %yy) {
+; CHECK-LABEL: @test_ne_cp2(
+; CHECK-NEXT: [[AND_X_NEG_Y:%.*]] = and i8 [[X:%.*]], -16
+; CHECK-NEXT: [[AND_X_Y:%.*]] = and i8 [[X]], 16
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_NEG_Y]])
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_Y]])
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], 31
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %and_x_neg_y = and i8 %x, -16
+ %and_x_y = and i8 %x, 16
+ call void @use.i8(i8 %and_x_neg_y)
+ call void @use.i8(i8 %and_x_y)
+ %r = icmp ne i8 %and_x_neg_y, %and_x_y
+ ret i1 %r
+}
+
+define i1 @test_ne_cp2_2(i8 %x, i8 %yy) {
+; CHECK-LABEL: @test_ne_cp2_2(
+; CHECK-NEXT: [[AND_X_NEG_Y:%.*]] = and i8 [[X:%.*]], -4
+; CHECK-NEXT: [[AND_X_Y:%.*]] = and i8 [[X]], 4
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_NEG_Y]])
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_Y]])
+; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[X]], 8
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %and_x_neg_y = and i8 %x, -4
+ %and_x_y = and i8 %x, 4
+ call void @use.i8(i8 %and_x_neg_y)
+ call void @use.i8(i8 %and_x_y)
+ %r = icmp eq i8 %and_x_y, %and_x_neg_y
+ ret i1 %r
+}
+
+define i1 @test_ne_cp2_other_okay_all_ones(i8 %x, i8 %yy) {
+; CHECK-LABEL: @test_ne_cp2_other_okay_all_ones(
+; CHECK-NEXT: [[AND_X_NEG_Y:%.*]] = and i8 [[X:%.*]], -17
+; CHECK-NEXT: [[AND_X_Y:%.*]] = and i8 [[X]], 16
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_NEG_Y]])
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_Y]])
+; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X]], 0
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %and_x_neg_y = and i8 %x, -17
+ %and_x_y = and i8 %x, 16
+ call void @use.i8(i8 %and_x_neg_y)
+ call void @use.i8(i8 %and_x_y)
+ %r = icmp ne i8 %and_x_neg_y, %and_x_y
+ ret i1 %r
+}
+
+define i1 @test_ne_cp2_other_fail2(i8 %x, i8 %yy) {
+; CHECK-LABEL: @test_ne_cp2_other_fail2(
+; CHECK-NEXT: [[AND_X_NEG_Y:%.*]] = and i8 [[X:%.*]], -16
+; CHECK-NEXT: [[AND_X_Y:%.*]] = and i8 [[X]], 17
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_NEG_Y]])
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_Y]])
+; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND_X_NEG_Y]], [[AND_X_Y]]
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %and_x_neg_y = and i8 %x, -16
+ %and_x_y = and i8 %x, 17
+ call void @use.i8(i8 %and_x_neg_y)
+ call void @use.i8(i8 %and_x_y)
+ %r = icmp ne i8 %and_x_neg_y, %and_x_y
+ ret i1 %r
+}
+
+define i1 @test_ne_cp2_other_okay(i8 %x, i8 %yy) {
+; CHECK-LABEL: @test_ne_cp2_other_okay(
+; CHECK-NEXT: [[AND_X_Y:%.*]] = and i8 [[X:%.*]], 16
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_Y]])
+; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X]], 0
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %and_x_neg_y = and i8 %x, -17
+ %and_x_y = and i8 %x, 16
+ call void @use.i8(i8 %and_x_y)
+ %r = icmp ne i8 %and_x_neg_y, %and_x_y
+ ret i1 %r
+}
+
+define i1 @test_ne_cp2_other_okay2(i8 %x, i8 %yy) {
+; CHECK-LABEL: @test_ne_cp2_other_okay2(
+; CHECK-NEXT: [[AND_X_Y:%.*]] = and i8 [[X:%.*]], 16
+; CHECK-NEXT: call void @use.i8(i8 [[AND_X_Y]])
+; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X]], 0
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %and_x_neg_y = and i8 %x, -17
+ %and_x_y = and i8 %x, 16
+ call void @use.i8(i8 %and_x_y)
+ %r = icmp ne i8 %and_x_y, %and_x_neg_y
+ ret i1 %r
+}
diff --git a/llvm/test/Transforms/InstCombine/exp2-1.ll b/llvm/test/Transforms/InstCombine/exp2-1.ll
index 2dff0b0..d8bd0a4 100644
--- a/llvm/test/Transforms/InstCombine/exp2-1.ll
+++ b/llvm/test/Transforms/InstCombine/exp2-1.ll
@@ -242,8 +242,8 @@ define double @test_simplify9(i8 zeroext %x) {
; NOLDEXPF-NEXT: ret double [[RET]]
;
; NOLDEXP-LABEL: @test_simplify9(
-; NOLDEXP-NEXT: [[CONV:%.*]] = uitofp i8 [[X:%.*]] to double
-; NOLDEXP-NEXT: [[RET:%.*]] = call double @llvm.exp2.f64(double [[CONV]])
+; NOLDEXP-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
+; NOLDEXP-NEXT: [[RET:%.*]] = call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 [[TMP1]])
; NOLDEXP-NEXT: ret double [[RET]]
;
%conv = uitofp i8 %x to double
@@ -263,13 +263,13 @@ define float @test_simplify10(i8 zeroext %x) {
; LDEXP16-NEXT: ret float [[RET]]
;
; NOLDEXPF-LABEL: @test_simplify10(
-; NOLDEXPF-NEXT: [[CONV:%.*]] = uitofp i8 [[X:%.*]] to float
-; NOLDEXPF-NEXT: [[RET:%.*]] = call float @llvm.exp2.f32(float [[CONV]])
+; NOLDEXPF-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
+; NOLDEXPF-NEXT: [[RET:%.*]] = call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
; NOLDEXPF-NEXT: ret float [[RET]]
;
; NOLDEXP-LABEL: @test_simplify10(
-; NOLDEXP-NEXT: [[CONV:%.*]] = uitofp i8 [[X:%.*]] to float
-; NOLDEXP-NEXT: [[RET:%.*]] = call float @llvm.exp2.f32(float [[CONV]])
+; NOLDEXP-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
+; NOLDEXP-NEXT: [[RET:%.*]] = call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
; NOLDEXP-NEXT: ret float [[RET]]
;
%conv = uitofp i8 %x to float
@@ -289,13 +289,13 @@ define float @sitofp_scalar_intrinsic_with_FMF(i8 %x) {
; LDEXP16-NEXT: ret float [[R]]
;
; NOLDEXPF-LABEL: @sitofp_scalar_intrinsic_with_FMF(
-; NOLDEXPF-NEXT: [[S:%.*]] = sitofp i8 [[X:%.*]] to float
-; NOLDEXPF-NEXT: [[R:%.*]] = tail call nnan float @llvm.exp2.f32(float [[S]])
+; NOLDEXPF-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
+; NOLDEXPF-NEXT: [[R:%.*]] = tail call nnan float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
; NOLDEXPF-NEXT: ret float [[R]]
;
; NOLDEXP-LABEL: @sitofp_scalar_intrinsic_with_FMF(
-; NOLDEXP-NEXT: [[S:%.*]] = sitofp i8 [[X:%.*]] to float
-; NOLDEXP-NEXT: [[R:%.*]] = tail call nnan float @llvm.exp2.f32(float [[S]])
+; NOLDEXP-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
+; NOLDEXP-NEXT: [[R:%.*]] = tail call nnan float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
; NOLDEXP-NEXT: ret float [[R]]
;
%s = sitofp i8 %x to float
@@ -317,9 +317,14 @@ define <2 x float> @sitofp_vector_intrinsic_with_FMF(<2 x i8> %x) {
; LDEXP16-NEXT: [[R:%.*]] = call nnan <2 x float> @llvm.ldexp.v2f32.v2i16(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i16> [[TMP1]])
; LDEXP16-NEXT: ret <2 x float> [[R]]
;
+; NOLDEXPF-LABEL: @sitofp_vector_intrinsic_with_FMF(
+; NOLDEXPF-NEXT: [[TMP1:%.*]] = sext <2 x i8> [[X:%.*]] to <2 x i32>
+; NOLDEXPF-NEXT: [[R:%.*]] = call nnan <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[TMP1]])
+; NOLDEXPF-NEXT: ret <2 x float> [[R]]
+;
; NOLDEXP-LABEL: @sitofp_vector_intrinsic_with_FMF(
-; NOLDEXP-NEXT: [[S:%.*]] = sitofp <2 x i8> [[X:%.*]] to <2 x float>
-; NOLDEXP-NEXT: [[R:%.*]] = call nnan <2 x float> @llvm.exp2.v2f32(<2 x float> [[S]])
+; NOLDEXP-NEXT: [[TMP1:%.*]] = sext <2 x i8> [[X:%.*]] to <2 x i32>
+; NOLDEXP-NEXT: [[R:%.*]] = call nnan <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[TMP1]])
; NOLDEXP-NEXT: ret <2 x float> [[R]]
;
%s = sitofp <2 x i8> %x to <2 x float>
diff --git a/llvm/test/Transforms/InstCombine/exp2-to-ldexp.ll b/llvm/test/Transforms/InstCombine/exp2-to-ldexp.ll
index 6e5be5a..9690201 100644
--- a/llvm/test/Transforms/InstCombine/exp2-to-ldexp.ll
+++ b/llvm/test/Transforms/InstCombine/exp2-to-ldexp.ll
@@ -1,19 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S -passes=instcombine %s | FileCheck -check-prefixes=CHECK,LDEXP %s
-; RUN: opt -S -passes=instcombine -disable-builtin=ldexpf -disable-builtin=ldexp -disable-builtin=ldexpl %s | FileCheck -check-prefixes=CHECK,NOLDEXP %s
+; RUN: opt -S -passes=instcombine %s | FileCheck %s
+; RUN: opt -S -passes=instcombine -disable-builtin=ldexpf -disable-builtin=ldexp -disable-builtin=ldexpl %s | FileCheck %s
define float @exp2_f32_sitofp_i8(i8 %x) {
-; LDEXP-LABEL: define float @exp2_f32_sitofp_i8(
-; LDEXP-SAME: i8 [[X:%.*]]) {
-; LDEXP-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32
-; LDEXP-NEXT: [[LDEXPF:%.*]] = call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
-; LDEXP-NEXT: ret float [[LDEXPF]]
-;
-; NOLDEXP-LABEL: define float @exp2_f32_sitofp_i8(
-; NOLDEXP-SAME: i8 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i8 [[X]] to float
-; NOLDEXP-NEXT: [[EXP2:%.*]] = call float @llvm.exp2.f32(float [[ITOFP]])
-; NOLDEXP-NEXT: ret float [[EXP2]]
+; CHECK-LABEL: define float @exp2_f32_sitofp_i8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32
+; CHECK-NEXT: [[EXP2:%.*]] = call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT: ret float [[EXP2]]
;
%itofp = sitofp i8 %x to float
%exp2 = call float @llvm.exp2.f32(float %itofp)
@@ -21,17 +15,11 @@ define float @exp2_f32_sitofp_i8(i8 %x) {
}
define float @exp2_f32_sitofp_i8_flags(i8 %x) {
-; LDEXP-LABEL: define float @exp2_f32_sitofp_i8_flags(
-; LDEXP-SAME: i8 [[X:%.*]]) {
-; LDEXP-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32
-; LDEXP-NEXT: [[LDEXPF:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
-; LDEXP-NEXT: ret float [[LDEXPF]]
-;
-; NOLDEXP-LABEL: define float @exp2_f32_sitofp_i8_flags(
-; NOLDEXP-SAME: i8 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i8 [[X]] to float
-; NOLDEXP-NEXT: [[EXP2:%.*]] = call nnan ninf float @llvm.exp2.f32(float [[ITOFP]])
-; NOLDEXP-NEXT: ret float [[EXP2]]
+; CHECK-LABEL: define float @exp2_f32_sitofp_i8_flags(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32
+; CHECK-NEXT: [[EXP2:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT: ret float [[EXP2]]
;
%itofp = sitofp i8 %x to float
%exp2 = call nnan ninf float @llvm.exp2.f32(float %itofp)
@@ -39,17 +27,11 @@ define float @exp2_f32_sitofp_i8_flags(i8 %x) {
}
define <2 x float> @exp2_v2f32_sitofp_v2i8(<2 x i8> %x) {
-; LDEXP-LABEL: define <2 x float> @exp2_v2f32_sitofp_v2i8(
-; LDEXP-SAME: <2 x i8> [[X:%.*]]) {
-; LDEXP-NEXT: [[TMP1:%.*]] = sext <2 x i8> [[X]] to <2 x i32>
-; LDEXP-NEXT: [[EXP2:%.*]] = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[TMP1]])
-; LDEXP-NEXT: ret <2 x float> [[EXP2]]
-;
-; NOLDEXP-LABEL: define <2 x float> @exp2_v2f32_sitofp_v2i8(
-; NOLDEXP-SAME: <2 x i8> [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp <2 x i8> [[X]] to <2 x float>
-; NOLDEXP-NEXT: [[EXP2:%.*]] = call <2 x float> @llvm.exp2.v2f32(<2 x float> [[ITOFP]])
-; NOLDEXP-NEXT: ret <2 x float> [[EXP2]]
+; CHECK-LABEL: define <2 x float> @exp2_v2f32_sitofp_v2i8(
+; CHECK-SAME: <2 x i8> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i8> [[X]] to <2 x i32>
+; CHECK-NEXT: [[EXP2:%.*]] = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x float> [[EXP2]]
;
%itofp = sitofp <2 x i8> %x to <2 x float>
%exp2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %itofp)
@@ -57,17 +39,11 @@ define <2 x float> @exp2_v2f32_sitofp_v2i8(<2 x i8> %x) {
}
define float @exp2_f32_uitofp_i8(i8 %x) {
-; LDEXP-LABEL: define float @exp2_f32_uitofp_i8(
-; LDEXP-SAME: i8 [[X:%.*]]) {
-; LDEXP-NEXT: [[TMP1:%.*]] = zext i8 [[X]] to i32
-; LDEXP-NEXT: [[LDEXPF:%.*]] = call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
-; LDEXP-NEXT: ret float [[LDEXPF]]
-;
-; NOLDEXP-LABEL: define float @exp2_f32_uitofp_i8(
-; NOLDEXP-SAME: i8 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = uitofp i8 [[X]] to float
-; NOLDEXP-NEXT: [[EXP2:%.*]] = call float @llvm.exp2.f32(float [[ITOFP]])
-; NOLDEXP-NEXT: ret float [[EXP2]]
+; CHECK-LABEL: define float @exp2_f32_uitofp_i8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT: [[EXP2:%.*]] = call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT: ret float [[EXP2]]
;
%itofp = uitofp i8 %x to float
%exp2 = call float @llvm.exp2.f32(float %itofp)
@@ -77,8 +53,8 @@ define float @exp2_f32_uitofp_i8(i8 %x) {
define half @exp2_f16_sitofp_i8(i8 %x) {
; CHECK-LABEL: define half @exp2_f16_sitofp_i8(
; CHECK-SAME: i8 [[X:%.*]]) {
-; CHECK-NEXT: [[ITOFP:%.*]] = sitofp i8 [[X]] to half
-; CHECK-NEXT: [[EXP2:%.*]] = call half @llvm.exp2.f16(half [[ITOFP]])
+; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32
+; CHECK-NEXT: [[EXP2:%.*]] = call half @llvm.ldexp.f16.i32(half 0xH3C00, i32 [[TMP1]])
; CHECK-NEXT: ret half [[EXP2]]
;
%itofp = sitofp i8 %x to half
@@ -87,17 +63,11 @@ define half @exp2_f16_sitofp_i8(i8 %x) {
}
define double @exp2_f64_sitofp_i8(i8 %x) {
-; LDEXP-LABEL: define double @exp2_f64_sitofp_i8(
-; LDEXP-SAME: i8 [[X:%.*]]) {
-; LDEXP-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32
-; LDEXP-NEXT: [[LDEXP:%.*]] = call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 [[TMP1]])
-; LDEXP-NEXT: ret double [[LDEXP]]
-;
-; NOLDEXP-LABEL: define double @exp2_f64_sitofp_i8(
-; NOLDEXP-SAME: i8 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i8 [[X]] to double
-; NOLDEXP-NEXT: [[EXP2:%.*]] = call double @llvm.exp2.f64(double [[ITOFP]])
-; NOLDEXP-NEXT: ret double [[EXP2]]
+; CHECK-LABEL: define double @exp2_f64_sitofp_i8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32
+; CHECK-NEXT: [[EXP2:%.*]] = call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 [[TMP1]])
+; CHECK-NEXT: ret double [[EXP2]]
;
%itofp = sitofp i8 %x to double
%exp2 = call double @llvm.exp2.f64(double %itofp)
@@ -105,17 +75,11 @@ define double @exp2_f64_sitofp_i8(i8 %x) {
}
define fp128 @exp2_fp128_sitofp_i8(i8 %x) {
-; LDEXP-LABEL: define fp128 @exp2_fp128_sitofp_i8(
-; LDEXP-SAME: i8 [[X:%.*]]) {
-; LDEXP-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32
-; LDEXP-NEXT: [[LDEXPL:%.*]] = call fp128 @llvm.ldexp.f128.i32(fp128 0xL00000000000000003FFF000000000000, i32 [[TMP1]])
-; LDEXP-NEXT: ret fp128 [[LDEXPL]]
-;
-; NOLDEXP-LABEL: define fp128 @exp2_fp128_sitofp_i8(
-; NOLDEXP-SAME: i8 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i8 [[X]] to fp128
-; NOLDEXP-NEXT: [[EXP2:%.*]] = call fp128 @llvm.exp2.f128(fp128 [[ITOFP]])
-; NOLDEXP-NEXT: ret fp128 [[EXP2]]
+; CHECK-LABEL: define fp128 @exp2_fp128_sitofp_i8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32
+; CHECK-NEXT: [[EXP2:%.*]] = call fp128 @llvm.ldexp.f128.i32(fp128 0xL00000000000000003FFF000000000000, i32 [[TMP1]])
+; CHECK-NEXT: ret fp128 [[EXP2]]
;
%itofp = sitofp i8 %x to fp128
%exp2 = call fp128 @llvm.exp2.fp128(fp128 %itofp)
@@ -123,17 +87,11 @@ define fp128 @exp2_fp128_sitofp_i8(i8 %x) {
}
define <vscale x 4 x float> @exp2_nxv4f32_sitofp_i8(<vscale x 4 x i8> %x) {
-; LDEXP-LABEL: define <vscale x 4 x float> @exp2_nxv4f32_sitofp_i8(
-; LDEXP-SAME: <vscale x 4 x i8> [[X:%.*]]) {
-; LDEXP-NEXT: [[TMP1:%.*]] = sext <vscale x 4 x i8> [[X]] to <vscale x 4 x i32>
-; LDEXP-NEXT: [[EXP2:%.*]] = call <vscale x 4 x float> @llvm.ldexp.nxv4f32.nxv4i32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[TMP1]])
-; LDEXP-NEXT: ret <vscale x 4 x float> [[EXP2]]
-;
-; NOLDEXP-LABEL: define <vscale x 4 x float> @exp2_nxv4f32_sitofp_i8(
-; NOLDEXP-SAME: <vscale x 4 x i8> [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp <vscale x 4 x i8> [[X]] to <vscale x 4 x float>
-; NOLDEXP-NEXT: [[EXP2:%.*]] = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> [[ITOFP]])
-; NOLDEXP-NEXT: ret <vscale x 4 x float> [[EXP2]]
+; CHECK-LABEL: define <vscale x 4 x float> @exp2_nxv4f32_sitofp_i8(
+; CHECK-SAME: <vscale x 4 x i8> [[X:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = sext <vscale x 4 x i8> [[X]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[EXP2:%.*]] = call <vscale x 4 x float> @llvm.ldexp.nxv4f32.nxv4i32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[TMP1]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[EXP2]]
;
%itofp = sitofp <vscale x 4 x i8> %x to <vscale x 4 x float>
%exp2 = call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %itofp)
diff --git a/llvm/test/Transforms/InstCombine/gepgep.ll b/llvm/test/Transforms/InstCombine/gepgep.ll
index d2a0e1d..6b6a159 100644
--- a/llvm/test/Transforms/InstCombine/gepgep.ll
+++ b/llvm/test/Transforms/InstCombine/gepgep.ll
@@ -10,7 +10,7 @@ declare void @use(ptr)
define void @f() {
; CHECK-LABEL: define void @f() {
-; CHECK-NEXT: call void @use(ptr getelementptr (i8, ptr @buffer, i64 add (i64 sub (i64 0, i64 ptrtoint (ptr @buffer to i64)), i64 127)))
+; CHECK-NEXT: call void @use(ptr getelementptr (i8, ptr getelementptr (i8, ptr @buffer, i64 add (i64 sub (i64 0, i64 ptrtoint (ptr @buffer to i64)), i64 63)), i64 64))
; CHECK-NEXT: ret void
;
call void @use(ptr getelementptr (i8, ptr getelementptr (i8, ptr @buffer, i64 add (i64 sub (i64 0, i64 ptrtoint (ptr @buffer to i64)), i64 63)), i64 64))
diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll
index d5cfb74..8719136 100644
--- a/llvm/test/Transforms/InstCombine/getelementptr.ll
+++ b/llvm/test/Transforms/InstCombine/getelementptr.ll
@@ -1716,10 +1716,9 @@ if.else:
@g = external global i8
-; FIXME: This is a miscompile
define ptr @constexpr_gep_of_gep_with_narrow_type() {
; CHECK-LABEL: @constexpr_gep_of_gep_with_narrow_type(
-; CHECK-NEXT: ret ptr getelementptr (i8, ptr @g, i64 -2)
+; CHECK-NEXT: ret ptr getelementptr (i8, ptr @g, i64 254)
;
ret ptr getelementptr (i8, ptr getelementptr (i8, ptr @g, i8 127), i8 127)
}
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 4dbe2fc..8fc4a40 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -5255,6 +5255,16 @@ define i1 @test_icmp_shl_nuw(i64 %x) {
ret i1 %cmp
}
+define i1 @test_icmp_shl_nuw_i31(i31 %x) {
+; CHECK-LABEL: @test_icmp_shl_nuw_i31(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i31 [[X:%.*]], 250
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nuw i31 %x, 23
+ %cmp = icmp ugt i31 %shl, -50331648
+ ret i1 %cmp
+}
+
define i1 @test_icmp_shl_nsw(i64 %x) {
; CHECK-LABEL: @test_icmp_shl_nsw(
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X:%.*]], 3
@@ -5265,6 +5275,17 @@ define i1 @test_icmp_shl_nsw(i64 %x) {
ret i1 %cmp
}
+define i1 @test_icmp_shl_nsw_i31(i31 %x) {
+; CHECK-LABEL: @test_icmp_shl_nsw_i31(
+; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i31 [[X:%.*]] to i8
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[TMP1]], -6
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nsw i31 %x, 23
+ %cmp = icmp ugt i31 %shl, -50331648
+ ret i1 %cmp
+}
+
define <2 x i1> @test_icmp_shl_vec(<2 x i64> %x) {
; CHECK-LABEL: @test_icmp_shl_vec(
; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[X:%.*]] to <2 x i32>
@@ -5295,3 +5316,13 @@ define i1 @test_icmp_shl_sgt(i64 %x) {
%cmp = icmp sgt i64 %shl, 8589934591
ret i1 %cmp
}
+
+define i1 @pr94897(i32 range(i32 -2147483648, 0) %x) {
+; CHECK-LABEL: @pr94897(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X:%.*]], -3
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shl = shl nsw i32 %x, 24
+ %cmp = icmp ugt i32 %shl, -50331648
+ ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/ldexp-zext.ll b/llvm/test/Transforms/InstCombine/ldexp-zext.ll
new file mode 100644
index 0000000..b6e4f12
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/ldexp-zext.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define float @ldexp_zext_float(float %x, i1 %bool) {
+; CHECK-LABEL: @ldexp_zext_float(
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], float 2.000000e+00, float 1.000000e+00
+; CHECK-NEXT: [[LDEXP:%.*]] = fmul float [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: ret float [[LDEXP]]
+;
+ %zext = zext i1 %bool to i32
+ %ldexp = call float @llvm.ldexp.f32.i32(float %x, i32 %zext)
+ ret float %ldexp
+}
+
+define float @ldexp_zext_float_negative(float %x, i8 %y) {
+; CHECK-LABEL: @ldexp_zext_float_negative(
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-NEXT: [[LDEXP:%.*]] = call float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[ZEXT]])
+; CHECK-NEXT: ret float [[LDEXP]]
+;
+ %zext = zext i8 %y to i32
+ %ldexp = call float @llvm.ldexp.f32.i32(float %x, i32 %zext)
+ ret float %ldexp
+}
+
+define double @ldexp_zext_double(double %x, i1 %bool) {
+; CHECK-LABEL: @ldexp_zext_double(
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 2.000000e+00, double 1.000000e+00
+; CHECK-NEXT: [[LDEXP:%.*]] = fmul double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: ret double [[LDEXP]]
+;
+ %zext = zext i1 %bool to i32
+ %ldexp = call double @llvm.ldexp.f64.i32(double %x, i32 %zext)
+ ret double %ldexp
+}
+
+define double @ldexp_zext_double_fast_math(double %x, i1 %bool) {
+; CHECK-LABEL: @ldexp_zext_double_fast_math(
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 2.000000e+00, double 1.000000e+00
+; CHECK-NEXT: [[LDEXP:%.*]] = fmul reassoc double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: ret double [[LDEXP]]
+;
+ %zext = zext i1 %bool to i32
+ %ldexp = call reassoc double @llvm.ldexp.f64.i32(double %x, i32 %zext)
+ ret double %ldexp
+}
+
+define <2 x float> @ldexp_zext_float_vector(<2 x float> %x, <2 x i1> %bool) {
+; CHECK-LABEL: @ldexp_zext_float_vector(
+; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[BOOL:%.*]], <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[LDEXP:%.*]] = fmul <2 x float> [[TMP1]], [[X:%.*]]
+; CHECK-NEXT: ret <2 x float> [[LDEXP]]
+;
+ %zext = zext <2 x i1> %bool to <2 x i32>
+ %ldexp = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> %zext)
+ ret <2 x float> %ldexp
+}
diff --git a/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll b/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll
index b61f880..cb51e92 100644
--- a/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll
+++ b/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll
@@ -5,16 +5,10 @@
define float @pow_sitofp_f32_const_base_2(i32 %x) {
-; LDEXP-LABEL: define float @pow_sitofp_f32_const_base_2(
-; LDEXP-SAME: i32 [[X:%.*]]) {
-; LDEXP-NEXT: [[LDEXPF:%.*]] = tail call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[X]])
-; LDEXP-NEXT: ret float [[LDEXPF]]
-;
-; NOLDEXP-LABEL: define float @pow_sitofp_f32_const_base_2(
-; NOLDEXP-SAME: i32 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i32 [[X]] to float
-; NOLDEXP-NEXT: [[POW:%.*]] = tail call float @llvm.exp2.f32(float [[ITOFP]])
-; NOLDEXP-NEXT: ret float [[POW]]
+; CHECK-LABEL: define float @pow_sitofp_f32_const_base_2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[X]])
+; CHECK-NEXT: ret float [[EXP2]]
;
%itofp = sitofp i32 %x to float
%pow = tail call float @llvm.pow.f32(float 2.000000e+00, float %itofp)
@@ -22,16 +16,10 @@ define float @pow_sitofp_f32_const_base_2(i32 %x) {
}
define float @pow_sitofp_f32_const_base_2__flags(i32 %x) {
-; LDEXP-LABEL: define float @pow_sitofp_f32_const_base_2__flags(
-; LDEXP-SAME: i32 [[X:%.*]]) {
-; LDEXP-NEXT: [[LDEXPF:%.*]] = tail call nnan nsz float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[X]])
-; LDEXP-NEXT: ret float [[LDEXPF]]
-;
-; NOLDEXP-LABEL: define float @pow_sitofp_f32_const_base_2__flags(
-; NOLDEXP-SAME: i32 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i32 [[X]] to float
-; NOLDEXP-NEXT: [[EXP2:%.*]] = tail call nnan nsz float @llvm.exp2.f32(float [[ITOFP]])
-; NOLDEXP-NEXT: ret float [[EXP2]]
+; CHECK-LABEL: define float @pow_sitofp_f32_const_base_2__flags(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call nnan nsz float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[X]])
+; CHECK-NEXT: ret float [[EXP2]]
;
%itofp = sitofp i32 %x to float
%pow = tail call nsz nnan float @llvm.pow.f32(float 2.000000e+00, float %itofp)
@@ -115,16 +103,10 @@ define float @pow_sitofp_f32_const_base_16(i32 %x) {
}
define double @pow_sitofp_f64_const_base_2(i32 %x) {
-; LDEXP-LABEL: define double @pow_sitofp_f64_const_base_2(
-; LDEXP-SAME: i32 [[X:%.*]]) {
-; LDEXP-NEXT: [[LDEXP:%.*]] = tail call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 [[X]])
-; LDEXP-NEXT: ret double [[LDEXP]]
-;
-; NOLDEXP-LABEL: define double @pow_sitofp_f64_const_base_2(
-; NOLDEXP-SAME: i32 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i32 [[X]] to double
-; NOLDEXP-NEXT: [[POW:%.*]] = tail call double @llvm.exp2.f64(double [[ITOFP]])
-; NOLDEXP-NEXT: ret double [[POW]]
+; CHECK-LABEL: define double @pow_sitofp_f64_const_base_2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 [[X]])
+; CHECK-NEXT: ret double [[EXP2]]
;
%itofp = sitofp i32 %x to double
%pow = tail call double @llvm.pow.f64(double 2.000000e+00, double %itofp)
@@ -144,16 +126,10 @@ define half @pow_sitofp_f16_const_base_2(i32 %x) {
}
define <2 x float> @pow_sitofp_v2f32_const_base_2(<2 x i32> %x) {
-; LDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
-; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NEXT: [[EXP2:%.*]] = tail call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
-; LDEXP-NEXT: ret <2 x float> [[EXP2]]
-;
-; NOLDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
-; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x float>
-; NOLDEXP-NEXT: [[POW:%.*]] = tail call <2 x float> @llvm.exp2.v2f32(<2 x float> [[ITOFP]])
-; NOLDEXP-NEXT: ret <2 x float> [[POW]]
+; CHECK-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
+; CHECK-SAME: <2 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
+; CHECK-NEXT: ret <2 x float> [[EXP2]]
;
%itofp = sitofp <2 x i32> %x to <2 x float>
%pow = tail call <2 x float> @llvm.pow.v2f32(<2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> %itofp)
@@ -199,16 +175,10 @@ define <2 x float> @pow_sitofp_v2f32_const_base_mixed_2(<2 x i32> %x) {
}
define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(<2 x i32> %x) {
-; LDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
-; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NEXT: [[EXP2:%.*]] = tail call nsz afn <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
-; LDEXP-NEXT: ret <2 x float> [[EXP2]]
-;
-; NOLDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
-; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x float>
-; NOLDEXP-NEXT: [[POW:%.*]] = tail call nsz afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[ITOFP]])
-; NOLDEXP-NEXT: ret <2 x float> [[POW]]
+; CHECK-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
+; CHECK-SAME: <2 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call nsz afn <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
+; CHECK-NEXT: ret <2 x float> [[EXP2]]
;
%itofp = sitofp <2 x i32> %x to <2 x float>
%pow = tail call nsz afn <2 x float> @llvm.pow.v2f32(<2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> %itofp)
@@ -216,16 +186,10 @@ define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(<2 x i32> %x) {
}
define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(<vscale x 4 x i32> %x) {
-; LDEXP-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
-; LDEXP-SAME: <vscale x 4 x i32> [[X:%.*]]) {
-; LDEXP-NEXT: [[EXP2:%.*]] = tail call <vscale x 4 x float> @llvm.ldexp.nxv4f32.nxv4i32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[X]])
-; LDEXP-NEXT: ret <vscale x 4 x float> [[EXP2]]
-;
-; NOLDEXP-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
-; NOLDEXP-SAME: <vscale x 4 x i32> [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp <vscale x 4 x i32> [[X]] to <vscale x 4 x float>
-; NOLDEXP-NEXT: [[POW:%.*]] = tail call <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> [[ITOFP]])
-; NOLDEXP-NEXT: ret <vscale x 4 x float> [[POW]]
+; CHECK-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
+; CHECK-SAME: <vscale x 4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call <vscale x 4 x float> @llvm.ldexp.nxv4f32.nxv4i32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[X]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[EXP2]]
;
%itofp = sitofp <vscale x 4 x i32> %x to <vscale x 4 x float>
%pow = tail call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> splat (float 2.0), <vscale x 4 x float> %itofp)
@@ -233,16 +197,10 @@ define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(<vscale x 4 x i32>
}
define <2 x half> @pow_sitofp_v2f16_const_base_2(<2 x i32> %x) {
-; LDEXP-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
-; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NEXT: [[EXP2:%.*]] = tail call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> <half 0xH3C00, half 0xH3C00>, <2 x i32> [[X]])
-; LDEXP-NEXT: ret <2 x half> [[EXP2]]
-;
-; NOLDEXP-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
-; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x half>
-; NOLDEXP-NEXT: [[EXP2:%.*]] = tail call <2 x half> @llvm.exp2.v2f16(<2 x half> [[ITOFP]])
-; NOLDEXP-NEXT: ret <2 x half> [[EXP2]]
+; CHECK-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
+; CHECK-SAME: <2 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> <half 0xH3C00, half 0xH3C00>, <2 x i32> [[X]])
+; CHECK-NEXT: ret <2 x half> [[EXP2]]
;
%itofp = sitofp <2 x i32> %x to <2 x half>
%pow = tail call <2 x half> @llvm.pow.v2f16(<2 x half> <half 2.000000e+00, half 2.000000e+00>, <2 x half> %itofp)
@@ -250,16 +208,10 @@ define <2 x half> @pow_sitofp_v2f16_const_base_2(<2 x i32> %x) {
}
define <2 x double> @pow_sitofp_v2f64_const_base_2(<2 x i32> %x) {
-; LDEXP-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
-; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NEXT: [[EXP2:%.*]] = tail call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> <double 1.000000e+00, double 1.000000e+00>, <2 x i32> [[X]])
-; LDEXP-NEXT: ret <2 x double> [[EXP2]]
-;
-; NOLDEXP-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
-; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x double>
-; NOLDEXP-NEXT: [[EXP2:%.*]] = tail call <2 x double> @llvm.exp2.v2f64(<2 x double> [[ITOFP]])
-; NOLDEXP-NEXT: ret <2 x double> [[EXP2]]
+; CHECK-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
+; CHECK-SAME: <2 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> <double 1.000000e+00, double 1.000000e+00>, <2 x i32> [[X]])
+; CHECK-NEXT: ret <2 x double> [[EXP2]]
;
%itofp = sitofp <2 x i32> %x to <2 x double>
%pow = tail call <2 x double> @llvm.pow.v2f64(<2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> %itofp)
@@ -333,16 +285,10 @@ define <2 x double> @pow_sitofp_v2f64_const_base_8(<2 x i32> %x) {
}
define fp128 @pow_sitofp_fp128_const_base_2(i32 %x) {
-; LDEXP-LABEL: define fp128 @pow_sitofp_fp128_const_base_2(
-; LDEXP-SAME: i32 [[X:%.*]]) {
-; LDEXP-NEXT: [[LDEXPL:%.*]] = tail call fp128 @llvm.ldexp.f128.i32(fp128 0xL00000000000000003FFF000000000000, i32 [[X]])
-; LDEXP-NEXT: ret fp128 [[LDEXPL]]
-;
-; NOLDEXP-LABEL: define fp128 @pow_sitofp_fp128_const_base_2(
-; NOLDEXP-SAME: i32 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i32 [[X]] to fp128
-; NOLDEXP-NEXT: [[POW:%.*]] = tail call fp128 @llvm.exp2.f128(fp128 [[ITOFP]])
-; NOLDEXP-NEXT: ret fp128 [[POW]]
+; CHECK-LABEL: define fp128 @pow_sitofp_fp128_const_base_2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call fp128 @llvm.ldexp.f128.i32(fp128 0xL00000000000000003FFF000000000000, i32 [[X]])
+; CHECK-NEXT: ret fp128 [[EXP2]]
;
%itofp = sitofp i32 %x to fp128
%pow = tail call fp128 @llvm.pow.fp128(fp128 0xL00000000000000004000000000000000, fp128 %itofp)
@@ -412,16 +358,10 @@ define float @libcall_powf_sitofp_f32_const_base_2__flags(i32 %x) {
}
define float @readnone_libcall_powf_sitofp_f32_const_base_2(i32 %x) {
-; LDEXP-LABEL: define float @readnone_libcall_powf_sitofp_f32_const_base_2(
-; LDEXP-SAME: i32 [[X:%.*]]) {
-; LDEXP-NEXT: [[LDEXPF:%.*]] = tail call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[X]])
-; LDEXP-NEXT: ret float [[LDEXPF]]
-;
-; NOLDEXP-LABEL: define float @readnone_libcall_powf_sitofp_f32_const_base_2(
-; NOLDEXP-SAME: i32 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i32 [[X]] to float
-; NOLDEXP-NEXT: [[POW:%.*]] = tail call float @llvm.exp2.f32(float [[ITOFP]])
-; NOLDEXP-NEXT: ret float [[POW]]
+; CHECK-LABEL: define float @readnone_libcall_powf_sitofp_f32_const_base_2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 [[X]])
+; CHECK-NEXT: ret float [[EXP2]]
;
%itofp = sitofp i32 %x to float
%pow = tail call float @powf(float 2.000000e+00, float %itofp) memory(none)
@@ -429,16 +369,10 @@ define float @readnone_libcall_powf_sitofp_f32_const_base_2(i32 %x) {
}
define double @readnone_libcall_pow_sitofp_f32_const_base_2(i32 %x) {
-; LDEXP-LABEL: define double @readnone_libcall_pow_sitofp_f32_const_base_2(
-; LDEXP-SAME: i32 [[X:%.*]]) {
-; LDEXP-NEXT: [[LDEXP:%.*]] = tail call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 [[X]])
-; LDEXP-NEXT: ret double [[LDEXP]]
-;
-; NOLDEXP-LABEL: define double @readnone_libcall_pow_sitofp_f32_const_base_2(
-; NOLDEXP-SAME: i32 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i32 [[X]] to double
-; NOLDEXP-NEXT: [[POW:%.*]] = tail call double @llvm.exp2.f64(double [[ITOFP]])
-; NOLDEXP-NEXT: ret double [[POW]]
+; CHECK-LABEL: define double @readnone_libcall_pow_sitofp_f32_const_base_2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 [[X]])
+; CHECK-NEXT: ret double [[EXP2]]
;
%itofp = sitofp i32 %x to double
%pow = tail call double @pow(double 2.000000e+00, double %itofp) memory(none)
@@ -446,16 +380,10 @@ define double @readnone_libcall_pow_sitofp_f32_const_base_2(i32 %x) {
}
define fp128 @readnone_libcall_powl_sitofp_fp128_const_base_2(i32 %x) {
-; LDEXP-LABEL: define fp128 @readnone_libcall_powl_sitofp_fp128_const_base_2(
-; LDEXP-SAME: i32 [[X:%.*]]) {
-; LDEXP-NEXT: [[LDEXPL:%.*]] = tail call fp128 @llvm.ldexp.f128.i32(fp128 0xL00000000000000003FFF000000000000, i32 [[X]])
-; LDEXP-NEXT: ret fp128 [[LDEXPL]]
-;
-; NOLDEXP-LABEL: define fp128 @readnone_libcall_powl_sitofp_fp128_const_base_2(
-; NOLDEXP-SAME: i32 [[X:%.*]]) {
-; NOLDEXP-NEXT: [[ITOFP:%.*]] = sitofp i32 [[X]] to fp128
-; NOLDEXP-NEXT: [[POW:%.*]] = tail call fp128 @llvm.exp2.f128(fp128 [[ITOFP]])
-; NOLDEXP-NEXT: ret fp128 [[POW]]
+; CHECK-LABEL: define fp128 @readnone_libcall_powl_sitofp_fp128_const_base_2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[EXP2:%.*]] = tail call fp128 @llvm.ldexp.f128.i32(fp128 0xL00000000000000003FFF000000000000, i32 [[X]])
+; CHECK-NEXT: ret fp128 [[EXP2]]
;
%itofp = sitofp i32 %x to fp128
%pow = tail call fp128 @powl(fp128 0xL00000000000000004000000000000000, fp128 %itofp) memory(none)
diff --git a/llvm/test/Transforms/InstCombine/ptrtoint-nullgep.ll b/llvm/test/Transforms/InstCombine/ptrtoint-nullgep.ll
index e6e6377..9d6f0ab 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoint-nullgep.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoint-nullgep.ll
@@ -68,10 +68,10 @@ define void @constant_fold_ptrtoint_of_gep_of_nullgep() {
; LLPARSER-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) null, i64 1234) to i64))
; LLPARSER-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i64 1234) to i64))
; LLPARSER-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i64 1234) to i64))
-; LLPARSER-NEXT: call void @use_i64(i64 0)
-; LLPARSER-NEXT: call void @use_i64(i64 0)
-; LLPARSER-NEXT: call void @use_i64(i64 0)
-; LLPARSER-NEXT: call void @use_i64(i64 0)
+; LLPARSER-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) null, i64 -1), i64 1) to i64))
+; LLPARSER-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) null, i64 -1), i64 1) to i64))
+; LLPARSER-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i64 -1), i64 1) to i64))
+; LLPARSER-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i64 -1), i64 1) to i64))
; LLPARSER-NEXT: ret void
;
; INSTSIMPLIFY-LABEL: define {{[^@]+}}@constant_fold_ptrtoint_of_gep_of_nullgep() {
@@ -83,10 +83,10 @@ define void @constant_fold_ptrtoint_of_gep_of_nullgep() {
; INSTSIMPLIFY-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) null, i64 1234) to i64))
; INSTSIMPLIFY-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i64 1234) to i64))
; INSTSIMPLIFY-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i64 1234) to i64))
-; INSTSIMPLIFY-NEXT: call void @use_i64(i64 0)
-; INSTSIMPLIFY-NEXT: call void @use_i64(i64 0)
-; INSTSIMPLIFY-NEXT: call void @use_i64(i64 0)
-; INSTSIMPLIFY-NEXT: call void @use_i64(i64 0)
+; INSTSIMPLIFY-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) null, i64 -1), i64 1) to i64))
+; INSTSIMPLIFY-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) null, i64 -1), i64 1) to i64))
+; INSTSIMPLIFY-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i64 -1), i64 1) to i64))
+; INSTSIMPLIFY-NEXT: call void @use_i64(i64 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i64 -1), i64 1) to i64))
; INSTSIMPLIFY-NEXT: ret void
;
; INSTCOMBINE-LABEL: define {{[^@]+}}@constant_fold_ptrtoint_of_gep_of_nullgep() {
diff --git a/llvm/test/Transforms/InstCombine/sadd-with-overflow.ll b/llvm/test/Transforms/InstCombine/sadd-with-overflow.ll
index 729ca03..e4dd2d1 100644
--- a/llvm/test/Transforms/InstCombine/sadd-with-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/sadd-with-overflow.ll
@@ -125,7 +125,7 @@ define { i32, i1 } @fold_sub_simple(i32 %x) {
define { i32, i1 } @fold_with_distjoin_or(i32 %x) {
; CHECK-LABEL: @fold_with_distjoin_or(
-; CHECK-NEXT: [[B:%.*]] = add i32 [[X:%.*]], 6
+; CHECK-NEXT: [[B:%.*]] = add nsw i32 [[X:%.*]], 6
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { i32, i1 } { i32 poison, i1 false }, i32 [[B]], 0
; CHECK-NEXT: ret { i32, i1 } [[TMP1]]
;
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index 8da52e0..0700d7a 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -2240,4 +2240,84 @@ define i129 @shift_zext_not_nneg(i8 %arg) {
ret i129 %shl
}
+define i8 @src_shl_nsw(i8 %x) {
+; CHECK-LABEL: @src_shl_nsw(
+; CHECK-NEXT: [[R:%.*]] = shl nsw i8 32, [[X:%.*]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %sh = shl nsw i8 1, %x
+ %r = shl nsw i8 %sh, 5
+ ret i8 %r
+}
+
+define i8 @src_shl_nsw_fail(i8 %x) {
+; CHECK-LABEL: @src_shl_nsw_fail(
+; CHECK-NEXT: [[R:%.*]] = shl i8 32, [[X:%.*]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %sh = shl nsw i8 1, %x
+ %r = shl i8 %sh, 5
+ ret i8 %r
+}
+
+define i8 @src_shl_nuw(i8 %x) {
+; CHECK-LABEL: @src_shl_nuw(
+; CHECK-NEXT: [[R:%.*]] = shl nuw i8 12, [[X:%.*]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %sh = shl nuw i8 3, %x
+ %r = shl nuw i8 %sh, 2
+ ret i8 %r
+}
+
+define i8 @src_shl_nuw_fail(i8 %x) {
+; CHECK-LABEL: @src_shl_nuw_fail(
+; CHECK-NEXT: [[R:%.*]] = shl i8 12, [[X:%.*]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %sh = shl i8 3, %x
+ %r = shl nuw i8 %sh, 2
+ ret i8 %r
+}
+
+define i8 @src_lshr_exact(i8 %x) {
+; CHECK-LABEL: @src_lshr_exact(
+; CHECK-NEXT: [[R:%.*]] = lshr exact i8 48, [[X:%.*]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %sh = lshr exact i8 96, %x
+ %r = lshr exact i8 %sh, 1
+ ret i8 %r
+}
+
+define i8 @src_lshr_exact_fail(i8 %x) {
+; CHECK-LABEL: @src_lshr_exact_fail(
+; CHECK-NEXT: [[R:%.*]] = lshr i8 48, [[X:%.*]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %sh = lshr exact i8 96, %x
+ %r = lshr i8 %sh, 1
+ ret i8 %r
+}
+
+define i8 @src_ashr_exact(i8 %x) {
+; CHECK-LABEL: @src_ashr_exact(
+; CHECK-NEXT: [[R:%.*]] = ashr exact i8 -8, [[X:%.*]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %sh = ashr exact i8 -32, %x
+ %r = ashr exact i8 %sh, 2
+ ret i8 %r
+}
+
+define i8 @src_ashr_exact_fail(i8 %x) {
+; CHECK-LABEL: @src_ashr_exact_fail(
+; CHECK-NEXT: [[R:%.*]] = ashr i8 -8, [[X:%.*]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %sh = ashr i8 -32, %x
+ %r = ashr exact i8 %sh, 2
+ ret i8 %r
+}
+
declare i16 @llvm.umax.i16(i16, i16)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 32dc2ec..56616d4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -120,8 +120,6 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
; CHECK-NEXT: [[TMP9]] = zext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[VECTOR_RECUR]], <2 x i64> [[TMP8]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -185,8 +183,6 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
; CHECK-NEXT: [[TMP5]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index b746303..fae23b6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -776,7 +776,6 @@ define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocaptur
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP4]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw <16 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP1]]
@@ -860,7 +859,6 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP6]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP6]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <16 x i32> [[TMP6]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i8>
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP3]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll
index 809d2e8..34d5db4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll
@@ -54,7 +54,6 @@ define i32 @pr70988() {
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX_NEXT]], [[UMAX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT7]] = icmp ult i64 [[TMP19]], [[UMAX]]
; CHECK-NEXT: [[TMP20:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true
-; CHECK-NEXT: [[TMP21:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT7]], true
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP17]], i32 [[TMP18]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index bcf8096..52d62b3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -481,9 +481,6 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP76]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP79]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT12]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT13]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT14]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = extractelement <vscale x 8 x i1> [[TMP80]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
@@ -1799,9 +1796,6 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT18]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement <vscale x 8 x i1> [[TMP98]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
@@ -2198,9 +2192,6 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT18]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement <vscale x 8 x i1> [[TMP98]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index 2acc1dd..52a4b17 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -96,9 +96,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP68]], i64 [[TMP9]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP72:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP73:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT11]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP74:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT12]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP75:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP76:%.*]] = extractelement <vscale x 4 x i1> [[TMP72]], i32 0
; CHECK-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -249,9 +246,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP9]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP94:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP95:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT14]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP96:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT15]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP97:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP98:%.*]] = extractelement <vscale x 4 x i1> [[TMP94]], i32 0
; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 64f3b1c..9210aa3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -233,7 +233,7 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32
; CHECK-NEXT: [[ADD4]] = fadd fast float [[ADD]], [[T2]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 32
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[T0]]
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: loopexit:
; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_END]]
@@ -271,3 +271,212 @@ for.end:
ret float %s.0.lcssa
}
+define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
+; CHECK-LABEL: @multi_exit(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[B:%.*]], i64 1)
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX6]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN7]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 30
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1)
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[UMAX]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]]
+; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[A]])
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[UMIN]], 4294967295
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[UMIN]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 1, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[UMIN]], 4294967295
+; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC_1:%.*]], i64 8
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC_2:%.*]], i64 8
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC_1]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC_2]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP12]]
+; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META4:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP13]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META7:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT13]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT14]], zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT14]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = and <2 x i1> [[TMP17]], [[TMP15]]
+; CHECK-NEXT: [[TMP20:%.*]] = and <2 x i1> [[TMP18]], [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i1> [[TMP19]] to <2 x i8>
+; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i1> [[TMP20]] to <2 x i8>
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i8> [[TMP22]], i32 1
+; CHECK-NEXT: store i8 [[TMP23]], ptr [[DST]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META11:![0-9]+]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV_1_WIDE:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT_WIDE:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT: [[EC_1:%.*]] = icmp ult i64 [[IV_1_WIDE]], [[A]]
+; CHECK-NEXT: br i1 [[EC_1]], label [[LOOP_LATCH]], label [[EXIT:%.*]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8
+; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[SRC_2]], align 8
+; CHECK-NEXT: [[CMP55_US:%.*]] = icmp eq i64 [[L_1]], 0
+; CHECK-NEXT: [[CMP_I_US:%.*]] = icmp ne i64 [[L_2]], 0
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP_I_US]], [[CMP55_US]]
+; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[AND]] to i8
+; CHECK-NEXT: store i8 [[EXT]], ptr [[DST]], align 1
+; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; CHECK-NEXT: [[IV_1_NEXT_WIDE]] = zext i32 [[IV_1_NEXT]] to i64
+; CHECK-NEXT: [[EC_2:%.*]] = icmp ult i64 [[IV_1_NEXT_WIDE]], [[B]]
+; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv.1.wide = phi i64 [ 0, %entry ], [ %iv.1.next.wide, %loop.latch ]
+ %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.latch ]
+ %ec.1 = icmp ult i64 %iv.1.wide, %A
+ br i1 %ec.1, label %loop.latch, label %exit
+
+loop.latch:
+ %l.1 = load i64, ptr %src.1, align 8
+ %l.2 = load i64, ptr %src.2, align 8
+ %cmp55.us = icmp eq i64 %l.1, 0
+ %cmp.i.us = icmp ne i64 %l.2, 0
+ %and = and i1 %cmp.i.us, %cmp55.us
+ %ext = zext i1 %and to i8
+ store i8 %ext, ptr %dst, align 1
+ %iv.1.next = add i32 %iv.1, 1
+ %iv.1.next.wide = zext i32 %iv.1.next to i64
+ %ec.2 = icmp ult i64 %iv.1.next.wide, %B
+ br i1 %ec.2, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+define i1 @any_of_cost(ptr %start, ptr %end) #0 {
+; CHECK-LABEL: @any_of_cost(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START:%.*]] to i64
+; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
+; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 40
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 40
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 40
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 40
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 80
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 120
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
+; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP7]]
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP8]]
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 8
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 8
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 8
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 8
+; CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x ptr> [[TMP16]], ptr [[TMP15]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x ptr> [[TMP20]], ptr [[TMP19]], i32 1
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq <2 x ptr> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <2 x ptr> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[TMP22]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP23]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP26]] = or <2 x i1> [[VEC_PHI]], [[TMP24]]
+; CHECK-NEXT: [[TMP27]] = or <2 x i1> [[VEC_PHI3]], [[TMP25]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP27]], [[TMP26]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP30:%.*]] = freeze i1 [[TMP29]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP30]], i1 false, i1 false
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[ANY_OF:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANY_OF_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 8
+; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT: [[CMP13_NOT_NOT:%.*]] = icmp eq ptr [[L]], null
+; CHECK-NEXT: [[ANY_OF_NEXT]] = select i1 [[CMP13_NOT_NOT]], i1 [[ANY_OF]], i1 false
+; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 40
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[ANY_OF_NEXT_LCSSA:%.*]] = phi i1 [ [[ANY_OF_NEXT]], [[LOOP]] ]
+; CHECK-NEXT: ret i1 [[ANY_OF_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %any.of = phi i1 [ false, %entry ], [ %any.of.next, %loop ]
+ %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop ]
+ %gep = getelementptr i8, ptr %ptr.iv, i64 8
+ %l = load ptr, ptr %gep, align 8
+ %cmp13.not.not = icmp eq ptr %l, null
+ %any.of.next = select i1 %cmp13.not.not, i1 %any.of, i1 false
+ %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 40
+ %cmp.not = icmp eq ptr %ptr.iv, %end
+ br i1 %cmp.not, label %exit, label %loop
+
+exit:
+ ret i1 %any.of.next
+}
+
+attributes #0 = { "target-cpu"="penryn" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 9457500..d58b4f5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -234,8 +234,6 @@ define i64 @test_pr62954_scalar_epilogue_required(ptr %A, ptr noalias %B, ptr %C
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP0:%.*]] = sub nsw <16 x i64> zeroinitializer, [[VEC_IND]]
; CHECK-NEXT: [[TMP1]] = sub nsw <16 x i64> zeroinitializer, [[STEP_ADD]]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i64> [[VECTOR_RECUR]], <16 x i64> [[TMP0]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i64> [[TMP0]], <16 x i64> [[TMP1]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i64> [[TMP1]], i32 15
; CHECK-NEXT: store i64 [[TMP4]], ptr [[GEP]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
index 829a91f..ec83898 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
@@ -77,7 +77,6 @@ define void @test(ptr %p) {
; VEC-NEXT: store i64 0, ptr [[TMP26]], align 8
; VEC-NEXT: [[TMP27:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
; VEC-NEXT: [[TMP28]] = zext <4 x i16> [[TMP27]] to <4 x i64>
-; VEC-NEXT: [[TMP29:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP28]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VEC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], <i16 4, i16 4, i16 4, i16 4>
; VEC-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index 447f0b0..8dd1e71 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -560,7 +560,6 @@ define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) {
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %vector.ph ], [ [[TMP0:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP0]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0
; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8
@@ -604,7 +603,6 @@ define ptr @test_first_order_recurrences_and_pointer_induction2(ptr %ptr) {
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %vector.ph ], [ [[TMP0:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP0]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0
; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8
@@ -653,7 +651,6 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x double> poison, double [[TMP3]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT3]], <4 x double> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[BROADCAST_SPLAT4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR2]], <4 x double> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
; CHECK-NEXT: store double [[TMP6]], ptr [[P:%.*]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
index 937e01e..7e15f03 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
@@ -220,22 +220,54 @@ exit:
define void @test_pr54233_for_depend_on_each_other(ptr noalias %a, ptr noalias %b) {
; CHECK-LABEL: @test_pr54233_for_depend_on_each_other(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT: [[TMP4]] = xor <4 x i32> <i32 12, i32 12, i32 12, i32 12>, [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP6]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i32> [[TMP7]], [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 3
+; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_2_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OR:%.*]] = or i32 [[FOR_2]], 10
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[FOR_2]], [[FOR_1]]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[SCALAR_RECUR4:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[FOR_2_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SCALAR_RECUR4]], 10
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[SCALAR_RECUR4]], [[SCALAR_RECUR]]
; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[SHL]], 255
; CHECK-NEXT: [[AND:%.*]] = and i32 [[XOR]], [[OR]]
-; CHECK-NEXT: [[FOR_1_NEXT]] = xor i32 12, [[FOR_2]]
-; CHECK-NEXT: [[FOR_2_NEXT]] = load i32, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[FOR_1_NEXT]] = xor i32 12, [[SCALAR_RECUR4]]
+; CHECK-NEXT: [[FOR_2_NEXT]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: store i32 [[AND]], ptr [[A_GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 6d3654d..665ac0e 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -911,8 +911,6 @@ define i32 @PR27246() {
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], <i32 -4, i32 -4, i32 -4, i32 -4>
-; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 -4, i32 -4, i32 -4, i32 -4>
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -1006,7 +1004,6 @@ define i32 @PR27246() {
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 -4, i32 -4, i32 -4, i32 -4>
; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -1108,10 +1105,6 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP20]], align 4
; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP21]], align 4
; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP22]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> poison, i32 [[TMP27]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP28]], i32 1
-; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP30]], i32 3
; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP23]], align 4
; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP24]], align 4
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -1120,8 +1113,6 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP36]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[VECTOR_RECUR_EXTRACT_FOR_PHI]], i32 2
; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[VECTOR_RECUR_EXTRACT]], i32 3
-; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP34]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP34]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP45]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1227,7 +1218,6 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
; SINK-AFTER-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 1
; SINK-AFTER-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[VECTOR_RECUR_EXTRACT_FOR_PHI]], i32 2
; SINK-AFTER-NEXT: [[TMP22]] = insertelement <4 x i32> [[TMP21]], i32 [[VECTOR_RECUR_EXTRACT]], i32 3
-; SINK-AFTER-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1276,7 +1266,6 @@ define i64 @constant_folded_previous_value() {
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ <i64 1, i64 1, i64 1, i64 1>, [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; UNROLL-NO-IC-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1335,7 +1324,6 @@ define i64 @constant_folded_previous_value() {
; SINK-AFTER: vector.body:
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ <i64 1, i64 1, i64 1, i64 1>, [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; SINK-AFTER-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1392,8 +1380,6 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; UNROLL-NO-IC-NEXT: [[TMP1]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
@@ -1463,7 +1449,6 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP0]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SINK-AFTER-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
@@ -2571,8 +2556,6 @@ define void @sink_dead_inst(ptr %a) {
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add <4 x i16> [[STEP_ADD]], <i16 1, i16 1, i16 1, i16 1>
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
; UNROLL-NO-IC-NEXT: [[TMP5]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR2]], <4 x i32> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <4 x i16> [[TMP2]], <i16 5, i16 5, i16 5, i16 5>
; UNROLL-NO-IC-NEXT: [[TMP9]] = add <4 x i16> [[TMP3]], <i16 5, i16 5, i16 5, i16 5>
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -2678,7 +2661,6 @@ define void @sink_dead_inst(ptr %a) {
; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
; SINK-AFTER-NEXT: [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
; SINK-AFTER-NEXT: [[TMP2]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-; SINK-AFTER-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR1]], <4 x i32> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP4]] = add <4 x i16> [[TMP1]], <i16 5, i16 5, i16 5, i16 5>
; SINK-AFTER-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP6:%.*]] = sub <4 x i16> [[TMP5]], <i16 10, i16 10, i16 10, i16 10>
@@ -3490,8 +3472,6 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = or <4 x i16> [[TMP3]], [[TMP3]]
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
; UNROLL-NO-IC-NEXT: [[TMP7]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
@@ -3590,7 +3570,6 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; SINK-AFTER-NEXT: [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
; SINK-AFTER-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
; SINK-AFTER-NEXT: [[TMP3]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[TMP0]]
; SINK-AFTER-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
; SINK-AFTER-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP6]], align 4
@@ -3664,8 +3643,6 @@ define void @unused_recurrence(ptr %a) {
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], <i16 1, i16 1, i16 1, i16 1>
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add <4 x i16> [[TMP0]], <i16 5, i16 5, i16 5, i16 5>
; UNROLL-NO-IC-NEXT: [[TMP3]] = add <4 x i16> [[TMP1]], <i16 5, i16 5, i16 5, i16 5>
-; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], <i16 4, i16 4, i16 4, i16 4>
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
@@ -3735,7 +3712,6 @@ define void @unused_recurrence(ptr %a) {
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
; SINK-AFTER-NEXT: [[TMP1]] = add <4 x i16> [[TMP0]], <i16 5, i16 5, i16 5, i16 5>
-; SINK-AFTER-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], <i16 4, i16 4, i16 4, i16 4>
; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index b959894..b372105 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -171,8 +171,6 @@ define i64 @constant_folded_previous_value() {
; CHECK-VF4UF2-LABEL: @constant_folded_previous_value
; CHECK-VF4UF2: vector.body
; CHECK-VF4UF2: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i64> [ %vector.recur.init, %vector.ph ], [ shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %vector.body ]
-; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %vector.recur, <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
-; CHECK-VF4UF2: %[[SPLICE2:.*]] = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
; CHECK-VF4UF2: br i1 {{.*}}, label %middle.block, label %vector.body
entry:
br label %scalar.body
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll b/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll
index 0abca5b..8e06cd5 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll
@@ -284,7 +284,7 @@ define i32 @test10(i64 %t6) {
declare i1 @llvm.expect.i1(i1, i1) nounwind readnone
-; CHECK: !0 = !{!"branch_weights", i32 2000, i32 1}
-; CHECK: !1 = !{!"branch_weights", i32 1, i32 2000}
-; CHECK: !2 = !{!"branch_weights", i32 1, i32 1, i32 2000}
-; CHECK: !3 = !{!"branch_weights", i32 2000, i32 1, i32 1}
+; CHECK: !0 = !{!"branch_weights", !"expected", i32 2000, i32 1}
+; CHECK: !1 = !{!"branch_weights", !"expected", i32 1, i32 2000}
+; CHECK: !2 = !{!"branch_weights", !"expected", i32 1, i32 1, i32 2000}
+; CHECK: !3 = !{!"branch_weights", !"expected", i32 2000, i32 1, i32 1}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll b/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll
index 6429355..4057127 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll
@@ -284,7 +284,7 @@ define i32 @test10(i64 %t6) {
declare i1 @llvm.expect.with.probability.i1(i1, i1, double) nounwind readnone
-; CHECK: !0 = !{!"branch_weights", i32 1717986918, i32 429496731}
-; CHECK: !1 = !{!"branch_weights", i32 429496731, i32 1717986918}
-; CHECK: !2 = !{!"branch_weights", i32 214748366, i32 214748366, i32 1717986918}
-; CHECK: !3 = !{!"branch_weights", i32 1717986918, i32 214748366, i32 214748366}
+; CHECK: !0 = !{!"branch_weights", !"expected", i32 1717986918, i32 429496731}
+; CHECK: !1 = !{!"branch_weights", !"expected", i32 429496731, i32 1717986918}
+; CHECK: !2 = !{!"branch_weights", !"expected", i32 214748366, i32 214748366, i32 1717986918}
+; CHECK: !3 = !{!"branch_weights", !"expected", i32 1717986918, i32 214748366, i32 214748366}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll b/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
index 2bcfb1e..458a775 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
@@ -99,6 +99,5 @@ attributes #1 = { nounwind readnone }
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 5.0.0 (trunk 304373)"}
-; CHECK: [[LIKELY]] = !{!"branch_weights", i32 2000, i32 1}
-; CHECK: [[UNLIKELY]] = !{!"branch_weights", i32 1, i32 2000}
-
+; CHECK: [[LIKELY]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
+; CHECK: [[UNLIKELY]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll
index 32ae9b0..9b9d9a7 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll
@@ -352,5 +352,5 @@ declare i64 @llvm.expect.i64(i64, i64)
!llvm.ident = !{!0}
!0 = !{!"clang version 5.0.0 (trunk 302965)"}
-; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 2000, i32 1}
-; CHECK: [[WEIGHT2]] = !{!"branch_weights", i32 1, i32 2000}
+; CHECK: [[WEIGHT]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
+; CHECK: [[WEIGHT2]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll
index 1efa632..e9a843225 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll
@@ -99,5 +99,5 @@ declare i64 @llvm.expect.i64(i64, i64)
!0 = !{!"clang version 5.0.0 (trunk 302965)"}
-; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 2000, i32 1}
-; CHECK: [[WEIGHT2]] = !{!"branch_weights", i32 1, i32 2000}
+; CHECK: [[WEIGHT]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
+; CHECK: [[WEIGHT2]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll
index 9cbaca8..13db2c3 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll
@@ -53,4 +53,4 @@ declare i64 @llvm.expect.i64(i64, i64)
!0 = !{!"clang version 5.0.0 (trunk 302965)"}
-; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 1, i32 2000}
+; CHECK: [[WEIGHT]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll
index 2bad663..275731d 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll
@@ -235,5 +235,5 @@ block5:
ret void
}
-; CHECK: !0 = !{!"branch_weights", i32 2147483647, i32 1}
-; CHECK: !1 = !{!"branch_weights", i32 1, i32 2147483647}
+; CHECK: !0 = !{!"branch_weights", !"expected", i32 2147483647, i32 1}
+; CHECK: !1 = !{!"branch_weights", !"expected", i32 1, i32 2147483647}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
index 55dd28b..a090fe2 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
@@ -293,7 +293,7 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[PROF4]] = !{!"branch_weights", i32 2000, i32 1}
+; CHECK: [[PROF4]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
index 5cbf50e..db16413 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -182,11 +182,11 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
; CHECK: vector.body.1:
; CHECK-NEXT: [[INDEX_1:%.*]] = phi i64 [ 0, [[VECTOR_PH_1]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY_1]] ]
; CHECK-NEXT: [[TMP33:%.*]] = add nuw nsw i64 [[INDEX_1]], 15
-; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[INDEX_1]], 16
+; CHECK-NEXT: [[TMP34:%.*]] = add nuw nsw i64 [[INDEX_1]], 16
; CHECK-NEXT: [[TMP35:%.*]] = insertelement <2 x i64> poison, i64 [[TMP33]], i64 0
; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i64> [[TMP35]], i64 [[TMP34]], i64 1
-; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[INDEX_1]], 17
-; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[INDEX_1]], 18
+; CHECK-NEXT: [[TMP37:%.*]] = add nuw nsw i64 [[INDEX_1]], 17
+; CHECK-NEXT: [[TMP38:%.*]] = add nuw nsw i64 [[INDEX_1]], 18
; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> poison, i64 [[TMP37]], i64 0
; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i64> [[TMP39]], i64 [[TMP38]], i64 1
; CHECK-NEXT: [[TMP41:%.*]] = icmp ult <2 x i64> [[TMP36]], <i64 225, i64 225>
@@ -259,11 +259,11 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
; CHECK: vector.body.2:
; CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH_2]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY_2]] ]
; CHECK-NEXT: [[TMP64:%.*]] = add nuw nsw i64 [[INDEX_2]], 30
-; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX_2]], 31
+; CHECK-NEXT: [[TMP65:%.*]] = add nuw nsw i64 [[INDEX_2]], 31
; CHECK-NEXT: [[TMP66:%.*]] = insertelement <2 x i64> poison, i64 [[TMP64]], i64 0
; CHECK-NEXT: [[TMP67:%.*]] = insertelement <2 x i64> [[TMP66]], i64 [[TMP65]], i64 1
-; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[INDEX_2]], 32
-; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[INDEX_2]], 33
+; CHECK-NEXT: [[TMP68:%.*]] = add nuw nsw i64 [[INDEX_2]], 32
+; CHECK-NEXT: [[TMP69:%.*]] = add nuw nsw i64 [[INDEX_2]], 33
; CHECK-NEXT: [[TMP70:%.*]] = insertelement <2 x i64> poison, i64 [[TMP68]], i64 0
; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i64> [[TMP70]], i64 [[TMP69]], i64 1
; CHECK-NEXT: [[TMP72:%.*]] = icmp ult <2 x i64> [[TMP67]], <i64 225, i64 225>
@@ -336,11 +336,11 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
; CHECK: vector.body.3:
; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH_3]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY_3]] ]
; CHECK-NEXT: [[TMP95:%.*]] = add nuw nsw i64 [[INDEX_3]], 45
-; CHECK-NEXT: [[TMP96:%.*]] = add i64 [[INDEX_3]], 46
+; CHECK-NEXT: [[TMP96:%.*]] = add nuw nsw i64 [[INDEX_3]], 46
; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i64> poison, i64 [[TMP95]], i64 0
; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i64> [[TMP97]], i64 [[TMP96]], i64 1
-; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX_3]], 47
-; CHECK-NEXT: [[TMP100:%.*]] = add i64 [[INDEX_3]], 48
+; CHECK-NEXT: [[TMP99:%.*]] = add nuw nsw i64 [[INDEX_3]], 47
+; CHECK-NEXT: [[TMP100:%.*]] = add nuw nsw i64 [[INDEX_3]], 48
; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i64> poison, i64 [[TMP99]], i64 0
; CHECK-NEXT: [[TMP102:%.*]] = insertelement <2 x i64> [[TMP101]], i64 [[TMP100]], i64 1
; CHECK-NEXT: [[TMP103:%.*]] = icmp ult <2 x i64> [[TMP98]], <i64 225, i64 225>
diff --git a/llvm/test/Transforms/Reassociate/repeats.ll b/llvm/test/Transforms/Reassociate/repeats.ll
index ba25c4b..8600777 100644
--- a/llvm/test/Transforms/Reassociate/repeats.ll
+++ b/llvm/test/Transforms/Reassociate/repeats.ll
@@ -60,7 +60,8 @@ define i3 @foo3x5(i3 %x) {
; CHECK-SAME: i3 [[X:%.*]]) {
; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[X]], [[X]]
; CHECK-NEXT: [[TMP4:%.*]] = mul i3 [[TMP3]], [[X]]
-; CHECK-NEXT: ret i3 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i3 [[TMP4]], [[TMP3]]
+; CHECK-NEXT: ret i3 [[TMP5]]
;
%tmp1 = mul i3 %x, %x
%tmp2 = mul i3 %tmp1, %x
@@ -74,7 +75,8 @@ define i3 @foo3x5_nsw(i3 %x) {
; CHECK-LABEL: define i3 @foo3x5_nsw(
; CHECK-SAME: i3 [[X:%.*]]) {
; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[X]], [[X]]
-; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i3 [[TMP3]], [[X]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i3 [[TMP3]], [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i3 [[TMP2]], [[TMP3]]
; CHECK-NEXT: ret i3 [[TMP4]]
;
%tmp1 = mul i3 %x, %x
@@ -89,7 +91,8 @@ define i3 @foo3x6(i3 %x) {
; CHECK-LABEL: define i3 @foo3x6(
; CHECK-SAME: i3 [[X:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = mul i3 [[X]], [[X]]
-; CHECK-NEXT: [[TMP2:%.*]] = mul i3 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[TMP1]], [[X]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i3 [[TMP3]], [[TMP3]]
; CHECK-NEXT: ret i3 [[TMP2]]
;
%tmp1 = mul i3 %x, %x
@@ -106,7 +109,9 @@ define i3 @foo3x7(i3 %x) {
; CHECK-SAME: i3 [[X:%.*]]) {
; CHECK-NEXT: [[TMP5:%.*]] = mul i3 [[X]], [[X]]
; CHECK-NEXT: [[TMP6:%.*]] = mul i3 [[TMP5]], [[X]]
-; CHECK-NEXT: ret i3 [[TMP6]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i3 [[TMP6]], [[X]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i3 [[TMP3]], [[TMP6]]
+; CHECK-NEXT: ret i3 [[TMP7]]
;
%tmp1 = mul i3 %x, %x
%tmp2 = mul i3 %tmp1, %x
@@ -123,7 +128,8 @@ define i4 @foo4x8(i4 %x) {
; CHECK-SAME: i4 [[X:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]]
; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]]
-; CHECK-NEXT: ret i4 [[TMP4]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]]
+; CHECK-NEXT: ret i4 [[TMP3]]
;
%tmp1 = mul i4 %x, %x
%tmp2 = mul i4 %tmp1, %x
@@ -140,8 +146,9 @@ define i4 @foo4x9(i4 %x) {
; CHECK-LABEL: define i4 @foo4x9(
; CHECK-SAME: i4 [[X:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]]
-; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[X]]
-; CHECK-NEXT: [[TMP8:%.*]] = mul i4 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul i4 [[TMP3]], [[TMP2]]
; CHECK-NEXT: ret i4 [[TMP8]]
;
%tmp1 = mul i4 %x, %x
@@ -160,7 +167,8 @@ define i4 @foo4x10(i4 %x) {
; CHECK-LABEL: define i4 @foo4x10(
; CHECK-SAME: i4 [[X:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]]
-; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]]
; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]]
; CHECK-NEXT: ret i4 [[TMP3]]
;
@@ -181,7 +189,8 @@ define i4 @foo4x11(i4 %x) {
; CHECK-LABEL: define i4 @foo4x11(
; CHECK-SAME: i4 [[X:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]]
-; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]]
; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]]
; CHECK-NEXT: [[TMP10:%.*]] = mul i4 [[TMP3]], [[TMP2]]
; CHECK-NEXT: ret i4 [[TMP10]]
@@ -204,7 +213,9 @@ define i4 @foo4x12(i4 %x) {
; CHECK-LABEL: define i4 @foo4x12(
; CHECK-SAME: i4 [[X:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]]
-; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP3]], [[TMP3]]
; CHECK-NEXT: ret i4 [[TMP2]]
;
%tmp1 = mul i4 %x, %x
@@ -227,7 +238,9 @@ define i4 @foo4x13(i4 %x) {
; CHECK-SAME: i4 [[X:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]]
; CHECK-NEXT: [[TMP2:%.*]] = mul i4 [[TMP1]], [[X]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i4 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP3]], [[X]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul i4 [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret i4 [[TMP12]]
;
%tmp1 = mul i4 %x, %x
@@ -252,7 +265,9 @@ define i4 @foo4x14(i4 %x) {
; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]]
; CHECK-NEXT: [[TMP6:%.*]] = mul i4 [[TMP1]], [[X]]
; CHECK-NEXT: [[TMP7:%.*]] = mul i4 [[TMP6]], [[TMP6]]
-; CHECK-NEXT: ret i4 [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP7]], [[X]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i4 [[TMP4]], [[TMP4]]
+; CHECK-NEXT: ret i4 [[TMP5]]
;
%tmp1 = mul i4 %x, %x
%tmp2 = mul i4 %tmp1, %x
@@ -276,8 +291,10 @@ define i4 @foo4x15(i4 %x) {
; CHECK-SAME: i4 [[X:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = mul i4 [[X]], [[X]]
; CHECK-NEXT: [[TMP6:%.*]] = mul i4 [[TMP1]], [[X]]
-; CHECK-NEXT: [[TMP5:%.*]] = mul i4 [[TMP6]], [[X]]
-; CHECK-NEXT: [[TMP14:%.*]] = mul i4 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i4 [[TMP6]], [[TMP6]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i4 [[TMP3]], [[X]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i4 [[TMP4]], [[X]]
+; CHECK-NEXT: [[TMP14:%.*]] = mul i4 [[TMP5]], [[TMP4]]
; CHECK-NEXT: ret i4 [[TMP14]]
;
%tmp1 = mul i4 %x, %x
diff --git a/llvm/test/Transforms/SimplifyCFG/RISCV/switch-of-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/RISCV/switch-of-powers-of-two.ll
index 3ded78d..2ac94af 100644
--- a/llvm/test/Transforms/SimplifyCFG/RISCV/switch-of-powers-of-two.ll
+++ b/llvm/test/Transforms/SimplifyCFG/RISCV/switch-of-powers-of-two.ll
@@ -34,10 +34,6 @@ define i32 @switch_of_powers(i32 %x) {
; RV64ZBB-LABEL: @switch_of_powers(
; RV64ZBB-NEXT: entry:
; RV64ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
-; RV64ZBB-NEXT: [[SWITCH_MASKINDEX:%.*]] = trunc i32 [[TMP0]] to i8
-; RV64ZBB-NEXT: [[SWITCH_SHIFTED:%.*]] = lshr i8 121, [[SWITCH_MASKINDEX]]
-; RV64ZBB-NEXT: [[SWITCH_LOBIT:%.*]] = trunc i8 [[SWITCH_SHIFTED]] to i1
-; RV64ZBB-NEXT: call void @llvm.assume(i1 [[SWITCH_LOBIT]])
; RV64ZBB-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [7 x i32], ptr @switch.table.switch_of_powers, i32 0, i32 [[TMP0]]
; RV64ZBB-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
; RV64ZBB-NEXT: ret i32 [[SWITCH_LOAD]]
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index d6450a2..845c500 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -38,6 +38,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: @switch.table.threecases = private unnamed_addr constant [3 x i32] [i32 10, i32 7, i32 5], align 4
; CHECK: @switch.table.covered_switch_with_bit_tests = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 1, i32 1], align 4
; CHECK: @switch.table.signed_overflow1 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 1111, i32 2222], align 4
+; CHECK: @switch.table.signed_overflow2 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 2222, i32 2222], align 4
;.
define i32 @f(i32 %c) {
; CHECK-LABEL: @f(
@@ -1738,12 +1739,53 @@ define i32 @signed_overflow2(i8 %n) {
; CHECK-LABEL: @signed_overflow2(
; CHECK-NEXT: start:
; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[N:%.*]] to i2
-; CHECK-NEXT: switch i2 [[TRUNC]], label [[BB1:%.*]] [
+; CHECK-NEXT: [[SWITCH_TABLEIDX:%.*]] = sub i2 [[TRUNC]], -2
+; CHECK-NEXT: [[SWITCH_TABLEIDX_ZEXT:%.*]] = zext i2 [[SWITCH_TABLEIDX]] to i3
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @switch.table.signed_overflow2, i32 0, i3 [[SWITCH_TABLEIDX_ZEXT]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT: ret i32 [[SWITCH_LOAD]]
+;
+start:
+ %trunc = trunc i8 %n to i2
+ switch i2 %trunc, label %bb1 [
+ i2 1, label %bb3
+ i2 -2, label %bb4
+ i2 -1, label %bb5
+ ]
+
+bb1: ; preds = %start
+ unreachable
+
+bb3: ; preds = %start
+ br label %bb6
+
+bb4: ; preds = %start
+ br label %bb6
+
+bb5: ; preds = %start
+ br label %bb6
+
+bb6: ; preds = %start, %bb3, %bb4, %bb5
+ %.sroa.0.0 = phi i32 [ 4444, %bb5 ], [ 3333, %bb4 ], [ 2222, %bb3 ]
+ ret i32 %.sroa.0.0
+}
+
+; This is the same as @signed_overflow2 except that the default case calls @exit(), so it
+; isn't treated as unreachable
+define i32 @signed_overflow3(i8 %n) {
+; CHECK-LABEL: @signed_overflow3(
+; CHECK-NEXT: start:
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[N:%.*]] to i2
+; CHECK-NEXT: switch i2 [[TRUNC]], label [[START_UNREACHABLEDEFAULT:%.*]] [
; CHECK-NEXT: i2 1, label [[BB6:%.*]]
; CHECK-NEXT: i2 -2, label [[BB4:%.*]]
; CHECK-NEXT: i2 -1, label [[BB5:%.*]]
+; CHECK-NEXT: i2 0, label [[BB1:%.*]]
; CHECK-NEXT: ]
+; CHECK: start.unreachabledefault:
+; CHECK-NEXT: unreachable
; CHECK: bb1:
+; CHECK-NEXT: call void @exit(i32 1)
; CHECK-NEXT: unreachable
; CHECK: bb4:
; CHECK-NEXT: br label [[BB6]]
@@ -1762,6 +1804,7 @@ start:
]
bb1: ; preds = %start
+ call void @exit(i32 1)
unreachable
bb3: ; preds = %start
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
new file mode 100644
index 0000000..7988e30
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
@@ -0,0 +1,542 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt < %s -passes=simplifycfg -switch-to-lookup=true -S | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
+target triple = "i386-pc-linux-gnu"
+
+; A dense switch with a reachable default case should be optimized into a lookup table with a bounds check
+;.
+; CHECK: @switch.table.reachable_default_dense_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1], align 4
+; CHECK: @switch.table.unreachable_default_dense_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1], align 4
+; CHECK: @switch.table.reachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1], align 4
+; CHECK: @switch.table.unreachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1], align 4
+; CHECK: @switch.table.reachable_default_dense_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0], align 4
+; CHECK: @switch.table.unreachable_default_dense_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0], align 4
+; CHECK: @switch.table.unreachable_default_holes_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1, i32 0], align 4
+;.
+define i32 @reachable_default_dense_0to31(i32 %x, i32 %y) {
+; CHECK-LABEL: @reachable_default_dense_0to31(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[X:%.*]], 32
+; CHECK-NEXT: br i1 [[TMP0]], label [[SWITCH_LOOKUP:%.*]], label [[RETURN:%.*]]
+; CHECK: switch.lookup:
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [32 x i32], ptr @switch.table.reachable_default_dense_0to31, i32 0, i32 [[X]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[SWITCH_LOAD]], [[SWITCH_LOOKUP]] ], [ [[Y:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %bb0
+ i32 1, label %bb7
+ i32 2, label %bb6
+ i32 3, label %bb5
+ i32 4, label %bb4
+ i32 5, label %bb3
+ i32 6, label %bb2
+ i32 7, label %bb1
+ i32 8, label %bb0
+ i32 9, label %bb7
+ i32 10, label %bb6
+ i32 11, label %bb5
+ i32 12, label %bb4
+ i32 13, label %bb3
+ i32 14, label %bb2
+ i32 15, label %bb1
+ i32 16, label %bb0
+ i32 17, label %bb7
+ i32 18, label %bb6
+ i32 19, label %bb5
+ i32 20, label %bb4
+ i32 21, label %bb3
+ i32 22, label %bb2
+ i32 23, label %bb1
+ i32 24, label %bb0
+ i32 25, label %bb7
+ i32 26, label %bb6
+ i32 27, label %bb5
+ i32 28, label %bb4
+ i32 29, label %bb3
+ i32 30, label %bb2
+ i32 31, label %bb1
+ ]
+
+sw.default: br label %return
+bb0: br label %return
+bb1: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+bb5: br label %return
+bb6: br label %return
+bb7: br label %return
+
+return:
+ %res = phi i32 [ %y, %sw.default ], [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ], [ 5, %bb5 ], [ 6, %bb6 ], [ 7, %bb7 ]
+ ret i32 %res
+
+}
+
+; A dense switch with an unreachable default case should be optimized into a lookup table without bounds checks
+define i32 @unreachable_default_dense_0to31(i32 %x, i32 %y) {
+; CHECK-LABEL: @unreachable_default_dense_0to31(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [32 x i32], ptr @switch.table.unreachable_default_dense_0to31, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT: ret i32 [[SWITCH_LOAD]]
+;
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %bb0
+ i32 1, label %bb7
+ i32 2, label %bb6
+ i32 3, label %bb5
+ i32 4, label %bb4
+ i32 5, label %bb3
+ i32 6, label %bb2
+ i32 7, label %bb1
+ i32 8, label %bb0
+ i32 9, label %bb7
+ i32 10, label %bb6
+ i32 11, label %bb5
+ i32 12, label %bb4
+ i32 13, label %bb3
+ i32 14, label %bb2
+ i32 15, label %bb1
+ i32 16, label %bb0
+ i32 17, label %bb7
+ i32 18, label %bb6
+ i32 19, label %bb5
+ i32 20, label %bb4
+ i32 21, label %bb3
+ i32 22, label %bb2
+ i32 23, label %bb1
+ i32 24, label %bb0
+ i32 25, label %bb7
+ i32 26, label %bb6
+ i32 27, label %bb5
+ i32 28, label %bb4
+ i32 29, label %bb3
+ i32 30, label %bb2
+ i32 31, label %bb1
+ ]
+
+sw.default: unreachable
+bb0: br label %return
+bb1: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+bb5: br label %return
+bb6: br label %return
+bb7: br label %return
+
+return:
+ %res = phi i32 [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ], [ 5, %bb5 ], [ 6, %bb6 ], [ 7, %bb7 ]
+ ret i32 %res
+
+}
+
+; A sparse switch with a reachable default case should be optimized into a lookup table with a bounds check and a mask
+define i32 @reachable_default_holes_0to31(i32 %x, i32 %y) {
+; CHECK-LABEL: @reachable_default_holes_0to31(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[X:%.*]], 32
+; CHECK-NEXT: br i1 [[TMP0]], label [[SWITCH_HOLE_CHECK:%.*]], label [[RETURN:%.*]]
+; CHECK: switch.hole_check:
+; CHECK-NEXT: [[SWITCH_SHIFTED:%.*]] = lshr i32 -277094665, [[X]]
+; CHECK-NEXT: [[SWITCH_LOBIT:%.*]] = trunc i32 [[SWITCH_SHIFTED]] to i1
+; CHECK-NEXT: br i1 [[SWITCH_LOBIT]], label [[SWITCH_LOOKUP:%.*]], label [[RETURN]]
+; CHECK: switch.lookup:
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [32 x i32], ptr @switch.table.reachable_default_holes_0to31, i32 0, i32 [[X]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[SWITCH_LOAD]], [[SWITCH_LOOKUP]] ], [ [[Y:%.*]], [[SWITCH_HOLE_CHECK]] ], [ [[Y]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %bb0
+ i32 1, label %bb7
+ i32 2, label %bb6
+ i32 4, label %bb4
+ i32 5, label %bb3
+ i32 6, label %bb2
+ i32 7, label %bb1
+ i32 9, label %bb7
+ i32 10, label %bb6
+ i32 11, label %bb5
+ i32 12, label %bb4
+ i32 14, label %bb2
+ i32 15, label %bb1
+ i32 16, label %bb0
+ i32 17, label %bb7
+ i32 19, label %bb5
+ i32 20, label %bb4
+ i32 21, label %bb3
+ i32 22, label %bb2
+ i32 24, label %bb0
+ i32 25, label %bb7
+ i32 26, label %bb6
+ i32 27, label %bb5
+ i32 29, label %bb3
+ i32 30, label %bb2
+ i32 31, label %bb1
+ ]
+
+sw.default: br label %return
+bb0: br label %return
+bb1: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+bb5: br label %return
+bb6: br label %return
+bb7: br label %return
+
+return:
+ %res = phi i32 [ %y, %sw.default ], [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ], [ 5, %bb5 ], [ 6, %bb6 ], [ 7, %bb7 ]
+ ret i32 %res
+
+}
+
+; A sparse switch with an unreachable default case should be optimized into a lookup table without bounds checks
+define i32 @unreachable_default_holes_0to31(i32 %x, i32 %y) {
+; CHECK-LABEL: @unreachable_default_holes_0to31(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [32 x i32], ptr @switch.table.unreachable_default_holes_0to31, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT: ret i32 [[SWITCH_LOAD]]
+;
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %bb0
+ i32 1, label %bb7
+ i32 2, label %bb6
+ i32 4, label %bb4
+ i32 5, label %bb3
+ i32 6, label %bb2
+ i32 7, label %bb1
+ i32 9, label %bb7
+ i32 10, label %bb6
+ i32 11, label %bb5
+ i32 12, label %bb4
+ i32 14, label %bb2
+ i32 15, label %bb1
+ i32 16, label %bb0
+ i32 17, label %bb7
+ i32 19, label %bb5
+ i32 20, label %bb4
+ i32 21, label %bb3
+ i32 22, label %bb2
+ i32 24, label %bb0
+ i32 25, label %bb7
+ i32 26, label %bb6
+ i32 27, label %bb5
+ i32 29, label %bb3
+ i32 30, label %bb2
+ i32 31, label %bb1
+ ]
+
+sw.default: unreachable
+bb0: br label %return
+bb1: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+bb5: br label %return
+bb6: br label %return
+bb7: br label %return
+
+return:
+ %res = phi i32 [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ], [ 5, %bb5 ], [ 6, %bb6 ], [ 7, %bb7 ]
+ ret i32 %res
+
+}
+
+; A dense switch with a reachable default case should be optimized into a lookup table with a bounds check
+define i32 @reachable_default_dense_0to32(i32 %x, i32 %y) {
+; CHECK-LABEL: @reachable_default_dense_0to32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[X:%.*]], 33
+; CHECK-NEXT: br i1 [[TMP0]], label [[SWITCH_LOOKUP:%.*]], label [[RETURN:%.*]]
+; CHECK: switch.lookup:
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [33 x i32], ptr @switch.table.reachable_default_dense_0to32, i32 0, i32 [[X]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[SWITCH_LOAD]], [[SWITCH_LOOKUP]] ], [ [[Y:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %bb0
+ i32 1, label %bb7
+ i32 2, label %bb6
+ i32 3, label %bb5
+ i32 4, label %bb4
+ i32 5, label %bb3
+ i32 6, label %bb2
+ i32 7, label %bb1
+ i32 8, label %bb0
+ i32 9, label %bb7
+ i32 10, label %bb6
+ i32 11, label %bb5
+ i32 12, label %bb4
+ i32 13, label %bb3
+ i32 14, label %bb2
+ i32 15, label %bb1
+ i32 16, label %bb0
+ i32 17, label %bb7
+ i32 18, label %bb6
+ i32 19, label %bb5
+ i32 20, label %bb4
+ i32 21, label %bb3
+ i32 22, label %bb2
+ i32 23, label %bb1
+ i32 24, label %bb0
+ i32 25, label %bb7
+ i32 26, label %bb6
+ i32 27, label %bb5
+ i32 28, label %bb4
+ i32 29, label %bb3
+ i32 30, label %bb2
+ i32 31, label %bb1
+ i32 32, label %bb0
+ ]
+
+sw.default: br label %return
+bb0: br label %return
+bb1: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+bb5: br label %return
+bb6: br label %return
+bb7: br label %return
+
+return:
+ %res = phi i32 [ %y, %sw.default ], [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ], [ 5, %bb5 ], [ 6, %bb6 ], [ 7, %bb7 ]
+ ret i32 %res
+
+}
+
+; A dense switch with an unreachable default case should be optimized into a lookup table without bounds checks
+define i32 @unreachable_default_dense_0to32(i32 %x, i32 %y) {
+; CHECK-LABEL: @unreachable_default_dense_0to32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [33 x i32], ptr @switch.table.unreachable_default_dense_0to32, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT: ret i32 [[SWITCH_LOAD]]
+;
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %bb0
+ i32 1, label %bb7
+ i32 2, label %bb6
+ i32 3, label %bb5
+ i32 4, label %bb4
+ i32 5, label %bb3
+ i32 6, label %bb2
+ i32 7, label %bb1
+ i32 8, label %bb0
+ i32 9, label %bb7
+ i32 10, label %bb6
+ i32 11, label %bb5
+ i32 12, label %bb4
+ i32 13, label %bb3
+ i32 14, label %bb2
+ i32 15, label %bb1
+ i32 16, label %bb0
+ i32 17, label %bb7
+ i32 18, label %bb6
+ i32 19, label %bb5
+ i32 20, label %bb4
+ i32 21, label %bb3
+ i32 22, label %bb2
+ i32 23, label %bb1
+ i32 24, label %bb0
+ i32 25, label %bb7
+ i32 26, label %bb6
+ i32 27, label %bb5
+ i32 28, label %bb4
+ i32 29, label %bb3
+ i32 30, label %bb2
+ i32 31, label %bb1
+ i32 32, label %bb0
+ ]
+
+sw.default: unreachable
+bb0: br label %return
+bb1: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+bb5: br label %return
+bb6: br label %return
+bb7: br label %return
+
+return:
+ %res = phi i32 [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ], [ 5, %bb5 ], [ 6, %bb6 ], [ 7, %bb7 ]
+ ret i32 %res
+
+}
+
+; A sparse switch with a reachable default case that would normally be optimized into a lookup table with a bounds check and a mask, but is not because
+; doing so would require a 33-bit mask
+define i32 @reachable_default_holes_0to32(i32 %x, i32 %y) {
+; CHECK-LABEL: @reachable_default_holes_0to32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: switch i32 [[X:%.*]], label [[RETURN:%.*]] [
+; CHECK-NEXT: i32 0, label [[BB0:%.*]]
+; CHECK-NEXT: i32 1, label [[BB7:%.*]]
+; CHECK-NEXT: i32 2, label [[BB6:%.*]]
+; CHECK-NEXT: i32 4, label [[BB4:%.*]]
+; CHECK-NEXT: i32 5, label [[BB3:%.*]]
+; CHECK-NEXT: i32 6, label [[BB2:%.*]]
+; CHECK-NEXT: i32 7, label [[BB1:%.*]]
+; CHECK-NEXT: i32 9, label [[BB7]]
+; CHECK-NEXT: i32 10, label [[BB6]]
+; CHECK-NEXT: i32 11, label [[BB5:%.*]]
+; CHECK-NEXT: i32 12, label [[BB4]]
+; CHECK-NEXT: i32 14, label [[BB2]]
+; CHECK-NEXT: i32 15, label [[BB1]]
+; CHECK-NEXT: i32 16, label [[BB0]]
+; CHECK-NEXT: i32 17, label [[BB7]]
+; CHECK-NEXT: i32 19, label [[BB5]]
+; CHECK-NEXT: i32 20, label [[BB4]]
+; CHECK-NEXT: i32 21, label [[BB3]]
+; CHECK-NEXT: i32 22, label [[BB2]]
+; CHECK-NEXT: i32 24, label [[BB0]]
+; CHECK-NEXT: i32 25, label [[BB7]]
+; CHECK-NEXT: i32 26, label [[BB6]]
+; CHECK-NEXT: i32 27, label [[BB5]]
+; CHECK-NEXT: i32 29, label [[BB3]]
+; CHECK-NEXT: i32 30, label [[BB2]]
+; CHECK-NEXT: i32 31, label [[BB1]]
+; CHECK-NEXT: i32 32, label [[BB0]]
+; CHECK-NEXT: ]
+; CHECK: bb0:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: bb2:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: bb3:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: bb4:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: bb5:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: bb6:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: bb7:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, [[BB0]] ], [ 1, [[BB1]] ], [ 2, [[BB2]] ], [ 3, [[BB3]] ], [ 4, [[BB4]] ], [ 5, [[BB5]] ], [ 6, [[BB6]] ], [ 7, [[BB7]] ], [ [[Y:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %bb0
+ i32 1, label %bb7
+ i32 2, label %bb6
+ i32 4, label %bb4
+ i32 5, label %bb3
+ i32 6, label %bb2
+ i32 7, label %bb1
+ i32 9, label %bb7
+ i32 10, label %bb6
+ i32 11, label %bb5
+ i32 12, label %bb4
+ i32 14, label %bb2
+ i32 15, label %bb1
+ i32 16, label %bb0
+ i32 17, label %bb7
+ i32 19, label %bb5
+ i32 20, label %bb4
+ i32 21, label %bb3
+ i32 22, label %bb2
+ i32 24, label %bb0
+ i32 25, label %bb7
+ i32 26, label %bb6
+ i32 27, label %bb5
+ i32 29, label %bb3
+ i32 30, label %bb2
+ i32 31, label %bb1
+ i32 32, label %bb0
+ ]
+
+sw.default: br label %return
+bb0: br label %return
+bb1: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+bb5: br label %return
+bb6: br label %return
+bb7: br label %return
+
+return:
+ %res = phi i32 [ %y, %sw.default ], [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ], [ 5, %bb5 ], [ 6, %bb6 ], [ 7, %bb7 ]
+ ret i32 %res
+
+}
+
+; A sparse switch with an unreachable default case which can be optimized into a lookup table without bounds checks. Because the default case is
+; unreachable, the fact that a 33-bit mask would be required doesn't prevent lookup table optimization.
+define i32 @unreachable_default_holes_0to32(i32 %x, i32 %y) {
+; CHECK-LABEL: @unreachable_default_holes_0to32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [33 x i32], ptr @switch.table.unreachable_default_holes_0to32, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT: ret i32 [[SWITCH_LOAD]]
+;
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %bb0
+ i32 1, label %bb7
+ i32 2, label %bb6
+ i32 4, label %bb4
+ i32 5, label %bb3
+ i32 6, label %bb2
+ i32 7, label %bb1
+ i32 9, label %bb7
+ i32 10, label %bb6
+ i32 11, label %bb5
+ i32 12, label %bb4
+ i32 14, label %bb2
+ i32 15, label %bb1
+ i32 16, label %bb0
+ i32 17, label %bb7
+ i32 19, label %bb5
+ i32 20, label %bb4
+ i32 21, label %bb3
+ i32 22, label %bb2
+ i32 24, label %bb0
+ i32 25, label %bb7
+ i32 26, label %bb6
+ i32 27, label %bb5
+ i32 29, label %bb3
+ i32 30, label %bb2
+ i32 31, label %bb1
+ i32 32, label %bb0
+ ]
+
+sw.default: unreachable
+bb0: br label %return
+bb1: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+bb5: br label %return
+bb6: br label %return
+bb7: br label %return
+
+return:
+ %res = phi i32 [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ], [ 5, %bb5 ], [ 6, %bb6 ], [ 7, %bb7 ]
+ ret i32 %res
+
+}
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
index 7f7790c..9ed62d3 100644
--- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
@@ -1,19 +1,5 @@
; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1)
-define void @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i1 %bool) {
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i1 %bool
- ; CHECK-NEXT: %data0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false)
- %data0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false)
-
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i1 %bool
- ; CHECK-NEXT: %data1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool)
- %data1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool)
- ret void
-}
-
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32)
define void @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs, i32 %arg) {
; CHECK: immarg operand has non-immediate parameter
@@ -665,12 +651,3 @@ define void @test_mfma_f32_32x32x1f32(float %arg0, float %arg1, <32 x i32> %arg2
ret void
}
-
-declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
-define amdgpu_cs void @test_buffer_atomic_fadd(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %offset, i1 %slc) {
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i1 %slc
- ; CHECK-NEXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
- call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
- ret void
-}
diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp
index e48e3f4..0958e16 100644
--- a/llvm/tools/llvm-as/llvm-as.cpp
+++ b/llvm/tools/llvm-as/llvm-as.cpp
@@ -142,11 +142,10 @@ int main(int argc, char **argv) {
}
// Convert to new debug format if requested.
- assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug mode");
- if (UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode) {
- M->convertToNewDbgValues();
+ M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat &&
+ WriteNewDbgInfoFormatToBitcode);
+ if (M->IsNewDbgInfoFormat)
M->removeDebugIntrinsicDeclarations();
- }
std::unique_ptr<ModuleSummaryIndex> Index = std::move(ModuleAndIndex.Index);
diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c
index 35c65f8..0326572 100644
--- a/llvm/tools/llvm-c-test/debuginfo.c
+++ b/llvm/tools/llvm-c-test/debuginfo.c
@@ -228,7 +228,24 @@ int llvm_test_dibuilder(bool NewDebugInfoFormat) {
LLVMPositionBuilderAtEnd(Builder, FooVarBlock);
LLVMTypeRef I64 = LLVMInt64TypeInContext(Ctx);
LLVMValueRef Zero = LLVMConstInt(I64, 0, false);
- LLVMBuildRet(Builder, Zero);
+ LLVMValueRef Ret = LLVMBuildRet(Builder, Zero);
+
+ // Insert a `phi` before the `ret`. In the new debug info mode we need to
+ // be careful to insert before debug records too, else the debug records
+ // will come before the `phi` (and be absorbed onto it) which is an invalid
+ // state.
+ LLVMValueRef InsertPos = LLVMGetFirstInstruction(FooVarBlock);
+ LLVMPositionBuilderBeforeInstrAndDbgRecords(Builder, InsertPos);
+ LLVMValueRef Phi1 = LLVMBuildPhi(Builder, I64, "p1");
+ LLVMAddIncoming(Phi1, &Zero, &FooEntryBlock, 1);
+ // Do the same again using the other position-setting function.
+ LLVMPositionBuilderBeforeDbgRecords(Builder, FooVarBlock, InsertPos);
+ LLVMValueRef Phi2 = LLVMBuildPhi(Builder, I64, "p2");
+ LLVMAddIncoming(Phi2, &Zero, &FooEntryBlock, 1);
+ // Insert a non-phi before the `ret` but not before the debug records to
+ // test that works as expected.
+ LLVMPositionBuilder(Builder, FooVarBlock, Ret);
+ LLVMBuildAdd(Builder, Phi1, Phi2, "a");
char *MStr = LLVMPrintModuleToString(M);
puts(MStr);
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index fbbb550..d28af85 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -258,7 +258,7 @@ int main(int argc, char **argv) {
// All that llvm-dis does is write the assembly to a file.
if (!DontPrint) {
if (M) {
- ScopedDbgInfoFormatSetter FormatSetter(*M, WriteNewDbgInfoFormat);
+ M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat);
if (WriteNewDbgInfoFormat)
M->removeDebugIntrinsicDeclarations();
M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index 7794f2d..b84469d1 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -489,12 +489,6 @@ int main(int argc, char **argv) {
if (LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_UNSET)
LoadBitcodeIntoNewDbgInfoFormat = cl::boolOrDefault::BOU_TRUE;
- // RemoveDIs debug-info transition: tests may request that we /try/ to use the
- // new debug-info format.
- if (TryUseNewDbgInfoFormat) {
- // Turn the new debug-info format on.
- UseNewDbgInfoFormat = true;
- }
// Since llvm-link collects multiple IR modules together, for simplicity's
// sake we disable the "PreserveInputDbgFormat" flag to enforce a single
// debug info format.
@@ -556,7 +550,7 @@ int main(int argc, char **argv) {
SetFormat(WriteNewDbgInfoFormat);
Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder);
} else if (Force || !CheckBitcodeOutputToConsole(Out.os())) {
- SetFormat(WriteNewDbgInfoFormatToBitcode);
+ SetFormat(UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode);
WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder);
}
diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
index f6a0537..24f4f11 100644
--- a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
+++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/IRSimilarityIdentifier.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
@@ -22,6 +23,11 @@
using namespace llvm;
using namespace IRSimilarity;
+extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+extern cl::opt<cl::boolOrDefault> PreserveInputDbgFormat;
+extern bool WriteNewDbgInfoFormatToBitcode;
+extern cl::opt<bool> WriteNewDbgInfoFormat;
+
static std::unique_ptr<Module> makeLLVMModule(LLVMContext &Context,
StringRef ModuleStr) {
SMDiagnostic Err;
@@ -1306,19 +1312,18 @@ TEST(IRInstructionMapper, CallBrInstIllegal) {
ASSERT_GT(UnsignedVec[0], Mapper.IllegalInstrNumber);
}
-// Checks that an debuginfo intrinsics are mapped to be invisible. Since they
+// Checks that debuginfo records are mapped to be invisible. Since they
// do not semantically change the program, they can be recognized as similar.
TEST(IRInstructionMapper, DebugInfoInvisible) {
StringRef ModuleString = R"(
define i32 @f(i32 %a, i32 %b) {
then:
- %0 = add i32 %a, %b
- call void @llvm.dbg.value(metadata !0)
- %1 = add i32 %a, %b
+ %0 = add i32 %a, %b
+ #dbg_value(i32 0, !0, !0, !0)
+ %1 = add i32 %a, %b
ret i32 0
}
- declare void @llvm.dbg.value(metadata)
!0 = distinct !{!"test\00", i32 10})";
LLVMContext Context;
std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
@@ -1914,19 +1919,19 @@ TEST(IRSimilarityCandidate, CheckRegionsDifferentTypes) {
ASSERT_FALSE(longSimCandCompare(InstrList));
}
-// Check that debug instructions do not impact similarity. They are marked as
+// Check that debug records do not impact similarity. They are marked as
// invisible.
TEST(IRSimilarityCandidate, IdenticalWithDebug) {
StringRef ModuleString = R"(
define i32 @f(i32 %a, i32 %b) {
bb0:
%0 = add i32 %a, %b
- call void @llvm.dbg.value(metadata !0)
+ #dbg_value(i32 0, !0, !0, !0)
%1 = add i32 %b, %a
ret i32 0
bb1:
%2 = add i32 %a, %b
- call void @llvm.dbg.value(metadata !1)
+ #dbg_value(i32 1, !1, !1, !1)
%3 = add i32 %b, %a
ret i32 0
bb2:
@@ -1935,7 +1940,6 @@ TEST(IRSimilarityCandidate, IdenticalWithDebug) {
ret i32 0
}
- declare void @llvm.dbg.value(metadata)
!0 = distinct !{!"test\00", i32 10}
!1 = distinct !{!"test\00", i32 11})";
LLVMContext Context;
diff --git a/llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp b/llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp
index 95f17f9..d97549c 100644
--- a/llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp
+++ b/llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp
@@ -99,7 +99,7 @@ TEST_F(ReplaceWithVecLibTest, TestValidMapping) {
ElementCount::getScalable(4), /*Masked*/ true,
"_ZGVsMxvu"};
EXPECT_EQ(run(CorrectVD, IR),
- "Instructions replaced with vector libraries: 1");
+ "Intrinsic calls replaced with vector libraries: 1");
}
// The VFABI prefix in TLI describes signature which is not matching the powi
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index f873bbd..91a0745 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -25,8 +25,6 @@
using namespace llvm;
-extern cl::opt<bool> UseNewDbgInfoFormat;
-
static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
SMDiagnostic Err;
std::unique_ptr<Module> Mod = parseAssemblyString(IR, Err, C);
@@ -44,8 +42,6 @@ namespace {
// by DbgVariableRecords, the dbg.value replacement.
TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -72,8 +68,6 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
!11 = !DILocation(line: 1, column: 1, scope: !6)
)");
- // Convert the module to "new" form debug-info.
- M->convertToNewDbgValues();
// Fetch the entry block.
BasicBlock &BB = M->getFunction("f")->getEntryBlock();
@@ -103,16 +97,10 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
EXPECT_TRUE(RetInst->hasDbgRecords());
auto Range2 = RetInst->getDbgRecordRange();
EXPECT_EQ(std::distance(Range2.begin(), Range2.end()), 1u);
-
- M->convertFromNewDbgValues();
-
- UseNewDbgInfoFormat = false;
}
TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"---(
define dso_local void @func() #0 !dbg !10 {
%1 = alloca i32, align 4
@@ -150,8 +138,6 @@ TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
)---");
ASSERT_TRUE(M);
- M->convertToNewDbgValues();
-
Function *F = M->getFunction("func");
BasicBlock &BB = F->getEntryBlock();
@@ -161,14 +147,10 @@ TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
BasicBlock &BBBefore = F->getEntryBlock();
auto I2 = std::prev(BBBefore.end(), 2);
ASSERT_TRUE(I2->hasDbgRecords());
-
- UseNewDbgInfoFormat = false;
}
TEST(BasicBlockDbgInfoTest, MarkerOperations) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -196,8 +178,6 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
// Fetch the entry block,
BasicBlock &BB = M->getFunction("f")->getEntryBlock();
- // Convert the module to "new" form debug-info.
- M->convertToNewDbgValues();
EXPECT_EQ(BB.size(), 2u);
// Fetch out our two markers,
@@ -295,14 +275,10 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
// Teardown,
Instr1->insertBefore(BB, BB.begin());
-
- UseNewDbgInfoFormat = false;
}
TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
%b = add i16 %a, 1, !dbg !11
@@ -332,8 +308,6 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
// Test that the movement of debug-data when using moveBefore etc and
// insertBefore etc are governed by the "head" bit of iterators.
BasicBlock &BB = M->getFunction("f")->getEntryBlock();
- // Convert the module to "new" form debug-info.
- M->convertToNewDbgValues();
// Test that the head bit behaves as expected: it should be set when the
// code wants the _start_ of the block, but not otherwise.
@@ -404,14 +378,10 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
DInst->DebugMarker->StoredDbgRecords.empty());
EXPECT_FALSE(CInst->DebugMarker->StoredDbgRecords.empty());
EXPECT_EQ(&*BB.begin(), CInst);
-
- UseNewDbgInfoFormat = false;
}
TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
%b = add i16 %a, 1, !dbg !11
@@ -441,8 +411,6 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
// Check that DbgVariableRecords can be accessed from Instructions without
// digging into the depths of DbgMarkers.
BasicBlock &BB = M->getFunction("f")->getEntryBlock();
- // Convert the module to "new" form debug-info.
- M->convertToNewDbgValues();
Instruction *BInst = &*BB.begin();
Instruction *CInst = BInst->getNextNode();
@@ -483,8 +451,6 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
CInst->dropOneDbgRecord(DVR1);
EXPECT_FALSE(CInst->hasDbgRecords());
EXPECT_EQ(CInst->DebugMarker->StoredDbgRecords.size(), 0u);
-
- UseNewDbgInfoFormat = false;
}
/* Let's recall the big illustration from BasicBlock::spliceDebugInfo:
@@ -577,9 +543,7 @@ protected:
DbgVariableRecord *DVRA, *DVRB, *DVRConst;
void SetUp() override {
- UseNewDbgInfoFormat = true;
M = parseIR(C, SpliceTestIR.c_str());
- M->convertToNewDbgValues();
BBEntry = &M->getFunction("f")->getEntryBlock();
BBExit = BBEntry->getNextNode();
@@ -599,8 +563,6 @@ protected:
cast<DbgVariableRecord>(&*CInst->DebugMarker->StoredDbgRecords.begin());
}
- void TearDown() override { UseNewDbgInfoFormat = false; }
-
bool InstContainsDbgVariableRecord(Instruction *I, DbgVariableRecord *DVR) {
for (DbgRecord &D : I->getDbgRecordRange()) {
if (&D == DVR) {
@@ -1187,8 +1149,6 @@ metadata !9, metadata !DIExpression()), !dbg !11 Dest %c = add i16 %b, 1,
// then the trailing DbgVariableRecords should get flushed back out.
TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
entry:
@@ -1219,7 +1179,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
BasicBlock &Entry = M->getFunction("f")->getEntryBlock();
BasicBlock &Exit = *Entry.getNextNode();
- M->convertToNewDbgValues();
// Begin by forcing entry block to have dangling DbgVariableRecord.
Entry.getTerminator()->eraseFromParent();
@@ -1234,8 +1193,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
Instruction *BInst = &*Entry.begin();
ASSERT_TRUE(BInst->DebugMarker);
EXPECT_EQ(BInst->DebugMarker->StoredDbgRecords.size(), 1u);
-
- UseNewDbgInfoFormat = false;
}
// When we remove instructions from the program, adjacent DbgVariableRecords
@@ -1244,8 +1201,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
// dbg.values. Test that this can be replicated correctly by DbgVariableRecords
TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
entry:
@@ -1273,7 +1228,6 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
)");
BasicBlock &Entry = M->getFunction("f")->getEntryBlock();
- M->convertToNewDbgValues();
// Fetch the relevant instructions from the converted function.
Instruction *SubInst = &*Entry.begin();
@@ -1316,16 +1270,12 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
EXPECT_EQ(std::distance(R4.begin(), R4.end()), 1u);
auto R5 = RetInst->getDbgRecordRange();
EXPECT_EQ(std::distance(R5.begin(), R5.end()), 1u);
-
- UseNewDbgInfoFormat = false;
}
// Test instruction removal and re-insertion, this time with one
// DbgVariableRecord that should hop up one instruction.
TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
entry:
@@ -1352,7 +1302,6 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
)");
BasicBlock &Entry = M->getFunction("f")->getEntryBlock();
- M->convertToNewDbgValues();
// Fetch the relevant instructions from the converted function.
Instruction *SubInst = &*Entry.begin();
@@ -1391,8 +1340,6 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
EXPECT_FALSE(RetInst->hasDbgRecords());
auto R3 = AddInst->getDbgRecordRange();
EXPECT_EQ(std::distance(R3.begin(), R3.end()), 1u);
-
- UseNewDbgInfoFormat = false;
}
// Similar to the above, what if we splice into an empty block with debug-info,
@@ -1401,8 +1348,6 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
// of the i16 0 dbg.value.
TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
entry:
@@ -1436,7 +1381,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
Function &F = *M->getFunction("f");
BasicBlock &Entry = F.getEntryBlock();
BasicBlock &Exit = *Entry.getNextNode();
- M->convertToNewDbgValues();
// Begin by forcing entry block to have dangling DbgVariableRecord.
Entry.getTerminator()->eraseFromParent();
@@ -1463,16 +1407,12 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
// No trailing DbgVariableRecords in the entry block now.
EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
-
- UseNewDbgInfoFormat = false;
}
// Similar test again, but this time: splice the contents of exit into entry,
// with the intention of leaving the first dbg.value (i16 0) behind.
TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
entry:
@@ -1506,7 +1446,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
Function &F = *M->getFunction("f");
BasicBlock &Entry = F.getEntryBlock();
BasicBlock &Exit = *Entry.getNextNode();
- M->convertToNewDbgValues();
// Begin by forcing entry block to have dangling DbgVariableRecord.
Entry.getTerminator()->eraseFromParent();
@@ -1537,16 +1476,12 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
EXPECT_FALSE(Exit.getTrailingDbgRecords()->empty());
Exit.getTrailingDbgRecords()->eraseFromParent();
Exit.deleteTrailingDbgRecords();
-
- UseNewDbgInfoFormat = false;
}
// What if we moveBefore end() -- there might be no debug-info there, in which
// case we shouldn't crash.
TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
LLVMContext C;
- UseNewDbgInfoFormat = true;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
entry:
@@ -1576,7 +1511,6 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
Function &F = *M->getFunction("f");
BasicBlock &Entry = F.getEntryBlock();
BasicBlock &Exit = *Entry.getNextNode();
- M->convertToNewDbgValues();
// Move the return to the end of the entry block.
Instruction *Br = Entry.getTerminator();
@@ -1589,8 +1523,6 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
EXPECT_EQ(Exit.getTrailingDbgRecords(), nullptr);
EXPECT_FALSE(Ret->hasDbgRecords());
-
- UseNewDbgInfoFormat = false;
}
} // End anonymous namespace.
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index ec3f333..cac8acb 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -156,7 +156,7 @@ TEST(StripTest, LoopMetadata) {
EXPECT_FALSE(BrokenDebugInfo);
}
-TEST(MetadataTest, DeleteInstUsedByDbgValue) {
+TEST(MetadataTest, DeleteInstUsedByDbgRecord) {
LLVMContext C;
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
@@ -187,12 +187,13 @@ TEST(MetadataTest, DeleteInstUsedByDbgValue) {
// Find the dbg.value using %b.
SmallVector<DbgValueInst *, 1> DVIs;
- findDbgValues(DVIs, &I);
+ SmallVector<DbgVariableRecord *, 1> DVRs;
+ findDbgValues(DVIs, &I, &DVRs);
// Delete %b. The dbg.value should now point to undef.
I.eraseFromParent();
- EXPECT_EQ(DVIs[0]->getNumVariableLocationOps(), 1u);
- EXPECT_TRUE(isa<UndefValue>(DVIs[0]->getValue(0)));
+ EXPECT_EQ(DVRs[0]->getNumVariableLocationOps(), 1u);
+ EXPECT_TRUE(isa<UndefValue>(DVRs[0]->getValue(0)));
}
TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
@@ -230,8 +231,8 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
// Get the dbg.declare.
Function &F = *cast<Function>(M->getNamedValue("fun"));
- DbgVariableIntrinsic *DbgDeclare =
- cast<DbgVariableIntrinsic>(&F.front().front());
+ DbgVariableRecord *DbgDeclare =
+ cast<DbgVariableRecord>(&*F.front().front().getDbgRecordRange().begin());
// Check that this form counts as a "no location" marker.
EXPECT_TRUE(DbgDeclare->isKillLocation());
}
@@ -239,6 +240,9 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
// Duplicate of above test, but in DbgVariableRecord representation.
TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
LLVMContext C;
+ bool OldDbgValueMode = UseNewDbgInfoFormat;
+ UseNewDbgInfoFormat = true;
+
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
%b = add i16 %a, 1, !dbg !11
@@ -264,10 +268,7 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
!11 = !DILocation(line: 1, column: 1, scope: !6)
)");
- bool OldDbgValueMode = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = true;
Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
- M->convertToNewDbgValues();
// Find the DbgVariableRecords using %b.
SmallVector<DbgValueInst *, 2> DVIs;
@@ -289,6 +290,8 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
// Ensure that the order of dbg.value intrinsics returned by findDbgValues, and
// their corresponding DbgVariableRecord representation, are consistent.
TEST(MetadataTest, OrderingOfDbgVariableRecords) {
+ bool OldDbgValueMode = UseNewDbgInfoFormat;
+ UseNewDbgInfoFormat = false;
LLVMContext C;
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
@@ -316,8 +319,6 @@ TEST(MetadataTest, OrderingOfDbgVariableRecords) {
!12 = !DILocalVariable(name: "bar", scope: !6, file: !1, line: 1, type: !10)
)");
- bool OldDbgValueMode = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = true;
Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
SmallVector<DbgValueInst *, 2> DVIs;
@@ -515,14 +516,15 @@ TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) {
Value *V1 = Fun.getArg(0);
Value *P1 = Fun.getArg(1);
Value *P2 = Fun.getArg(2);
- DbgAssignIntrinsic *DAI = cast<DbgAssignIntrinsic>(Fun.begin()->begin());
- ASSERT_TRUE(V1 == DAI->getVariableLocationOp(0));
- ASSERT_TRUE(P1 == DAI->getAddress());
+ DbgVariableRecord *DbgAssign = cast<DbgVariableRecord>(
+ &*Fun.front().front().getDbgRecordRange().begin());
+ ASSERT_TRUE(V1 == DbgAssign->getVariableLocationOp(0));
+ ASSERT_TRUE(P1 == DbgAssign->getAddress());
#define TEST_REPLACE(Old, New, ExpectedValue, ExpectedAddr) \
- DAI->replaceVariableLocationOp(Old, New); \
- EXPECT_EQ(DAI->getVariableLocationOp(0), ExpectedValue); \
- EXPECT_EQ(DAI->getAddress(), ExpectedAddr);
+ DbgAssign->replaceVariableLocationOp(Old, New); \
+ EXPECT_EQ(DbgAssign->getVariableLocationOp(0), ExpectedValue); \
+ EXPECT_EQ(DbgAssign->getAddress(), ExpectedAddr);
// Replace address only.
TEST_REPLACE(/*Old*/ P1, /*New*/ P2, /*Value*/ V1, /*Address*/ P2);
@@ -533,8 +535,8 @@ TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) {
// Replace address only, value uses a DIArgList.
// Value = {DIArgList(V1)}, Addr = P1.
- DAI->setRawLocation(DIArgList::get(C, ValueAsMetadata::get(V1)));
- DAI->setExpression(DIExpression::get(
+ DbgAssign->setRawLocation(DIArgList::get(C, ValueAsMetadata::get(V1)));
+ DbgAssign->setExpression(DIExpression::get(
C, {dwarf::DW_OP_LLVM_arg, 0, dwarf::DW_OP_stack_value}));
TEST_REPLACE(/*Old*/ P1, /*New*/ P2, /*Value*/ V1, /*Address*/ P2);
#undef TEST_REPLACE
@@ -620,11 +622,11 @@ TEST(AssignmentTrackingTest, Utils) {
//
// Check there are two llvm.dbg.assign intrinsics linked to Alloca.
auto CheckFun1Mapping = [&Alloca]() {
- auto Markers = at::getAssignmentMarkers(&Alloca);
+ auto Markers = at::getDVRAssignmentMarkers(&Alloca);
EXPECT_TRUE(std::distance(Markers.begin(), Markers.end()) == 2);
// Check those two entries are distinct.
- DbgAssignIntrinsic *First = *Markers.begin();
- DbgAssignIntrinsic *Second = *std::next(Markers.begin());
+ DbgVariableRecord *First = *Markers.begin();
+ DbgVariableRecord *Second = *std::next(Markers.begin());
EXPECT_NE(First, Second);
// Check that we can get back to Alloca from each llvm.dbg.assign.
@@ -660,7 +662,7 @@ TEST(AssignmentTrackingTest, Utils) {
DIAssignID *Fun2ID = cast_or_null<DIAssignID>(
Fun2Alloca.getMetadata(LLVMContext::MD_DIAssignID));
EXPECT_NE(New, Fun2ID);
- auto Fun2Markers = at::getAssignmentMarkers(&Fun2Alloca);
+ auto Fun2Markers = at::getDVRAssignmentMarkers(&Fun2Alloca);
ASSERT_TRUE(std::distance(Fun2Markers.begin(), Fun2Markers.end()) == 1);
auto Fun2Insts = at::getAssignmentInsts(*Fun2Markers.begin());
ASSERT_TRUE(std::distance(Fun2Insts.begin(), Fun2Insts.end()) == 1);
@@ -669,10 +671,10 @@ TEST(AssignmentTrackingTest, Utils) {
// 3. Check that deleting dbg.assigns from a specific instruction works.
Instruction &Fun3Alloca =
*M->getFunction("fun3")->getEntryBlock().getFirstNonPHIOrDbg();
- auto Fun3Markers = at::getAssignmentMarkers(&Fun3Alloca);
+ auto Fun3Markers = at::getDVRAssignmentMarkers(&Fun3Alloca);
ASSERT_TRUE(std::distance(Fun3Markers.begin(), Fun3Markers.end()) == 1);
at::deleteAssignmentMarkers(&Fun3Alloca);
- Fun3Markers = at::getAssignmentMarkers(&Fun3Alloca);
+ Fun3Markers = at::getDVRAssignmentMarkers(&Fun3Alloca);
EXPECT_EQ(Fun3Markers.empty(), true);
// 4. Check that deleting works and applies only to the target function.
@@ -683,7 +685,7 @@ TEST(AssignmentTrackingTest, Utils) {
// llvm.dbg.assign.
EXPECT_EQ(Fun2ID, cast_or_null<DIAssignID>(
Fun2Alloca.getMetadata(LLVMContext::MD_DIAssignID)));
- EXPECT_FALSE(at::getAssignmentMarkers(&Fun2Alloca).empty());
+ EXPECT_FALSE(at::getDVRAssignmentMarkers(&Fun2Alloca).empty());
}
TEST(IRBuilder, GetSetInsertionPointWithEmptyBasicBlock) {
@@ -769,12 +771,12 @@ TEST(AssignmentTrackingTest, InstrMethods) {
// Use SetVectors to check that the attachments and markers are unique
// (another test requirement).
SetVector<Metadata *> OrigIDs;
- SetVector<DbgAssignIntrinsic *> Markers;
+ SetVector<DbgVariableRecord *> Markers;
for (const Instruction *SI : Stores) {
Metadata *ID = SI->getMetadata(LLVMContext::MD_DIAssignID);
ASSERT_TRUE(OrigIDs.insert(ID));
ASSERT_TRUE(ID != nullptr);
- auto Range = at::getAssignmentMarkers(SI);
+ auto Range = at::getDVRAssignmentMarkers(SI);
ASSERT_TRUE(std::distance(Range.begin(), Range.end()) == 1);
ASSERT_TRUE(Markers.insert(*Range.begin()));
}
@@ -867,6 +869,8 @@ TEST(AssignmentTrackingTest, InstrMethods) {
// dbg.values that have been converted to a non-instruction format.
TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
LLVMContext C;
+ bool OldDbgValueMode = UseNewDbgInfoFormat;
+ UseNewDbgInfoFormat = false;
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -1041,14 +1045,14 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
// The record of those trailing DbgVariableRecords would dangle and cause an
// assertion failure if it lived until the end of the LLVMContext.
ExitBlock->deleteTrailingDbgRecords();
+ UseNewDbgInfoFormat = OldDbgValueMode;
}
TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
LLVMContext C;
- // For the purpose of this test, set and un-set the command line option
- // corresponding to UseNewDbgInfoFormat.
- UseNewDbgInfoFormat = true;
+ bool OldDbgValueMode = UseNewDbgInfoFormat;
+ UseNewDbgInfoFormat = false;
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
@@ -1079,6 +1083,11 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
!11 = !DILocation(line: 1, column: 1, scope: !6)
)");
+ // For the purpose of this test, set and un-set the command line option
+ // corresponding to UseNewDbgInfoFormat, but only after parsing, to ensure
+ // that the IR starts off in the old format.
+ UseNewDbgInfoFormat = true;
+
// Check that the conversion routines and utilities between dbg.value
// debug-info format and DbgVariableRecords works.
Function *F = M->getFunction("f");
@@ -1183,7 +1192,7 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
EXPECT_EQ(DVI2->getVariable(), DLV2);
EXPECT_EQ(DVI2->getExpression(), Expr2);
- UseNewDbgInfoFormat = false;
+ UseNewDbgInfoFormat = OldDbgValueMode;
}
// Test that the hashing function for DISubprograms representing methods produce
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 2001df0..ff96df8 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -994,17 +994,17 @@ TEST_F(IRBuilderTest, DIBuilder) {
EXPECT_TRUE(verifyModule(*M));
};
- // Test in old-debug mode.
- EXPECT_FALSE(M->IsNewDbgInfoFormat);
+ // Test in new-debug mode.
+ EXPECT_TRUE(M->IsNewDbgInfoFormat);
RunTest();
- // Test in new-debug mode.
- // Reset the test then call convertToNewDbgValues to flip the flag
+ // Test in old-debug mode.
+ // Reset the test then call convertFromNewDbgValues to flip the flag
// on the test's Module, Function and BasicBlock.
TearDown();
SetUp();
- M->convertToNewDbgValues();
- EXPECT_TRUE(M->IsNewDbgInfoFormat);
+ M->convertFromNewDbgValues();
+ EXPECT_FALSE(M->IsNewDbgInfoFormat);
RunTest();
}
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index b47c73f..b6044b2 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -25,12 +25,15 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm-c/Core.h"
#include "gmock/gmock-matchers.h"
#include "gtest/gtest.h"
#include <memory>
+extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+
namespace llvm {
namespace {
@@ -1460,6 +1463,8 @@ TEST(InstructionsTest, GetSplat) {
TEST(InstructionsTest, SkipDebug) {
LLVMContext C;
+ bool OldDbgValueMode = UseNewDbgInfoFormat;
+ UseNewDbgInfoFormat = false;
std::unique_ptr<Module> M = parseIR(C,
R"(
declare void @llvm.dbg.value(metadata, metadata, metadata)
@@ -1495,6 +1500,7 @@ TEST(InstructionsTest, SkipDebug) {
// After the terminator, there are no non-debug instructions.
EXPECT_EQ(nullptr, Term->getNextNonDebugInstruction());
+ UseNewDbgInfoFormat = OldDbgValueMode;
}
TEST(InstructionsTest, PhiMightNotBeFPMathOperator) {
diff --git a/llvm/unittests/IR/ValueTest.cpp b/llvm/unittests/IR/ValueTest.cpp
index 246c2fc..33a86d5 100644
--- a/llvm/unittests/IR/ValueTest.cpp
+++ b/llvm/unittests/IR/ValueTest.cpp
@@ -13,6 +13,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SourceMgr.h"
#include "gtest/gtest.h"
using namespace llvm;
@@ -255,6 +256,8 @@ TEST(ValueTest, getLocalSlotDeath) {
TEST(ValueTest, replaceUsesOutsideBlock) {
// Check that Value::replaceUsesOutsideBlock(New, BB) replaces uses outside
// BB, including dbg.* uses of MetadataAsValue(ValueAsMetadata(this)).
+ bool OldDbgValueMode = UseNewDbgInfoFormat;
+ UseNewDbgInfoFormat = false;
const auto *IR = R"(
define i32 @f() !dbg !6 {
entry:
@@ -315,6 +318,7 @@ TEST(ValueTest, replaceUsesOutsideBlock) {
// These users are outside Entry so should be changed.
ASSERT_TRUE(ExitDbg->getValue(0) == cast<Value>(B));
ASSERT_TRUE(Ret->getOperand(0) == cast<Value>(B));
+ UseNewDbgInfoFormat = OldDbgValueMode;
}
TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
@@ -359,10 +363,6 @@ TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
if (!M)
Err.print("ValueTest", errs());
- bool OldDbgValueMode = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = true;
- M->convertToNewDbgValues();
-
auto GetNext = [](auto *I) { return &*++I->getIterator(); };
Function *F = M->getFunction("f");
@@ -389,7 +389,6 @@ TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
EXPECT_TRUE(DVR1->getVariableLocationOp(0) == cast<Value>(A));
// These users are outside Entry so should be changed.
EXPECT_TRUE(DVR2->getVariableLocationOp(0) == cast<Value>(B));
- UseNewDbgInfoFormat = OldDbgValueMode;
}
} // end anonymous namespace
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 2642120..15eb59e 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -667,8 +667,12 @@ TEST(MemProf, MissingFrameId) {
TEST(MemProf, RadixTreeBuilderEmpty) {
llvm::DenseMap<FrameId, llvm::memprof::LinearFrameId> MemProfFrameIndexes;
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
+ llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
+ FrameHistogram =
+ llvm::memprof::computeFrameHistogram(MemProfCallStackData);
llvm::memprof::CallStackRadixTreeBuilder Builder;
- Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+ Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+ FrameHistogram);
ASSERT_THAT(Builder.getRadixArray(), testing::IsEmpty());
const auto Mappings = Builder.takeCallStackPos();
ASSERT_THAT(Mappings, testing::IsEmpty());
@@ -681,8 +685,12 @@ TEST(MemProf, RadixTreeBuilderOne) {
llvm::SmallVector<llvm::memprof::FrameId> CS1 = {13, 12, 11};
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
+ llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
+ FrameHistogram =
+ llvm::memprof::computeFrameHistogram(MemProfCallStackData);
llvm::memprof::CallStackRadixTreeBuilder Builder;
- Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+ Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+ FrameHistogram);
EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({
3U, // Size of CS1,
3U, // MemProfFrameIndexes[13]
@@ -704,8 +712,12 @@ TEST(MemProf, RadixTreeBuilderTwo) {
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
+ llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
+ FrameHistogram =
+ llvm::memprof::computeFrameHistogram(MemProfCallStackData);
llvm::memprof::CallStackRadixTreeBuilder Builder;
- Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+ Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+ FrameHistogram);
EXPECT_THAT(Builder.getRadixArray(),
testing::ElementsAreArray({
2U, // Size of CS1
@@ -738,8 +750,12 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) {
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS3), CS3});
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS4), CS4});
+ llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
+ FrameHistogram =
+ llvm::memprof::computeFrameHistogram(MemProfCallStackData);
llvm::memprof::CallStackRadixTreeBuilder Builder;
- Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
+ Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
+ FrameHistogram);
EXPECT_THAT(Builder.getRadixArray(),
testing::ElementsAreArray({
4U, // Size of CS1
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp
index 51e780c..8488255 100644
--- a/llvm/unittests/Support/KnownBitsTest.cpp
+++ b/llvm/unittests/Support/KnownBitsTest.cpp
@@ -77,7 +77,8 @@ static void testUnaryOpExhaustive(StringRef Name, UnaryBitsFn BitsFn,
static void testBinaryOpExhaustive(StringRef Name, BinaryBitsFn BitsFn,
BinaryIntFn IntFn,
- bool CheckOptimality = true) {
+ bool CheckOptimality = true,
+ bool RefinePoisonToZero = false) {
for (unsigned Bits : {1, 4}) {
ForeachKnownBits(Bits, [&](const KnownBits &Known1) {
ForeachKnownBits(Bits, [&](const KnownBits &Known2) {
@@ -99,6 +100,12 @@ static void testBinaryOpExhaustive(StringRef Name, BinaryBitsFn BitsFn,
EXPECT_TRUE(checkResult(Name, Exact, Computed, {Known1, Known2},
CheckOptimality));
}
+ // In some cases we choose to return zero if the result is always
+ // poison.
+ if (RefinePoisonToZero && Exact.hasConflict() &&
+ !Known1.hasConflict() && !Known2.hasConflict()) {
+ EXPECT_TRUE(Computed.isZero());
+ }
});
});
}
@@ -313,7 +320,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
testBinaryOpExhaustive(
"udiv exact",
[](const KnownBits &Known1, const KnownBits &Known2) {
- return KnownBits::udiv(Known1, Known2, /*Exact*/ true);
+ return KnownBits::udiv(Known1, Known2, /*Exact=*/true);
},
[](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
if (N2.isZero() || !N1.urem(N2).isZero())
@@ -335,7 +342,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
testBinaryOpExhaustive(
"sdiv exact",
[](const KnownBits &Known1, const KnownBits &Known2) {
- return KnownBits::sdiv(Known1, Known2, /*Exact*/ true);
+ return KnownBits::sdiv(Known1, Known2, /*Exact=*/true);
},
[](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
if (N2.isZero() || (N1.isMinSignedValue() && N2.isAllOnes()) ||
@@ -394,11 +401,11 @@ TEST(KnownBitsTest, BinaryExhaustive) {
return std::nullopt;
return N1.shl(N2);
},
- /*CheckOptimality=*/true);
+ /*CheckOptimality=*/true, /*RefinePoisonToZero=*/true);
testBinaryOpExhaustive(
"ushl_ov",
[](const KnownBits &Known1, const KnownBits &Known2) {
- return KnownBits::shl(Known1, Known2, /* NUW */ true);
+ return KnownBits::shl(Known1, Known2, /*NUW=*/true);
},
[](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
bool Overflow;
@@ -407,11 +414,11 @@ TEST(KnownBitsTest, BinaryExhaustive) {
return std::nullopt;
return Res;
},
- /*CheckOptimality=*/true);
+ /*CheckOptimality=*/true, /*RefinePoisonToZero=*/true);
testBinaryOpExhaustive(
"shl nsw",
[](const KnownBits &Known1, const KnownBits &Known2) {
- return KnownBits::shl(Known1, Known2, /* NUW */ false, /* NSW */ true);
+ return KnownBits::shl(Known1, Known2, /*NUW=*/false, /*NSW=*/true);
},
[](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
bool Overflow;
@@ -420,11 +427,11 @@ TEST(KnownBitsTest, BinaryExhaustive) {
return std::nullopt;
return Res;
},
- /*CheckOptimality=*/true);
+ /*CheckOptimality=*/true, /*RefinePoisonToZero=*/true);
testBinaryOpExhaustive(
"shl nuw",
[](const KnownBits &Known1, const KnownBits &Known2) {
- return KnownBits::shl(Known1, Known2, /* NUW */ true, /* NSW */ true);
+ return KnownBits::shl(Known1, Known2, /*NUW=*/true, /*NSW=*/true);
},
[](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
bool OverflowUnsigned, OverflowSigned;
@@ -434,7 +441,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
return std::nullopt;
return Res;
},
- /*CheckOptimality=*/true);
+ /*CheckOptimality=*/true, /*RefinePoisonToZero=*/true);
testBinaryOpExhaustive(
"lshr",
@@ -446,7 +453,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
return std::nullopt;
return N1.lshr(N2);
},
- /*CheckOptimality=*/true);
+ /*CheckOptimality=*/true, /*RefinePoisonToZero=*/true);
testBinaryOpExhaustive(
"lshr exact",
[](const KnownBits &Known1, const KnownBits &Known2) {
@@ -460,7 +467,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
return std::nullopt;
return N1.lshr(N2);
},
- /*CheckOptimality=*/true);
+ /*CheckOptimality=*/true, /*RefinePoisonToZero=*/true);
testBinaryOpExhaustive(
"ashr",
[](const KnownBits &Known1, const KnownBits &Known2) {
@@ -471,7 +478,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
return std::nullopt;
return N1.ashr(N2);
},
- /*CheckOptimality=*/true);
+ /*CheckOptimality=*/true, /*RefinePoisonToZero=*/true);
testBinaryOpExhaustive(
"ashr exact",
[](const KnownBits &Known1, const KnownBits &Known2) {
@@ -485,7 +492,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
return std::nullopt;
return N1.ashr(N2);
},
- /*CheckOptimality=*/true);
+ /*CheckOptimality=*/true, /*RefinePoisonToZero=*/true);
testBinaryOpExhaustive(
"mul",
[](const KnownBits &Known1, const KnownBits &Known2) {
@@ -538,7 +545,7 @@ TEST(KnownBitsTest, UnaryExhaustive) {
testUnaryOpExhaustive(
"mul self",
[](const KnownBits &Known) {
- return KnownBits::mul(Known, Known, /*SelfMultiply*/ true);
+ return KnownBits::mul(Known, Known, /*SelfMultiply=*/true);
},
[](const APInt &N) { return N * N; }, /*CheckOptimality=*/false);
}
@@ -709,8 +716,8 @@ TEST(KnownBitsTest, SExtOrTrunc) {
const unsigned NarrowerSize = 4;
const unsigned BaseSize = 6;
const unsigned WiderSize = 8;
- APInt NegativeFitsNarrower(BaseSize, -4, /*isSigned*/ true);
- APInt NegativeDoesntFitNarrower(BaseSize, -28, /*isSigned*/ true);
+ APInt NegativeFitsNarrower(BaseSize, -4, /*isSigned=*/true);
+ APInt NegativeDoesntFitNarrower(BaseSize, -28, /*isSigned=*/true);
APInt PositiveFitsNarrower(BaseSize, 14);
APInt PositiveDoesntFitNarrower(BaseSize, 36);
auto InitKnownBits = [&](KnownBits &Res, const APInt &Input) {
diff --git a/llvm/unittests/Support/KnownBitsTest.h b/llvm/unittests/Support/KnownBitsTest.h
index 556da2b..807ce31 100644
--- a/llvm/unittests/Support/KnownBitsTest.h
+++ b/llvm/unittests/Support/KnownBitsTest.h
@@ -34,13 +34,19 @@ template <typename FnTy> void ForeachKnownBits(unsigned Bits, FnTy Fn) {
template <typename FnTy>
void ForeachNumInKnownBits(const KnownBits &Known, FnTy Fn) {
unsigned Bits = Known.getBitWidth();
- unsigned Max = 1 << Bits;
- for (unsigned N = 0; N < Max; ++N) {
- APInt Num(Bits, N);
- if ((Num & Known.Zero) != 0 || (~Num & Known.One) != 0)
- continue;
+ assert(Bits < 32);
+ unsigned Max = 1u << Bits;
+ unsigned Zero = Known.Zero.getZExtValue();
+ unsigned One = Known.One.getZExtValue();
+
+ if (Zero & One) {
+ // Known has a conflict. No values will satisfy it.
+ return;
+ }
- Fn(Num);
+ for (unsigned N = 0; N < Max; ++N) {
+ if ((N & Zero) == 0 && (~N & One) == 0)
+ Fn(APInt(Bits, N));
}
}
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index b2e57f2..df5ee8a 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1536,7 +1536,7 @@ INSTANTIATE_TEST_SUITE_P(
AArch64::AEK_FCMA, AArch64::AEK_PAUTH}),
"8.4-A"),
ARMCPUTestParams<AArch64::ExtensionBitset>(
- "apple-a14", "armv8.5-a", "crypto-neon-fp-armv8",
+ "apple-a14", "armv8.4-a", "crypto-neon-fp-armv8",
AArch64::ExtensionBitset(
{AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2,
AArch64::AEK_SHA3, AArch64::AEK_FP, AArch64::AEK_SIMD,
@@ -1544,7 +1544,7 @@ INSTANTIATE_TEST_SUITE_P(
AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16,
AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_JSCVT,
AArch64::AEK_FCMA, AArch64::AEK_PAUTH}),
- "8.5-A"),
+ "8.4-A"),
ARMCPUTestParams<AArch64::ExtensionBitset>(
"apple-a15", "armv8.6-a", "crypto-neon-fp-armv8",
AArch64::ExtensionBitset(
@@ -1579,7 +1579,7 @@ INSTANTIATE_TEST_SUITE_P(
AArch64::AEK_PAUTH}),
"8.6-A"),
ARMCPUTestParams<AArch64::ExtensionBitset>(
- "apple-m1", "armv8.5-a", "crypto-neon-fp-armv8",
+ "apple-m1", "armv8.4-a", "crypto-neon-fp-armv8",
AArch64::ExtensionBitset(
{AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2,
AArch64::AEK_SHA3, AArch64::AEK_FP, AArch64::AEK_SIMD,
@@ -1587,7 +1587,7 @@ INSTANTIATE_TEST_SUITE_P(
AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_FP16,
AArch64::AEK_FP16FML, AArch64::AEK_SHA3, AArch64::AEK_JSCVT,
AArch64::AEK_FCMA, AArch64::AEK_PAUTH}),
- "8.5-A"),
+ "8.4-A"),
ARMCPUTestParams<AArch64::ExtensionBitset>(
"apple-m2", "armv8.6-a", "crypto-neon-fp-armv8",
AArch64::ExtensionBitset(
diff --git a/llvm/unittests/Transforms/Utils/CloningTest.cpp b/llvm/unittests/Transforms/Utils/CloningTest.cpp
index 5e302d9..1d0d56a 100644
--- a/llvm/unittests/Transforms/Utils/CloningTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CloningTest.cpp
@@ -844,8 +844,9 @@ TEST(CloneFunction, CloneFunctionWithInlinedSubprograms) {
EXPECT_FALSE(verifyModule(*ImplModule, &errs()));
// Check that DILexicalBlock of inlined function was not cloned.
- auto DbgDeclareI = Func->begin()->begin();
- auto ClonedDbgDeclareI = ClonedFunc->begin()->begin();
+ auto DbgDeclareI = Func->begin()->begin()->getDbgRecordRange().begin();
+ auto ClonedDbgDeclareI =
+ ClonedFunc->begin()->begin()->getDbgRecordRange().begin();
const DebugLoc &DbgLoc = DbgDeclareI->getDebugLoc();
const DebugLoc &ClonedDbgLoc = ClonedDbgDeclareI->getDebugLoc();
EXPECT_NE(DbgLoc.get(), ClonedDbgLoc.get());
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index 9b11767..316d59a 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/PostDominators.h"
@@ -26,6 +27,27 @@
using namespace llvm;
+extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+extern cl::opt<cl::boolOrDefault> PreserveInputDbgFormat;
+extern bool WriteNewDbgInfoFormatToBitcode;
+extern cl::opt<bool> WriteNewDbgInfoFormat;
+
+// Backup all of the existing settings that may be modified when
+// PreserveInputDbgFormat=true, so that when the test is finished we return them
+// (and the "preserve" setting) to their original values.
+static auto SaveDbgInfoFormat() {
+ return make_scope_exit(
+ [OldPreserveInputDbgFormat = PreserveInputDbgFormat.getValue(),
+ OldUseNewDbgInfoFormat = UseNewDbgInfoFormat.getValue(),
+ OldWriteNewDbgInfoFormatToBitcode = WriteNewDbgInfoFormatToBitcode,
+ OldWriteNewDbgInfoFormat = WriteNewDbgInfoFormat.getValue()] {
+ PreserveInputDbgFormat = OldPreserveInputDbgFormat;
+ UseNewDbgInfoFormat = OldUseNewDbgInfoFormat;
+ WriteNewDbgInfoFormatToBitcode = OldWriteNewDbgInfoFormatToBitcode;
+ WriteNewDbgInfoFormat = OldWriteNewDbgInfoFormat;
+ });
+}
+
TEST(Local, RecursivelyDeleteDeadPHINodes) {
LLVMContext C;
@@ -116,7 +138,6 @@ static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
TEST(Local, ReplaceDbgDeclare) {
LLVMContext C;
-
// Original C source to get debug info for a local variable:
// void f() { int x; }
std::unique_ptr<Module> M = parseIR(C,
@@ -124,11 +145,11 @@ TEST(Local, ReplaceDbgDeclare) {
define void @f() !dbg !8 {
entry:
%x = alloca i32, align 4
- call void @llvm.dbg.declare(metadata i32* %x, metadata !11, metadata !DIExpression()), !dbg !13
- call void @llvm.dbg.declare(metadata i32* %x, metadata !11, metadata !DIExpression()), !dbg !13
+ #dbg_declare(ptr %x, !11, !DIExpression(), !13)
+ #dbg_declare(ptr %x, !11, !DIExpression(), !13)
ret void, !dbg !14
}
- declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
@@ -151,20 +172,18 @@ TEST(Local, ReplaceDbgDeclare) {
Instruction *Inst = &F->front().front();
auto *AI = dyn_cast<AllocaInst>(Inst);
ASSERT_TRUE(AI);
- Inst = Inst->getNextNode()->getNextNode();
- ASSERT_TRUE(Inst);
- auto *DII = dyn_cast<DbgDeclareInst>(Inst);
- ASSERT_TRUE(DII);
+
Value *NewBase = Constant::getNullValue(PointerType::getUnqual(C));
DIBuilder DIB(*M);
replaceDbgDeclare(AI, NewBase, DIB, DIExpression::ApplyOffset, 0);
- // There should be exactly two dbg.declares.
- int Declares = 0;
- for (const Instruction &I : F->front())
- if (isa<DbgDeclareInst>(I))
- Declares++;
- EXPECT_EQ(2, Declares);
+ // There should be exactly two dbg.declares, attached to the terminator.
+ Inst = F->front().getTerminator();
+ ASSERT_TRUE(Inst);
+ EXPECT_TRUE(Inst->hasDbgRecords());
+ EXPECT_EQ(range_size(Inst->getDbgRecordRange()), 2u);
+ for (DbgVariableRecord &DVR : filterDbgVars(Inst->getDbgRecordRange()))
+ EXPECT_EQ(DVR.getAddress(), NewBase);
}
/// Build the dominator tree for the function and run the Test.
@@ -499,11 +518,10 @@ struct SalvageDebugInfoTest : ::testing::Test {
entry:
%x = add i32 0, 1
%y = add i32 %x, 2
- call void @llvm.dbg.value(metadata i32 %x, metadata !11, metadata !DIExpression()), !dbg !13
- call void @llvm.dbg.value(metadata i32 %y, metadata !11, metadata !DIExpression()), !dbg !13
+ #dbg_value(i32 %x, !11, !DIExpression(), !13)
+ #dbg_value(i32 %y, !11, !DIExpression(), !13)
ret void, !dbg !14
}
- declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
@@ -526,48 +544,47 @@ struct SalvageDebugInfoTest : ::testing::Test {
ASSERT_TRUE(F);
}
- bool doesDebugValueDescribeX(const DbgValueInst &DI) {
- if (DI.getNumVariableLocationOps() != 1u)
+ bool doesDebugValueDescribeX(const DbgVariableRecord &DVR) {
+ if (DVR.getNumVariableLocationOps() != 1u)
return false;
- const auto &CI = *cast<ConstantInt>(DI.getValue(0));
+ const auto &CI = *cast<ConstantInt>(DVR.getValue(0));
if (CI.isZero())
- return DI.getExpression()->getElements().equals(
+ return DVR.getExpression()->getElements().equals(
{dwarf::DW_OP_plus_uconst, 1, dwarf::DW_OP_stack_value});
else if (CI.isOneValue())
- return DI.getExpression()->getElements().empty();
+ return DVR.getExpression()->getElements().empty();
return false;
}
- bool doesDebugValueDescribeY(const DbgValueInst &DI) {
- if (DI.getNumVariableLocationOps() != 1u)
+ bool doesDebugValueDescribeY(const DbgVariableRecord &DVR) {
+ if (DVR.getNumVariableLocationOps() != 1u)
return false;
- const auto &CI = *cast<ConstantInt>(DI.getVariableLocationOp(0));
+ const auto &CI = *cast<ConstantInt>(DVR.getVariableLocationOp(0));
if (CI.isZero())
- return DI.getExpression()->getElements().equals(
+ return DVR.getExpression()->getElements().equals(
{dwarf::DW_OP_plus_uconst, 3, dwarf::DW_OP_stack_value});
else if (CI.isOneValue())
- return DI.getExpression()->getElements().equals(
+ return DVR.getExpression()->getElements().equals(
{dwarf::DW_OP_plus_uconst, 2, dwarf::DW_OP_stack_value});
return false;
}
void verifyDebugValuesAreSalvaged() {
+ // The function should only contain debug values and a terminator.
+ EXPECT_EQ(F->size(), 1u);
+ EXPECT_TRUE(F->begin()->begin()->isTerminator());
+
// Check that the debug values for %x and %y are preserved.
bool FoundX = false;
bool FoundY = false;
- for (const Instruction &I : F->front()) {
- auto DI = dyn_cast<DbgValueInst>(&I);
- if (!DI) {
- // The function should only contain debug values and a terminator.
- ASSERT_TRUE(I.isTerminator());
- continue;
- }
- EXPECT_EQ(DI->getVariable()->getName(), "x");
- FoundX |= doesDebugValueDescribeX(*DI);
- FoundY |= doesDebugValueDescribeY(*DI);
+ for (DbgVariableRecord &DVR :
+ filterDbgVars(F->begin()->begin()->getDbgRecordRange())) {
+ EXPECT_EQ(DVR.getVariable()->getName(), "x");
+ FoundX |= doesDebugValueDescribeX(DVR);
+ FoundY |= doesDebugValueDescribeY(DVR);
}
- ASSERT_TRUE(FoundX);
- ASSERT_TRUE(FoundY);
+ EXPECT_TRUE(FoundX);
+ EXPECT_TRUE(FoundY);
}
};
@@ -590,6 +607,12 @@ TEST_F(SalvageDebugInfoTest, RecursiveBlockSimplification) {
TEST(Local, wouldInstructionBeTriviallyDead) {
LLVMContext Ctx;
+ // FIXME: PreserveInputDbgFormat is set to true because this test has
+ // been written to expect debug intrinsics rather than debug records.
+ // TODO: This test doesn't have a DbgRecord equivalent form so delete
+ // it when debug intrinsics are removed.
+ auto SettingGuard = SaveDbgInfoFormat();
+ PreserveInputDbgFormat = cl::boolOrDefault::BOU_TRUE;
std::unique_ptr<Module> M = parseIR(Ctx,
R"(
define dso_local void @fun() local_unnamed_addr #0 !dbg !9 {
@@ -683,12 +706,10 @@ TEST(Local, FindDbgUsers) {
R"(
define dso_local void @fun(ptr %a) #0 !dbg !11 {
entry:
- call void @llvm.dbg.assign(metadata ptr %a, metadata !16, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression()), !dbg !19
+ #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19)
ret void
}
- declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3, !9}
!llvm.ident = !{!10}
@@ -715,9 +736,13 @@ TEST(Local, FindDbgUsers) {
verifyModule(*M, &errs(), &BrokenDebugInfo);
ASSERT_FALSE(BrokenDebugInfo);
+ // Convert to debug intrinsics as we want to test findDbgUsers and
+ // findDbgValue's debug-intrinsic-finding code here.
+ // TODO: Remove this test when debug intrinsics are removed.
+ M->convertFromNewDbgValues();
+
Function &Fun = *cast<Function>(M->getNamedValue("fun"));
Value *Arg = Fun.getArg(0);
-
SmallVector<DbgVariableIntrinsic *> Users;
// Arg (%a) is used twice by a single dbg.assign. Check findDbgUsers returns
// only 1 pointer to it rather than 2.
@@ -738,7 +763,7 @@ TEST(Local, FindDbgRecords) {
R"(
define dso_local void @fun(ptr %a) #0 !dbg !11 {
entry:
- call void @llvm.dbg.assign(metadata ptr %a, metadata !16, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression()), !dbg !19
+ #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19)
ret void
}
@@ -767,9 +792,6 @@ TEST(Local, FindDbgRecords) {
bool BrokenDebugInfo = true;
verifyModule(*M, &errs(), &BrokenDebugInfo);
ASSERT_FALSE(BrokenDebugInfo);
- bool NewDbgInfoFormat = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = true;
- M->convertToNewDbgValues();
Function &Fun = *cast<Function>(M->getNamedValue("fun"));
Value *Arg = Fun.getArg(0);
@@ -789,12 +811,10 @@ TEST(Local, FindDbgRecords) {
findDbgValues(Vals, Arg, &Records);
EXPECT_EQ(Vals.size(), 0u);
EXPECT_EQ(Records.size(), 1u);
- UseNewDbgInfoFormat = NewDbgInfoFormat;
}
TEST(Local, ReplaceAllDbgUsesWith) {
using namespace llvm::dwarf;
-
LLVMContext Ctx;
// Note: The datalayout simulates Darwin/x86_64.
@@ -807,39 +827,36 @@ TEST(Local, ReplaceAllDbgUsesWith) {
define void @f() !dbg !6 {
entry:
%a = add i32 0, 1, !dbg !15
- call void @llvm.dbg.value(metadata i32 %a, metadata !9, metadata !DIExpression()), !dbg !15
+ #dbg_value(i32 %a, !9, !DIExpression(), !15)
%b = add i64 0, 1, !dbg !16
- call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression()), !dbg !16
- call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul)), !dbg !16
- call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value)), !dbg !16
- call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_LLVM_fragment, 0, 8)), !dbg !16
- call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_LLVM_fragment, 0, 8)), !dbg !16
- call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8)), !dbg !16
- %c = inttoptr i64 0 to i64*, !dbg !17
- call void @llvm.dbg.declare(metadata i64* %c, metadata !13, metadata !DIExpression()), !dbg !17
+ #dbg_value(i64 %b, !11, !DIExpression(), !16)
+ #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul), !16)
+ #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value), !16)
+ #dbg_value(i64 %b, !11, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !16)
+ #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_LLVM_fragment, 0, 8), !16)
+ #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8), !16)
+ %c = inttoptr i64 0 to ptr, !dbg !17
- %d = inttoptr i64 0 to i32*, !dbg !18
- call void @llvm.dbg.declare(metadata i32* %d, metadata !20, metadata !DIExpression()), !dbg !18
+ #dbg_declare(ptr %c, !13, !DIExpression(), !17)
+ %d = inttoptr i64 0 to ptr, !dbg !18
+ #dbg_declare(ptr %d, !20, !DIExpression(), !18)
%e = add <2 x i16> zeroinitializer, zeroinitializer
- call void @llvm.dbg.value(metadata <2 x i16> %e, metadata !14, metadata !DIExpression()), !dbg !18
+ #dbg_value(<2 x i16> %e, !14, !DIExpression(), !18)
%f = call i32 @escape(i32 0)
- call void @llvm.dbg.value(metadata i32 %f, metadata !9, metadata !DIExpression()), !dbg !15
+ #dbg_value(i32 %f, !9, !DIExpression(), !15)
%barrier = call i32 @escape(i32 0)
%g = call i32 @escape(i32 %f)
- call void @llvm.dbg.value(metadata i32 %g, metadata !9, metadata !DIExpression()), !dbg !15
+ #dbg_value(i32 %g, !9, !DIExpression(), !15)
ret void, !dbg !19
}
- declare void @llvm.dbg.declare(metadata, metadata, metadata)
- declare void @llvm.dbg.value(metadata, metadata, metadata)
-
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!5}
@@ -894,38 +911,47 @@ TEST(Local, ReplaceAllDbgUsesWith) {
EXPECT_TRUE(replaceAllDbgUsesWith(D, C, C, DT));
SmallVector<DbgVariableIntrinsic *, 2> CDbgVals;
- findDbgUsers(CDbgVals, &C);
- EXPECT_EQ(2U, CDbgVals.size());
- EXPECT_TRUE(all_of(CDbgVals, [](DbgVariableIntrinsic *DII) {
- return isa<DbgDeclareInst>(DII);
- }));
+ SmallVector<DbgVariableRecord *, 2> CDbgRecords;
+ findDbgUsers(CDbgVals, &C, &CDbgRecords);
+ EXPECT_EQ(0U, CDbgVals.size());
+ EXPECT_EQ(2U, CDbgRecords.size());
+ EXPECT_TRUE(all_of(
+ CDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); }));
EXPECT_TRUE(replaceAllDbgUsesWith(C, D, D, DT));
SmallVector<DbgVariableIntrinsic *, 2> DDbgVals;
- findDbgUsers(DDbgVals, &D);
- EXPECT_EQ(2U, DDbgVals.size());
- EXPECT_TRUE(all_of(DDbgVals, [](DbgVariableIntrinsic *DII) {
- return isa<DbgDeclareInst>(DII);
- }));
+ SmallVector<DbgVariableRecord *, 2> DDbgRecords;
+ findDbgUsers(DDbgVals, &D, &DDbgRecords);
+ EXPECT_EQ(0U, DDbgVals.size());
+ EXPECT_EQ(2U, DDbgRecords.size());
+ EXPECT_TRUE(all_of(
+ DDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); }));
// Introduce a use-before-def. Check that the dbg.value for %a is salvaged.
EXPECT_TRUE(replaceAllDbgUsesWith(A, F_, F_, DT));
- auto *ADbgVal = cast<DbgValueInst>(A.getNextNode());
- EXPECT_EQ(ADbgVal->getNumVariableLocationOps(), 1u);
- EXPECT_EQ(ConstantInt::get(A.getType(), 0), ADbgVal->getVariableLocationOp(0));
+ EXPECT_FALSE(A.hasDbgRecords());
+ EXPECT_TRUE(B.hasDbgRecords());
+ DbgVariableRecord *BDbgVal =
+ cast<DbgVariableRecord>(&*B.getDbgRecordRange().begin());
+ EXPECT_EQ(BDbgVal->getNumVariableLocationOps(), 1u);
+ EXPECT_EQ(ConstantInt::get(A.getType(), 0),
+ BDbgVal->getVariableLocationOp(0));
// Introduce a use-before-def. Check that the dbg.values for %f become undef.
EXPECT_TRUE(replaceAllDbgUsesWith(F_, G, G, DT));
- auto *FDbgVal = cast<DbgValueInst>(F_.getNextNode());
- EXPECT_EQ(FDbgVal->getNumVariableLocationOps(), 1u);
- EXPECT_TRUE(FDbgVal->isKillLocation());
+ DbgVariableRecord *BarrierDbgVal =
+ cast<DbgVariableRecord>(&*Barrier.getDbgRecordRange().begin());
+ EXPECT_EQ(BarrierDbgVal->getNumVariableLocationOps(), 1u);
+ EXPECT_TRUE(BarrierDbgVal->isKillLocation());
- SmallVector<DbgValueInst *, 1> FDbgVals;
- findDbgValues(FDbgVals, &F_);
- EXPECT_EQ(0U, FDbgVals.size());
+ SmallVector<DbgValueInst *, 1> BarrierDbgVals;
+ SmallVector<DbgVariableRecord *, 8> BarrierDbgRecs;
+ findDbgValues(BarrierDbgVals, &F_, &BarrierDbgRecs);
+ EXPECT_EQ(0U, BarrierDbgVals.size());
+ EXPECT_EQ(0U, BarrierDbgRecs.size());
// Simulate i32 -> i64 conversion to test sign-extension. Here are some
// interesting cases to handle:
@@ -935,13 +961,15 @@ TEST(Local, ReplaceAllDbgUsesWith) {
// 4-6) like (1-3), but with a fragment
EXPECT_TRUE(replaceAllDbgUsesWith(B, A, A, DT));
- SmallVector<DbgValueInst *, 8> ADbgVals;
- findDbgValues(ADbgVals, &A);
- EXPECT_EQ(6U, ADbgVals.size());
+ SmallVector<DbgValueInst *, 8> BDbgVals;
+ SmallVector<DbgVariableRecord *, 8> BDbgRecs;
+ findDbgValues(BDbgVals, &A, &BDbgRecs);
+ EXPECT_EQ(0U, BDbgVals.size());
+ EXPECT_EQ(6U, BDbgRecs.size());
// Check that %a has a dbg.value with a DIExpression matching \p Ops.
auto hasADbgVal = [&](ArrayRef<uint64_t> Ops) {
- return any_of(ADbgVals, [&](DbgValueInst *DVI) {
+ return any_of(BDbgRecs, [&](DbgVariableRecord *DVI) {
assert(DVI->getVariable()->getName() == "2");
return DVI->getExpression()->getElements() == Ops;
});
@@ -1344,6 +1372,11 @@ TEST(Local, ExpressionForConstant) {
TEST(Local, ReplaceDbgVariableRecord) {
LLVMContext C;
+ // FIXME: PreserveInputDbgFormat is set to true because this test has
+ // been written to expect debug intrinsics rather than debug records; use the
+ // intrinsic format until we update the test checks.
+ auto SettingGuard = SaveDbgInfoFormat();
+ PreserveInputDbgFormat = cl::boolOrDefault::BOU_TRUE;
// Test that RAUW also replaces the operands of DbgVariableRecord objects,
// i.e. non-instruction stored debugging information.
diff --git a/llvm/utils/TableGen/DirectiveEmitter.cpp b/llvm/utils/TableGen/DirectiveEmitter.cpp
index 8bc3f02..581bc09 100644
--- a/llvm/utils/TableGen/DirectiveEmitter.cpp
+++ b/llvm/utils/TableGen/DirectiveEmitter.cpp
@@ -228,6 +228,9 @@ static void EmitDirectivesDecl(RecordKeeper &Records, raw_ostream &OS) {
GenerateEnumClass(associations, OS, "Association",
/*Prefix=*/"", DirLang, /*ExportEnums=*/false);
+ GenerateEnumClass(DirLang.getCategories(), OS, "Category", /*Prefix=*/"",
+ DirLang, /*ExportEnums=*/false);
+
// Emit Directive enumeration
GenerateEnumClass(DirLang.getDirectives(), OS, "Directive",
DirLang.getDirectivePrefix(), DirLang,
@@ -264,6 +267,7 @@ static void EmitDirectivesDecl(RecordKeeper &Records, raw_ostream &OS) {
OS << "constexpr std::size_t getMaxLeafCount() { return "
<< GetMaxLeafCount(DirLang) << "; }\n";
OS << "Association getDirectiveAssociation(Directive D);\n";
+ OS << "Category getDirectiveCategory(Directive D);\n";
if (EnumHelperFuncs.length() > 0) {
OS << EnumHelperFuncs;
OS << "\n";
@@ -743,7 +747,29 @@ static void GenerateGetDirectiveAssociation(const DirectiveLanguage &DirLang,
<< "::" << getAssocName(F->second) << ";\n";
}
}
- OS << " } // switch(Dir)\n";
+ OS << " } // switch (Dir)\n";
+ OS << " llvm_unreachable(\"Unexpected directive\");\n";
+ OS << "}\n";
+}
+
+static void GenerateGetDirectiveCategory(const DirectiveLanguage &DirLang,
+ raw_ostream &OS) {
+ std::string LangNamespace = "llvm::" + DirLang.getCppNamespace().str();
+ std::string CategoryTypeName = LangNamespace + "::Category";
+ std::string CategoryNamespace = CategoryTypeName + "::";
+
+ OS << '\n';
+ OS << CategoryTypeName << ' ' << LangNamespace << "::getDirectiveCategory("
+ << GetDirectiveType(DirLang) << " Dir) {\n";
+ OS << " switch (Dir) {\n";
+
+ for (Record *R : DirLang.getDirectives()) {
+ Directive D{R};
+ OS << " case " << GetDirectiveName(DirLang, R) << ":\n";
+ OS << " return " << CategoryNamespace
+ << D.getCategory()->getValueAsString("name") << ";\n";
+ }
+ OS << " } // switch (Dir)\n";
OS << " llvm_unreachable(\"Unexpected directive\");\n";
OS << "}\n";
}
@@ -1196,6 +1222,9 @@ void EmitDirectivesBasicImpl(const DirectiveLanguage &DirLang,
// getDirectiveAssociation(Directive D)
GenerateGetDirectiveAssociation(DirLang, OS);
+ // getDirectiveCategory(Directive D)
+ GenerateGetDirectiveCategory(DirLang, OS);
+
// Leaf table for getLeafConstructs, etc.
EmitLeafTable(DirLang, OS, "LeafConstructTable");
}
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn
index 36957f5..0dc5efc 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn
@@ -55,5 +55,6 @@ static_library("misc") {
"UnusedParametersCheck.cpp",
"UnusedUsingDeclsCheck.cpp",
"UseAnonymousNamespaceCheck.cpp",
+ "UseInternalLinkageCheck.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 8a5f6d1..9d1ec8d 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -320,6 +320,7 @@ if (current_toolchain == default_toolchain) {
"__chrono/convert_to_tm.h",
"__chrono/day.h",
"__chrono/duration.h",
+ "__chrono/exception.h",
"__chrono/file_clock.h",
"__chrono/formatter.h",
"__chrono/hh_mm_ss.h",
@@ -346,6 +347,7 @@ if (current_toolchain == default_toolchain) {
"__chrono/year_month.h",
"__chrono/year_month_day.h",
"__chrono/year_month_weekday.h",
+ "__chrono/zoned_time.h",
"__compare/common_comparison_category.h",
"__compare/compare_partial_order_fallback.h",
"__compare/compare_strong_order_fallback.h",
diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
index 0cd3a0d..765e377 100644
--- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
@@ -320,6 +320,9 @@ if (libcxx_enable_experimental) {
"include/tzdb/types_private.h",
"include/tzdb/tzdb_list_private.h",
"include/tzdb/tzdb_private.h",
+ # TODO TZDB The exception could be moved in chrono once the TZDB library
+ # is no longer experimental.
+ "chrono_exception.cpp",
"time_zone.cpp",
"tzdb.cpp",
"tzdb_list.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
index 879b7f0..2ffe83d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
@@ -103,6 +103,7 @@ static_library("LLVMAArch64CodeGen") {
"//llvm/lib/Transforms/CFGuard",
"//llvm/lib/Transforms/Scalar",
"//llvm/lib/Transforms/Utils",
+ "//llvm/lib/Transforms/Vectorize",
]
include_dirs = [ "." ]
sources = [
@@ -130,7 +131,6 @@ static_library("LLVMAArch64CodeGen") {
"AArch64ISelLowering.cpp",
"AArch64InstrInfo.cpp",
"AArch64LoadStoreOptimizer.cpp",
- "AArch64LoopIdiomTransform.cpp",
"AArch64LowerHomogeneousPrologEpilog.cpp",
"AArch64MCInstLower.cpp",
"AArch64MIPeepholeOpt.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
index 044b781..92337a5 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
@@ -8,6 +8,7 @@ static_library("Vectorize") {
]
sources = [
"LoadStoreVectorizer.cpp",
+ "LoopIdiomVectorize.cpp",
"LoopVectorizationLegality.cpp",
"LoopVectorize.cpp",
"SLPVectorizer.cpp",
diff --git a/llvm/utils/lit/tests/xunit-output.py b/llvm/utils/lit/tests/xunit-output.py
index 92b6932..67d9984 100644
--- a/llvm/utils/lit/tests/xunit-output.py
+++ b/llvm/utils/lit/tests/xunit-output.py
@@ -1,4 +1,4 @@
-# REQUIRES: shell
+# UNSUPPORTED: system-windows
# Check xunit output
# RUN: rm -rf %t.xunit.xml
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
index 550c5c0..1517f71 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
@@ -40,9 +40,14 @@ def ArithIntRangeOpts : Pass<"int-range-optimizations"> {
let summary = "Do optimizations based on integer range analysis";
let description = [{
This pass runs integer range analysis and apllies optimizations based on its
- results. e.g. replace arith.cmpi with const if it can be inferred from
- args ranges.
+ results. It replaces operations with known-constant results with said constants,
+ rewrites `(0 <= %x < D) mod D` to `%x`.
}];
+ // Explicitly depend on "arith" because this pass could create operations in
+ // `arith` out of thin air in some cases.
+ let dependentDialects = [
+ "::mlir::arith::ArithDialect"
+ ];
}
def ArithEmulateUnsupportedFloats : Pass<"arith-emulate-unsupported-floats"> {
diff --git a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.h b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.h
index 156744b..167e5b7 100644
--- a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.h
@@ -27,7 +27,8 @@ namespace arm_sme {
/// Pass to enable Armv9 Streaming SVE mode.
std::unique_ptr<Pass> createEnableArmStreamingPass(
const ArmStreamingMode = ArmStreamingMode::Streaming,
- const ArmZaMode = ArmZaMode::Disabled, bool onlyIfRequiredByOps = false);
+ const ArmZaMode = ArmZaMode::Disabled, bool ifRequiredByOps = false,
+ bool ifContainsScalableVectors = false);
/// Pass that fuses 'arm_sme.outerproduct' ops into 2-way or 4-way widening
/// variants.
diff --git a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
index 869a031..c1f016d 100644
--- a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
@@ -116,10 +116,14 @@ def EnableArmStreaming
"not be used for input and/or output and the "
"function must return with ZA unchanged")
)}]>,
- Option<"onlyIfRequiredByOps", "only-if-required-by-ops", "bool",
+ Option<"ifRequiredByOps", "if-required-by-ops", "bool",
/*default=*/"false",
- "Only apply the selected streaming/ZA modes if the function "
- " contains ops that require them.">
+ "Only apply the selected streaming/ZA modes if the function contains"
+ " ops that implement the ArmSMETileOpInterface.">,
+ Option<"ifContainsScalableVectors", "if-contains-scalable-vectors",
+ "bool", /*default=*/"false",
+ "Only apply the selected streaming/ZA modes if the function contains"
+ " operations that use scalable vector types.">
];
let dependentDialects = ["func::FuncDialect"];
}
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
index 1607304..582ab91 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
@@ -37,17 +37,18 @@ def GPUTargetAttrInterface : AttrInterface<"TargetAttrInterface"> {
is meant to be used for passing additional options that are not in the
attribute.
}],
- "std::optional<SmallVector<char, 0>>", "serializeToObject",
- (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>,
+ "std::optional<::mlir::SmallVector<char, 0>>", "serializeToObject",
+ (ins "::mlir::Operation*":$module,
+ "const ::mlir::gpu::TargetOptions&":$options)>,
InterfaceMethod<[{
Creates a GPU object attribute from a binary string.
The `object` parameter is a binary string. The `options` parameter is
meant to be used for passing additional options that are not in the
attribute.
- }], "Attribute", "createObject",
- (ins "const SmallVector<char, 0>&":$object,
- "const gpu::TargetOptions&":$options)>
+ }], "::mlir::Attribute", "createObject",
+ (ins "const ::llvm::SmallVector<char, 0>&":$object,
+ "const ::mlir::gpu::TargetOptions&":$options)>
];
}
@@ -112,9 +113,10 @@ def OffloadingLLVMTranslationAttrInterface :
The first argument has to be a GPU binary operation.
If the function fails at any point, it must return `failure`.
}],
- "LogicalResult", "embedBinary",
- (ins "Operation*":$binaryOp, "llvm::IRBuilderBase&":$hostBuilder,
- "LLVM::ModuleTranslation&":$hostModuleTranslation)
+ "::mlir::LogicalResult", "embedBinary",
+ (ins "::mlir::Operation*":$binaryOp,
+ "::llvm::IRBuilderBase&":$hostBuilder,
+ "::mlir::LLVM::ModuleTranslation&":$hostModuleTranslation)
>,
InterfaceMethod<[{
Translates a `gpu.launch_func` op into a sequence of LLVM IR
@@ -128,10 +130,10 @@ def OffloadingLLVMTranslationAttrInterface :
respectively. If the function fails at any point, it must return
`failure`.
}],
- "LogicalResult", "launchKernel",
- (ins "Operation*":$launchFunc, "Operation*":$binaryOp,
- "llvm::IRBuilderBase&":$hostBuilder,
- "LLVM::ModuleTranslation&":$hostModuleTranslation)
+ "::mlir::LogicalResult", "launchKernel",
+ (ins "::mlir::Operation*":$launchFunc, "::mlir::Operation*":$binaryOp,
+ "::llvm::IRBuilderBase&":$hostBuilder,
+ "::mlir::LLVM::ModuleTranslation&":$hostModuleTranslation)
>
];
}
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 10719aa..eb81b64 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1241,7 +1241,7 @@ def GPU_BarrierOp : GPU_Op<"barrier"> {
def GPU_GPUModuleOp : GPU_Op<"module", [
DataLayoutOpInterface, HasDefaultDLTIDataLayout, IsolatedFromAbove,
SymbolTable, Symbol, SingleBlockImplicitTerminator<"ModuleEndOp">
- ]>, Arguments<(ins
+ ]>, Arguments<(ins SymbolNameAttr:$sym_name,
OptionalAttr<GPUNonEmptyTargetArrayAttr>:$targets,
OptionalAttr<OffloadingTranslationAttr>:$offloadingHandler)> {
let summary = "A top level compilation unit containing code to be run on a GPU.";
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td
index a93964a..f8e8500 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td
@@ -716,12 +716,15 @@ def FramePointerKindNonLeaf
: LLVM_EnumAttrCase<"NonLeaf", "non-leaf", "NonLeaf", 1>;
def FramePointerKindAll
: LLVM_EnumAttrCase<"All", "all", "All", 2>;
+def FramePointerKindReserved
+ : LLVM_EnumAttrCase<"Reserved", "reserved", "Reserved", 3>;
def FramePointerKindEnum : LLVM_EnumAttr<
"FramePointerKind",
"::llvm::FramePointerKind",
"LLVM FramePointerKind",
- [FramePointerKindNone, FramePointerKindNonLeaf, FramePointerKindAll]> {
+ [FramePointerKindNone, FramePointerKindNonLeaf,
+ FramePointerKindAll, FramePointerKindReserved]> {
let cppNamespace = "::mlir::LLVM::framePointerKind";
}
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
index 3cf81d2..04a6386 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
@@ -17,9 +17,13 @@
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/TensorEncoding.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "llvm/ADT/bit.h"
+
//===----------------------------------------------------------------------===//
//
// Type aliases to help code be more self-documenting. Unfortunately
@@ -54,6 +58,42 @@ struct COOSegment {
}
};
+/// A simple wrapper to encode a bitset of (at most 64) levels, currently used
+/// by `sparse_tensor.iterate` operation for the set of levels on which the
+/// coordinates should be loaded.
+class LevelSet {
+ uint64_t bits = 0;
+
+public:
+ LevelSet() = default;
+ explicit LevelSet(uint64_t bits) : bits(bits) {}
+ operator uint64_t() const { return bits; }
+
+ LevelSet &set(unsigned i) {
+ assert(i < 64);
+ bits |= static_cast<uint64_t>(0x01u) << i;
+ return *this;
+ }
+
+ LevelSet &operator|=(LevelSet lhs) {
+ bits |= static_cast<uint64_t>(lhs);
+ return *this;
+ }
+
+ LevelSet &lshift(unsigned offset) {
+ bits = bits << offset;
+ return *this;
+ }
+
+ bool operator[](unsigned i) const {
+ assert(i < 64);
+ return (bits & (1 << i)) != 0;
+ }
+
+ unsigned count() const { return llvm::popcount(bits); }
+ bool empty() const { return bits == 0; }
+};
+
} // namespace sparse_tensor
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
index 53dd8e3..69b212c 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
@@ -20,6 +20,21 @@ class SparseTensor_Attr<string name,
: AttrDef<SparseTensor_Dialect, name, traits>;
//===----------------------------------------------------------------------===//
+// A simple bitset attribute wrapped around a single int64_t to encode a set of
+// sparse tensor levels.
+//===----------------------------------------------------------------------===//
+
+def LevelSetAttr :
+ TypedAttrBase<
+ I64, "IntegerAttr",
+ And<[CPred<"::llvm::isa<::mlir::IntegerAttr>($_self)">,
+ CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getType().isInteger(64)">]>,
+ "LevelSet attribute"> {
+ let returnType = [{::mlir::sparse_tensor::LevelSet}];
+ let convertFromStorage = [{::mlir::sparse_tensor::LevelSet($_self.getValue().getZExtValue())}];
+}
+
+//===----------------------------------------------------------------------===//
// These attributes are just like `IndexAttr` except that they clarify whether
// the index refers to a dimension (an axis of the semantic tensor) or a level
// (an axis of the actual storage format).
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index 4e4441c..5ae6f9f 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -15,6 +15,8 @@ include "mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td"
include "mlir/Dialect/SparseTensor/IR/SparseTensorInterfaces.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/LoopLikeInterface.td"
//===----------------------------------------------------------------------===//
// Base class.
@@ -1304,7 +1306,7 @@ def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResu
def SparseTensor_YieldOp : SparseTensor_Op<"yield", [Pure, Terminator,
ParentOneOf<["BinaryOp", "UnaryOp", "ReduceOp", "SelectOp",
- "ForeachOp"]>]> {
+ "ForeachOp", "IterateOp"]>]> {
let summary = "Yield from sparse_tensor set-like operations";
let description = [{
Yields a value from within a `binary`, `unary`, `reduce`,
@@ -1476,7 +1478,7 @@ def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space",
the returned iteration space covers. `hiLvl - loLvl` defines the dimension of the
iteration space.
- The type of returned the value is automatically inferred to
+ The type of returned the value is must be
`!sparse_tensor.iter_space<#INPUT_ENCODING, lvls = $loLvl to $hiLvl>`.
The returned iteration space can then be iterated over by
`sparse_tensor.iterate` operations to visit every stored element
@@ -1487,6 +1489,7 @@ def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space",
// Extracts a 1-D iteration space from a COO tensor at level 1.
%space = sparse_tensor.iteration.extract_space %sp at %it1 lvls = 1
: tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0>
+ ->!sparse_tensor.iter_space<#COO, lvls = 1>
```
}];
@@ -1499,20 +1502,120 @@ def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space",
return getHiLvl() - getLoLvl();
}
ArrayRef<::mlir::sparse_tensor::LevelType> getSpaceLvlTypes() {
- return getResultSpace().getType().getLvlTypes();
+ return getExtractedSpace().getType().getLvlTypes();
}
}];
let arguments = (ins AnySparseTensor:$tensor,
Optional<AnySparseIterator>:$parentIter,
LevelAttr:$loLvl, LevelAttr:$hiLvl);
- let results = (outs AnySparseIterSpace:$resultSpace);
+ let results = (outs AnySparseIterSpace:$extractedSpace);
let assemblyFormat = "$tensor (`at` $parentIter^)? `lvls` `=` custom<LevelRange>($loLvl, $hiLvl) "
- " attr-dict `:` type($tensor) (`,` type($parentIter)^)?";
+ " attr-dict `:` type($tensor) (`,` type($parentIter)^)? "
+ "`->` qualified(type($extractedSpace))";
let hasVerifier = 1;
}
+def IterateOp : SparseTensor_Op<"iterate",
+ [RecursiveMemoryEffects, RecursivelySpeculatable,
+ DeclareOpInterfaceMethods<LoopLikeOpInterface,
+ ["getInitsMutable", "getLoopResults", "getRegionIterArgs",
+ "getYieldedValuesMutable"]>,
+ DeclareOpInterfaceMethods<RegionBranchOpInterface,
+ ["getEntrySuccessorOperands"]>,
+ SingleBlockImplicitTerminator<"sparse_tensor::YieldOp">]> {
+
+ let summary = "Iterates over a sparse iteration space";
+ let description = [{
+ The `sparse_tensor.iterate` operation represents a loop (nest) over
+ the provided iteration space extracted from a specific sparse tensor.
+ The operation defines an SSA value for a sparse iterator that points
+ to the current stored element in the sparse tensor and SSA values
+ for coordinates of the stored element. The coordinates are always
+ converted to `index` type despite of the underlying sparse tensor
+ storage. When coordinates are not used, the SSA values can be skipped
+ by `_` symbols, which usually leads to simpler generated code after
+ sparsification. For example:
+
+ ```mlir
+ // The coordinate for level 0 is not used when iterating over a 2-D
+ // iteration space.
+ %sparse_tensor.iterate %iterator in %space at(_, %crd_1)
+ : !sparse_tensor.iter_space<#CSR, lvls = 0 to 2>
+ ```
+
+ `sparse_tensor.iterate` can also operate on loop-carried variables.
+ It returns the final values after loop termination.
+ The initial values of the variables are passed as additional SSA operands
+ to the iterator SSA value and used coordinate SSA values mentioned
+ above. The operation region has an argument for the iterator, variadic
+ arguments for specified (used) coordiates and followed by one argument
+ for each loop-carried variable, representing the value of the variable
+ at the current iteration.
+ The body region must contain exactly one block that terminates with
+ `sparse_tensor.yield`.
+
+ The results of an `sparse_tensor.iterate` hold the final values after
+ the last iteration. If the `sparse_tensor.iterate` defines any values,
+ a yield must be explicitly present.
+ The number and types of the `sparse_tensor.iterate` results must match
+ the initial values in the iter_args binding and the yield operands.
+
+
+ A nested `sparse_tensor.iterate` example that prints all the coordinates
+ stored in the sparse input:
+
+ ```mlir
+ func.func @nested_iterate(%sp : tensor<4x8xf32, #COO>) {
+ // Iterates over the first level of %sp
+ %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0
+ : tensor<4x8xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 0 to 1>
+ %r1 = sparse_tensor.iterate %it1 in %l1 at (%coord0)
+ : !sparse_tensor.iter_space<#COO, lvls = 0 to 1> {
+ // Iterates over the second level of %sp
+ %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1
+ : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0 to 1>
+ -> !sparse_tensor.iter_space<#COO, lvls = 1 to 2>
+ %r2 = sparse_tensor.iterate %it2 in %l2 at (coord1)
+ : !sparse_tensor.iter_space<#COO, lvls = 1 to 2> {
+ vector.print %coord0 : index
+ vector.print %coord1 : index
+ }
+ }
+ }
+
+ ```
+ }];
+
+ let arguments = (ins AnySparseIterSpace:$iterSpace,
+ Variadic<AnyType>:$initArgs,
+ LevelSetAttr:$crdUsedLvls);
+ let results = (outs Variadic<AnyType>:$results);
+ let regions = (region SizedRegion<1>:$region);
+
+ let extraClassDeclaration = [{
+ unsigned getSpaceDim() {
+ return getIterSpace().getType().getSpaceDim();
+ }
+ BlockArgument getIterator() {
+ return getRegion().getArguments().front();
+ }
+ Block::BlockArgListType getCrds() {
+ // The first block argument is iterator, the remaining arguments are
+ // referenced coordinates.
+ return getRegion().getArguments().slice(1, getCrdUsedLvls().count());
+ }
+ unsigned getNumRegionIterArgs() {
+ return getRegion().getArguments().size() - 1 - getCrdUsedLvls().count();
+ }
+ }];
+
+ let hasVerifier = 1;
+ let hasRegionVerifier = 1;
+ let hasCustomAssemblyFormat = 1;
+}
+
//===----------------------------------------------------------------------===//
// Sparse Tensor Debugging and Test-Only Operations.
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index 2562301..ed7b9ec 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -784,6 +784,7 @@ public:
/// place.
class PatternRewriter : public RewriterBase {
public:
+ explicit PatternRewriter(MLIRContext *ctx) : RewriterBase(ctx) {}
using RewriterBase::RewriterBase;
/// A hook used to indicate if the pattern rewriter can recover from failure
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 83198c9..f6c5149 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -1088,8 +1088,9 @@ struct ConversionConfig {
/// An optional listener that is notified about all IR modifications in case
/// dialect conversion succeeds. If the dialect conversion fails and no IR
- /// modifications are visible (i.e., they were all rolled back), no
- /// notifications are sent.
+ /// modifications are visible (i.e., they were all rolled back), or if the
+ /// dialect conversion is an "analysis conversion", no notifications are
+ /// sent (apart from `notifyPatternBegin`/notifyPatternEnd`).
///
/// Note: Notifications are sent in a delayed fashion, when the dialect
/// conversion is guaranteed to succeed. At that point, some IR modifications
diff --git a/mlir/lib/Conversion/FuncToEmitC/FuncToEmitC.cpp b/mlir/lib/Conversion/FuncToEmitC/FuncToEmitC.cpp
index 6a8ecb7..53b7983 100644
--- a/mlir/lib/Conversion/FuncToEmitC/FuncToEmitC.cpp
+++ b/mlir/lib/Conversion/FuncToEmitC/FuncToEmitC.cpp
@@ -31,15 +31,14 @@ public:
LogicalResult
matchAndRewrite(func::CallOp callOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- // Multiple results func was not converted to `emitc.func`.
+ // Multiple results func cannot be converted to `emitc.func`.
if (callOp.getNumResults() > 1)
return rewriter.notifyMatchFailure(
callOp, "only functions with zero or one result can be converted");
- rewriter.replaceOpWithNewOp<emitc::CallOp>(
- callOp,
- callOp.getNumResults() ? callOp.getResult(0).getType() : nullptr,
- adaptor.getOperands(), callOp->getAttrs());
+ rewriter.replaceOpWithNewOp<emitc::CallOp>(callOp, callOp.getResultTypes(),
+ adaptor.getOperands(),
+ callOp->getAttrs());
return success();
}
diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
index 367142a..0a89242 100644
--- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
+++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
@@ -102,10 +102,8 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp,
// assigned to by emitc::assign ops within the loop body.
SmallVector<Value> resultVariables =
createVariablesForResults(forOp, rewriter);
- SmallVector<Value> iterArgsVariables =
- createVariablesForResults(forOp, rewriter);
- assignValues(forOp.getInits(), iterArgsVariables, rewriter, loc);
+ assignValues(forOp.getInits(), resultVariables, rewriter, loc);
emitc::ForOp loweredFor = rewriter.create<emitc::ForOp>(
loc, forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep());
@@ -117,15 +115,12 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp,
SmallVector<Value> replacingValues;
replacingValues.push_back(loweredFor.getInductionVar());
- replacingValues.append(iterArgsVariables.begin(), iterArgsVariables.end());
+ replacingValues.append(resultVariables.begin(), resultVariables.end());
rewriter.mergeBlocks(forOp.getBody(), loweredBody, replacingValues);
- lowerYield(iterArgsVariables, rewriter,
+ lowerYield(resultVariables, rewriter,
cast<scf::YieldOp>(loweredBody->getTerminator()));
- // Copy iterArgs into results after the for loop.
- assignValues(iterArgsVariables, resultVariables, rewriter, loc);
-
rewriter.replaceOp(forOp, resultVariables);
return success();
}
diff --git a/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp b/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp
index 2473169..8005f91 100644
--- a/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp
@@ -8,11 +8,17 @@
#include <utility>
+#include "mlir/Analysis/DataFlowFramework.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Transforms/FoldUtils.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
namespace mlir::arith {
@@ -24,88 +30,50 @@ using namespace mlir;
using namespace mlir::arith;
using namespace mlir::dataflow;
-/// Returns true if 2 integer ranges have intersection.
-static bool intersects(const ConstantIntRanges &lhs,
- const ConstantIntRanges &rhs) {
- return !((lhs.smax().slt(rhs.smin()) || lhs.smin().sgt(rhs.smax())) &&
- (lhs.umax().ult(rhs.umin()) || lhs.umin().ugt(rhs.umax())));
+static std::optional<APInt> getMaybeConstantValue(DataFlowSolver &solver,
+ Value value) {
+ auto *maybeInferredRange =
+ solver.lookupState<IntegerValueRangeLattice>(value);
+ if (!maybeInferredRange || maybeInferredRange->getValue().isUninitialized())
+ return std::nullopt;
+ const ConstantIntRanges &inferredRange =
+ maybeInferredRange->getValue().getValue();
+ return inferredRange.getConstantValue();
}
-static FailureOr<bool> handleEq(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- if (!intersects(lhs, rhs))
- return false;
-
- return failure();
-}
-
-static FailureOr<bool> handleNe(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- if (!intersects(lhs, rhs))
- return true;
-
- return failure();
-}
-
-static FailureOr<bool> handleSlt(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- if (lhs.smax().slt(rhs.smin()))
- return true;
-
- if (lhs.smin().sge(rhs.smax()))
- return false;
-
- return failure();
-}
-
-static FailureOr<bool> handleSle(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- if (lhs.smax().sle(rhs.smin()))
- return true;
-
- if (lhs.smin().sgt(rhs.smax()))
- return false;
-
- return failure();
-}
-
-static FailureOr<bool> handleSgt(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- return handleSlt(std::move(rhs), std::move(lhs));
-}
-
-static FailureOr<bool> handleSge(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- return handleSle(std::move(rhs), std::move(lhs));
-}
-
-static FailureOr<bool> handleUlt(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- if (lhs.umax().ult(rhs.umin()))
- return true;
-
- if (lhs.umin().uge(rhs.umax()))
- return false;
-
- return failure();
-}
-
-static FailureOr<bool> handleUle(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- if (lhs.umax().ule(rhs.umin()))
- return true;
-
- if (lhs.umin().ugt(rhs.umax()))
- return false;
-
- return failure();
-}
-
-static FailureOr<bool> handleUgt(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- return handleUlt(std::move(rhs), std::move(lhs));
-}
-
-static FailureOr<bool> handleUge(ConstantIntRanges lhs, ConstantIntRanges rhs) {
- return handleUle(std::move(rhs), std::move(lhs));
+/// Patterned after SCCP
+static LogicalResult maybeReplaceWithConstant(DataFlowSolver &solver,
+ PatternRewriter &rewriter,
+ Value value) {
+ if (value.use_empty())
+ return failure();
+ std::optional<APInt> maybeConstValue = getMaybeConstantValue(solver, value);
+ if (!maybeConstValue.has_value())
+ return failure();
+
+ Operation *maybeDefiningOp = value.getDefiningOp();
+ Dialect *valueDialect =
+ maybeDefiningOp ? maybeDefiningOp->getDialect()
+ : value.getParentRegion()->getParentOp()->getDialect();
+ Attribute constAttr =
+ rewriter.getIntegerAttr(value.getType(), *maybeConstValue);
+ Operation *constOp = valueDialect->materializeConstant(
+ rewriter, constAttr, value.getType(), value.getLoc());
+ // Fall back to arith.constant if the dialect materializer doesn't know what
+ // to do with an integer constant.
+ if (!constOp)
+ constOp = rewriter.getContext()
+ ->getLoadedDialect<ArithDialect>()
+ ->materializeConstant(rewriter, constAttr, value.getType(),
+ value.getLoc());
+ if (!constOp)
+ return failure();
+
+ rewriter.replaceAllUsesWith(value, constOp->getResult(0));
+ return success();
}
namespace {
-/// This class listens on IR transformations performed during a pass relying on
-/// information from a `DataflowSolver`. It erases state associated with the
-/// erased operation and its results from the `DataFlowSolver` so that Patterns
-/// do not accidentally query old state information for newly created Ops.
class DataFlowListener : public RewriterBase::Listener {
public:
DataFlowListener(DataFlowSolver &s) : s(s) {}
@@ -120,52 +88,95 @@ protected:
DataFlowSolver &s;
};
-struct ConvertCmpOp : public OpRewritePattern<arith::CmpIOp> {
+/// Rewrite any results of `op` that were inferred to be constant integers to
+/// and replace their uses with that constant. Return success() if all results
+/// where thus replaced and the operation is erased. Also replace any block
+/// arguments with their constant values.
+struct MaterializeKnownConstantValues : public RewritePattern {
+ MaterializeKnownConstantValues(MLIRContext *context, DataFlowSolver &s)
+ : RewritePattern(Pattern::MatchAnyOpTypeTag(), /*benefit=*/1, context),
+ solver(s) {}
+
+ LogicalResult match(Operation *op) const override {
+ if (matchPattern(op, m_Constant()))
+ return failure();
- ConvertCmpOp(MLIRContext *context, DataFlowSolver &s)
- : OpRewritePattern<arith::CmpIOp>(context), solver(s) {}
+ auto needsReplacing = [&](Value v) {
+ return getMaybeConstantValue(solver, v).has_value() && !v.use_empty();
+ };
+ bool hasConstantResults = llvm::any_of(op->getResults(), needsReplacing);
+ if (op->getNumRegions() == 0)
+ return success(hasConstantResults);
+ bool hasConstantRegionArgs = false;
+ for (Region &region : op->getRegions()) {
+ for (Block &block : region.getBlocks()) {
+ hasConstantRegionArgs |=
+ llvm::any_of(block.getArguments(), needsReplacing);
+ }
+ }
+ return success(hasConstantResults || hasConstantRegionArgs);
+ }
- LogicalResult matchAndRewrite(arith::CmpIOp op,
+ void rewrite(Operation *op, PatternRewriter &rewriter) const override {
+ bool replacedAll = (op->getNumResults() != 0);
+ for (Value v : op->getResults())
+ replacedAll &=
+ (succeeded(maybeReplaceWithConstant(solver, rewriter, v)) ||
+ v.use_empty());
+ if (replacedAll && isOpTriviallyDead(op)) {
+ rewriter.eraseOp(op);
+ return;
+ }
+
+ PatternRewriter::InsertionGuard guard(rewriter);
+ for (Region &region : op->getRegions()) {
+ for (Block &block : region.getBlocks()) {
+ rewriter.setInsertionPointToStart(&block);
+ for (BlockArgument &arg : block.getArguments()) {
+ (void)maybeReplaceWithConstant(solver, rewriter, arg);
+ }
+ }
+ }
+ }
+
+private:
+ DataFlowSolver &solver;
+};
+
+template <typename RemOp>
+struct DeleteTrivialRem : public OpRewritePattern<RemOp> {
+ DeleteTrivialRem(MLIRContext *context, DataFlowSolver &s)
+ : OpRewritePattern<RemOp>(context), solver(s) {}
+
+ LogicalResult matchAndRewrite(RemOp op,
PatternRewriter &rewriter) const override {
- auto *lhsResult =
- solver.lookupState<dataflow::IntegerValueRangeLattice>(op.getLhs());
- if (!lhsResult || lhsResult->getValue().isUninitialized())
+ Value lhs = op.getOperand(0);
+ Value rhs = op.getOperand(1);
+ auto maybeModulus = getConstantIntValue(rhs);
+ if (!maybeModulus.has_value())
return failure();
-
- auto *rhsResult =
- solver.lookupState<dataflow::IntegerValueRangeLattice>(op.getRhs());
- if (!rhsResult || rhsResult->getValue().isUninitialized())
+ int64_t modulus = *maybeModulus;
+ if (modulus <= 0)
return failure();
-
- using HandlerFunc =
- FailureOr<bool> (*)(ConstantIntRanges, ConstantIntRanges);
- std::array<HandlerFunc, arith::getMaxEnumValForCmpIPredicate() + 1>
- handlers{};
- using Pred = arith::CmpIPredicate;
- handlers[static_cast<size_t>(Pred::eq)] = &handleEq;
- handlers[static_cast<size_t>(Pred::ne)] = &handleNe;
- handlers[static_cast<size_t>(Pred::slt)] = &handleSlt;
- handlers[static_cast<size_t>(Pred::sle)] = &handleSle;
- handlers[static_cast<size_t>(Pred::sgt)] = &handleSgt;
- handlers[static_cast<size_t>(Pred::sge)] = &handleSge;
- handlers[static_cast<size_t>(Pred::ult)] = &handleUlt;
- handlers[static_cast<size_t>(Pred::ule)] = &handleUle;
- handlers[static_cast<size_t>(Pred::ugt)] = &handleUgt;
- handlers[static_cast<size_t>(Pred::uge)] = &handleUge;
-
- HandlerFunc handler = handlers[static_cast<size_t>(op.getPredicate())];
- if (!handler)
+ auto *maybeLhsRange = solver.lookupState<IntegerValueRangeLattice>(lhs);
+ if (!maybeLhsRange || maybeLhsRange->getValue().isUninitialized())
return failure();
-
- ConstantIntRanges lhsValue = lhsResult->getValue().getValue();
- ConstantIntRanges rhsValue = rhsResult->getValue().getValue();
- FailureOr<bool> result = handler(lhsValue, rhsValue);
-
- if (failed(result))
+ const ConstantIntRanges &lhsRange = maybeLhsRange->getValue().getValue();
+ const APInt &min = isa<RemUIOp>(op) ? lhsRange.umin() : lhsRange.smin();
+ const APInt &max = isa<RemUIOp>(op) ? lhsRange.umax() : lhsRange.smax();
+ // The minima and maxima here are given as closed ranges, we must be
+ // strictly less than the modulus.
+ if (min.isNegative() || min.uge(modulus))
+ return failure();
+ if (max.isNegative() || max.uge(modulus))
+ return failure();
+ if (!min.ule(max))
return failure();
- rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(
- op, static_cast<int64_t>(*result), /*width*/ 1);
+ // With all those conditions out of the way, we know thas this invocation of
+ // a remainder is a noop because the input is strictly within the range
+ // [0, modulus), so get rid of it.
+ rewriter.replaceOp(op, ValueRange{lhs});
return success();
}
@@ -201,7 +212,8 @@ struct IntRangeOptimizationsPass
void mlir::arith::populateIntRangeOptimizationsPatterns(
RewritePatternSet &patterns, DataFlowSolver &solver) {
- patterns.add<ConvertCmpOp>(patterns.getContext(), solver);
+ patterns.add<MaterializeKnownConstantValues, DeleteTrivialRem<RemSIOp>,
+ DeleteTrivialRem<RemUIOp>>(patterns.getContext(), solver);
}
std::unique_ptr<Pass> mlir::arith::createIntRangeOptimizationsPass() {
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/EnableArmStreaming.cpp b/mlir/lib/Dialect/ArmSME/Transforms/EnableArmStreaming.cpp
index 79a6caf..fb4bb41 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/EnableArmStreaming.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/EnableArmStreaming.cpp
@@ -58,17 +58,25 @@ constexpr StringLiteral
struct EnableArmStreamingPass
: public arm_sme::impl::EnableArmStreamingBase<EnableArmStreamingPass> {
EnableArmStreamingPass(ArmStreamingMode streamingMode, ArmZaMode zaMode,
- bool onlyIfRequiredByOps) {
+ bool ifRequiredByOps, bool ifContainsScalableVectors) {
this->streamingMode = streamingMode;
this->zaMode = zaMode;
- this->onlyIfRequiredByOps = onlyIfRequiredByOps;
+ this->ifRequiredByOps = ifRequiredByOps;
+ this->ifContainsScalableVectors = ifContainsScalableVectors;
}
void runOnOperation() override {
- auto op = getOperation();
+ auto function = getOperation();
- if (onlyIfRequiredByOps) {
+ if (ifRequiredByOps && ifContainsScalableVectors) {
+ function->emitOpError(
+ "enable-arm-streaming: `if-required-by-ops` and "
+ "`if-contains-scalable-vectors` are mutually exclusive");
+ return signalPassFailure();
+ }
+
+ if (ifRequiredByOps) {
bool foundTileOp = false;
- op.walk([&](Operation *op) {
+ function.walk([&](Operation *op) {
if (llvm::isa<ArmSMETileOpInterface>(op)) {
foundTileOp = true;
return WalkResult::interrupt();
@@ -79,27 +87,46 @@ struct EnableArmStreamingPass
return;
}
- if (op->getAttr(kEnableArmStreamingIgnoreAttr) ||
+ if (ifContainsScalableVectors) {
+ bool foundScalableVector = false;
+ auto isScalableVector = [&](Type type) {
+ if (auto vectorType = dyn_cast<VectorType>(type))
+ return vectorType.isScalable();
+ return false;
+ };
+ function.walk([&](Operation *op) {
+ if (llvm::any_of(op->getOperandTypes(), isScalableVector) ||
+ llvm::any_of(op->getResultTypes(), isScalableVector)) {
+ foundScalableVector = true;
+ return WalkResult::interrupt();
+ }
+ return WalkResult::advance();
+ });
+ if (!foundScalableVector)
+ return;
+ }
+
+ if (function->getAttr(kEnableArmStreamingIgnoreAttr) ||
streamingMode == ArmStreamingMode::Disabled)
return;
auto unitAttr = UnitAttr::get(&getContext());
- op->setAttr(stringifyArmStreamingMode(streamingMode), unitAttr);
+ function->setAttr(stringifyArmStreamingMode(streamingMode), unitAttr);
// The pass currently only supports enabling ZA when in streaming-mode, but
// ZA can be accessed by the SME LDR, STR and ZERO instructions when not in
// streaming-mode (see section B1.1.1, IDGNQM of spec [1]). It may be worth
// supporting this later.
if (zaMode != ArmZaMode::Disabled)
- op->setAttr(stringifyArmZaMode(zaMode), unitAttr);
+ function->setAttr(stringifyArmZaMode(zaMode), unitAttr);
}
};
} // namespace
std::unique_ptr<Pass> mlir::arm_sme::createEnableArmStreamingPass(
const ArmStreamingMode streamingMode, const ArmZaMode zaMode,
- bool onlyIfRequiredByOps) {
- return std::make_unique<EnableArmStreamingPass>(streamingMode, zaMode,
- onlyIfRequiredByOps);
+ bool ifRequiredByOps, bool ifContainsScalableVectors) {
+ return std::make_unique<EnableArmStreamingPass>(
+ streamingMode, zaMode, ifRequiredByOps, ifContainsScalableVectors);
}
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp
index a8ec111..bd5c4d4 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp
@@ -822,10 +822,11 @@ FailureOr<Operation *> BufferDeallocation::handleInterface(CallOpInterface op) {
// Lookup the function operation and check if it has private visibility. If
// the function is referenced by SSA value instead of a Symbol, it's assumed
- // to be always private.
+ // to be public. (And we cannot easily change the type of the SSA value
+ // anyway.)
Operation *funcOp = op.resolveCallable(state.getSymbolTable());
- bool isPrivate = true;
- if (auto symbol = dyn_cast<SymbolOpInterface>(funcOp))
+ bool isPrivate = false;
+ if (auto symbol = dyn_cast_or_null<SymbolOpInterface>(funcOp))
isPrivate = symbol.isPrivate() && !symbol.isDeclaration();
// If the private-function-dynamic-ownership option is enabled and we are
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 0c2590d..d8e29da 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1730,12 +1730,11 @@ void GPUModuleOp::build(OpBuilder &builder, OperationState &result,
StringRef name, ArrayAttr targets,
Attribute offloadingHandler) {
ensureTerminator(*result.addRegion(), builder, result.location);
- result.attributes.push_back(builder.getNamedAttr(
- ::mlir::SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)));
Properties &props = result.getOrAddProperties<Properties>();
if (targets)
props.targets = targets;
+ props.setSymName(builder.getStringAttr(name));
props.offloadingHandler = offloadingHandler;
}
@@ -1751,11 +1750,11 @@ ParseResult GPUModuleOp::parse(OpAsmParser &parser, OperationState &result) {
StringAttr nameAttr;
ArrayAttr targetsAttr;
- if (parser.parseSymbolName(nameAttr, mlir::SymbolTable::getSymbolAttrName(),
- result.attributes))
+ if (parser.parseSymbolName(nameAttr))
return failure();
Properties &props = result.getOrAddProperties<Properties>();
+ props.setSymName(nameAttr);
// Parse the optional offloadingHandler
if (succeeded(parser.parseOptionalLess())) {
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 4adb1c1..232d25d 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -2130,6 +2130,106 @@ static void printLevelRange(OpAsmPrinter &p, Operation *, IntegerAttr lvlLo,
printLevelRange(p, lo, hi);
}
+static ParseResult
+parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state,
+ SmallVectorImpl<OpAsmParser::Argument> &iterators,
+ SmallVectorImpl<OpAsmParser::Argument> &iterArgs) {
+ SmallVector<OpAsmParser::UnresolvedOperand> spaces;
+ SmallVector<OpAsmParser::UnresolvedOperand> initArgs;
+
+ // Parse "%iters, ... in %spaces, ..."
+ if (parser.parseArgumentList(iterators) || parser.parseKeyword("in") ||
+ parser.parseOperandList(spaces))
+ return failure();
+
+ if (iterators.size() != spaces.size())
+ return parser.emitError(
+ parser.getNameLoc(),
+ "mismatch in number of sparse iterators and sparse spaces");
+
+ // Parse "at(%crd0, _, ...)"
+ LevelSet crdUsedLvlSet;
+ bool hasUsedCrds = succeeded(parser.parseOptionalKeyword("at"));
+ unsigned lvlCrdCnt = 0;
+ if (hasUsedCrds) {
+ ParseResult crdList = parser.parseCommaSeparatedList(
+ OpAsmParser::Delimiter::Paren, [&]() -> ParseResult {
+ if (parser.parseOptionalKeyword("_")) {
+ if (parser.parseArgument(iterArgs.emplace_back()))
+ return failure();
+ // Always use IndexType for the coordinate.
+ crdUsedLvlSet.set(lvlCrdCnt);
+ iterArgs.back().type = parser.getBuilder().getIndexType();
+ }
+ lvlCrdCnt += 1;
+ return success();
+ });
+ if (failed(crdList)) {
+ return parser.emitError(
+ parser.getNameLoc(),
+ "expecting SSA value or \"_\" for level coordinates");
+ }
+ }
+ // Set the CrdUsedLvl bitset.
+ state.addAttribute("crdUsedLvls",
+ parser.getBuilder().getI64IntegerAttr(crdUsedLvlSet));
+
+ // Parse "iter_args(%arg = %init, ...)"
+ bool hasIterArgs = succeeded(parser.parseOptionalKeyword("iter_args"));
+ if (hasIterArgs)
+ if (parser.parseAssignmentList(iterArgs, initArgs))
+ return failure();
+
+ SmallVector<Type> iterSpaceTps;
+ // parse ": sparse_tensor.iter_space -> ret"
+ if (parser.parseColon() || parser.parseTypeList(iterSpaceTps))
+ return failure();
+ if (iterSpaceTps.size() != spaces.size())
+ return parser.emitError(parser.getNameLoc(),
+ "mismatch in number of iteration space operands "
+ "and iteration space types");
+
+ for (auto [it, tp] : llvm::zip_equal(iterators, iterSpaceTps)) {
+ IterSpaceType spaceTp = llvm::dyn_cast<IterSpaceType>(tp);
+ if (!spaceTp)
+ return parser.emitError(parser.getNameLoc(),
+ "expected sparse_tensor.iter_space type for "
+ "iteration space operands");
+ if (hasUsedCrds && spaceTp.getSpaceDim() != lvlCrdCnt)
+ return parser.emitError(parser.getNameLoc(),
+ "mismatch in number of iteration space dimension "
+ "and specified coordinates");
+ it.type = spaceTp.getIteratorType();
+ }
+
+ if (hasIterArgs)
+ if (parser.parseArrowTypeList(state.types))
+ return failure();
+
+ // Resolves input operands.
+ if (parser.resolveOperands(spaces, iterSpaceTps, parser.getNameLoc(),
+ state.operands))
+ return failure();
+
+ if (hasIterArgs) {
+ unsigned numCrds = crdUsedLvlSet.count();
+ // Strip off leading args that used for coordinates.
+ MutableArrayRef args = MutableArrayRef(iterArgs).drop_front(numCrds);
+ if (args.size() != initArgs.size() || args.size() != state.types.size()) {
+ return parser.emitError(
+ parser.getNameLoc(),
+ "mismatch in number of iteration arguments and return values");
+ }
+
+ for (auto [it, init, tp] : llvm::zip_equal(args, initArgs, state.types)) {
+ it.type = tp;
+ if (parser.resolveOperand(init, tp, state.operands))
+ return failure();
+ }
+ }
+ return success();
+}
+
LogicalResult ExtractIterSpaceOp::inferReturnTypes(
MLIRContext *ctx, std::optional<Location> loc, ValueRange ops,
DictionaryAttr attr, OpaqueProperties prop, RegionRange region,
@@ -2153,7 +2253,7 @@ LogicalResult ExtractIterSpaceOp::verify() {
}
if (pIter) {
- IterSpaceType spaceTp = getResultSpace().getType();
+ IterSpaceType spaceTp = getExtractedSpace().getType();
if (pIter.getType().getEncoding() != spaceTp.getEncoding())
return emitOpError(
"mismatch in parent iterator encoding and iteration space encoding.");
@@ -2166,6 +2266,161 @@ LogicalResult ExtractIterSpaceOp::verify() {
return success();
}
+ParseResult IterateOp::parse(OpAsmParser &parser, OperationState &result) {
+ OpAsmParser::Argument iterator;
+ OpAsmParser::UnresolvedOperand iterSpace;
+
+ SmallVector<OpAsmParser::Argument> iters, iterArgs;
+ if (parseSparseSpaceLoop(parser, result, iters, iterArgs))
+ return failure();
+ if (iters.size() != 1)
+ return parser.emitError(parser.getNameLoc(),
+ "expected only one iterator/iteration space");
+
+ iters.append(iterArgs);
+ Region *body = result.addRegion();
+ if (parser.parseRegion(*body, iters))
+ return failure();
+
+ IterateOp::ensureTerminator(*body, parser.getBuilder(), result.location);
+
+ // Parse the optional attribute list.
+ if (parser.parseOptionalAttrDict(result.attributes))
+ return failure();
+
+ return success();
+}
+
+/// Prints the initialization list in the form of
+/// <prefix>(%inner = %outer, %inner2 = %outer2, <...>)
+/// where 'inner' values are assumed to be region arguments and 'outer' values
+/// are regular SSA values.
+static void printInitializationList(OpAsmPrinter &p,
+ Block::BlockArgListType blocksArgs,
+ ValueRange initializers,
+ StringRef prefix = "") {
+ assert(blocksArgs.size() == initializers.size() &&
+ "expected same length of arguments and initializers");
+ if (initializers.empty())
+ return;
+
+ p << prefix << '(';
+ llvm::interleaveComma(llvm::zip(blocksArgs, initializers), p, [&](auto it) {
+ p << std::get<0>(it) << " = " << std::get<1>(it);
+ });
+ p << ")";
+}
+
+static void printUsedCrdsList(OpAsmPrinter &p, unsigned spaceDim,
+ Block::BlockArgListType blocksArgs,
+ LevelSet crdUsedLvls) {
+ if (crdUsedLvls.empty())
+ return;
+
+ p << " at(";
+ for (unsigned i = 0; i < spaceDim; i++) {
+ if (crdUsedLvls[i]) {
+ p << blocksArgs.front();
+ blocksArgs = blocksArgs.drop_front();
+ } else {
+ p << "_";
+ }
+ if (i != spaceDim - 1)
+ p << ", ";
+ }
+ assert(blocksArgs.empty());
+ p << ")";
+}
+
+void IterateOp::print(OpAsmPrinter &p) {
+ p << " " << getIterator() << " in " << getIterSpace();
+ printUsedCrdsList(p, getSpaceDim(), getCrds(), getCrdUsedLvls());
+ printInitializationList(p, getRegionIterArgs(), getInitArgs(), " iter_args");
+
+ p << " : " << getIterSpace().getType() << " ";
+ if (!getInitArgs().empty())
+ p << "-> (" << getInitArgs().getTypes() << ") ";
+
+ p.printRegion(getRegion(), /*printEntryBlockArgs=*/false,
+ /*printBlockTerminators=*/!getInitArgs().empty());
+}
+
+LogicalResult IterateOp::verify() {
+ if (getInitArgs().size() != getNumResults()) {
+ return emitOpError(
+ "mismatch in number of loop-carried values and defined values");
+ }
+ return success();
+}
+
+LogicalResult IterateOp::verifyRegions() {
+ if (getIterator().getType() != getIterSpace().getType().getIteratorType())
+ return emitOpError("mismatch in iterator and iteration space type");
+ if (getNumRegionIterArgs() != getNumResults())
+ return emitOpError(
+ "mismatch in number of basic block args and defined values");
+
+ auto initArgs = getInitArgs();
+ auto iterArgs = getRegionIterArgs();
+ auto yieldVals = getYieldedValues();
+ auto opResults = getResults();
+ if (!llvm::all_equal({initArgs.size(), iterArgs.size(), yieldVals.size(),
+ opResults.size()})) {
+ return emitOpError() << "number mismatch between iter args and results.";
+ }
+
+ for (auto [i, init, iter, yield, ret] :
+ llvm::enumerate(initArgs, iterArgs, yieldVals, opResults)) {
+ if (init.getType() != ret.getType())
+ return emitOpError() << "types mismatch between " << i
+ << "th iter operand and defined value";
+ if (iter.getType() != ret.getType())
+ return emitOpError() << "types mismatch between " << i
+ << "th iter region arg and defined value";
+ if (yield.getType() != ret.getType())
+ return emitOpError() << "types mismatch between " << i
+ << "th yield value and defined value";
+ }
+
+ return success();
+}
+
+/// OpInterfaces' methods implemented by IterateOp.
+SmallVector<Region *> IterateOp::getLoopRegions() { return {&getRegion()}; }
+
+MutableArrayRef<OpOperand> IterateOp::getInitsMutable() {
+ return getInitArgsMutable();
+}
+
+Block::BlockArgListType IterateOp::getRegionIterArgs() {
+ return getRegion().getArguments().take_back(getNumRegionIterArgs());
+}
+
+std::optional<MutableArrayRef<OpOperand>> IterateOp::getYieldedValuesMutable() {
+ return cast<sparse_tensor::YieldOp>(
+ getRegion().getBlocks().front().getTerminator())
+ .getResultsMutable();
+}
+
+std::optional<ResultRange> IterateOp::getLoopResults() { return getResults(); }
+
+OperandRange IterateOp::getEntrySuccessorOperands(RegionBranchPoint point) {
+ return getInitArgs();
+}
+
+void IterateOp::getSuccessorRegions(RegionBranchPoint point,
+ SmallVectorImpl<RegionSuccessor> &regions) {
+ // Both the operation itself and the region may be branching into the body or
+ // back into the operation itself.
+ regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs()));
+ // It is possible for loop not to enter the body.
+ regions.push_back(RegionSuccessor(getResults()));
+}
+
+//===----------------------------------------------------------------------===//
+// Sparse Tensor Dialect Setups.
+//===----------------------------------------------------------------------===//
+
/// Materialize a single constant operation from a given attribute value with
/// the desired resultant type.
Operation *SparseTensorDialect::materializeConstant(OpBuilder &builder,
diff --git a/mlir/lib/Query/QueryParser.cpp b/mlir/lib/Query/QueryParser.cpp
index 595055a..8a03463 100644
--- a/mlir/lib/Query/QueryParser.cpp
+++ b/mlir/lib/Query/QueryParser.cpp
@@ -91,13 +91,11 @@ QueryRef QueryParser::endQuery(QueryRef queryRef) {
llvm::StringRef extra = line;
llvm::StringRef extraTrimmed = extra.ltrim(" \t\v\f\r");
- if ((!extraTrimmed.empty() && extraTrimmed[0] == '\n') ||
- (extraTrimmed.size() >= 2 && extraTrimmed[0] == '\r' &&
- extraTrimmed[1] == '\n'))
+ if (extraTrimmed.starts_with('\n') || extraTrimmed.starts_with("\r\n"))
queryRef->remainingContent = extra;
else {
llvm::StringRef trailingWord = lexWord();
- if (!trailingWord.empty() && trailingWord.front() == '#') {
+ if (trailingWord.starts_with('#')) {
line = line.drop_until([](char c) { return c == '\n'; });
line = line.drop_while([](char c) { return c == '\n'; });
return endQuery(queryRef);
diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
index cfd4f9c0..597cb29 100644
--- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
+++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
@@ -319,8 +319,7 @@ private:
/// This abstract class manages the worklist and contains helper methods for
/// rewriting ops on the worklist. Derived classes specify how ops are added
/// to the worklist in the beginning.
-class GreedyPatternRewriteDriver : public PatternRewriter,
- public RewriterBase::Listener {
+class GreedyPatternRewriteDriver : public RewriterBase::Listener {
protected:
explicit GreedyPatternRewriteDriver(MLIRContext *ctx,
const FrozenRewritePatternSet &patterns,
@@ -339,7 +338,8 @@ protected:
/// Notify the driver that the specified operation was inserted. Update the
/// worklist as needed: The operation is enqueued depending on scope and
/// strict mode.
- void notifyOperationInserted(Operation *op, InsertPoint previous) override;
+ void notifyOperationInserted(Operation *op,
+ OpBuilder::InsertPoint previous) override;
/// Notify the driver that the specified operation was removed. Update the
/// worklist as needed: The operation and its children are removed from the
@@ -354,6 +354,10 @@ protected:
/// reached. Return `true` if any IR was changed.
bool processWorklist();
+ /// The pattern rewriter that is used for making IR modifications and is
+ /// passed to rewrite patterns.
+ PatternRewriter rewriter;
+
/// The worklist for this transformation keeps track of the operations that
/// need to be (re)visited.
#ifdef MLIR_GREEDY_REWRITE_RANDOMIZER_SEED
@@ -407,7 +411,7 @@ private:
GreedyPatternRewriteDriver::GreedyPatternRewriteDriver(
MLIRContext *ctx, const FrozenRewritePatternSet &patterns,
const GreedyRewriteConfig &config)
- : PatternRewriter(ctx), config(config), matcher(patterns)
+ : rewriter(ctx), config(config), matcher(patterns)
#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
// clang-format off
, expensiveChecks(
@@ -423,9 +427,9 @@ GreedyPatternRewriteDriver::GreedyPatternRewriteDriver(
#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
// Send IR notifications to the debug handler. This handler will then forward
// all notifications to this GreedyPatternRewriteDriver.
- setListener(&expensiveChecks);
+ rewriter.setListener(&expensiveChecks);
#else
- setListener(this);
+ rewriter.setListener(this);
#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
}
@@ -473,7 +477,7 @@ bool GreedyPatternRewriteDriver::processWorklist() {
// If the operation is trivially dead - remove it.
if (isOpTriviallyDead(op)) {
- eraseOp(op);
+ rewriter.eraseOp(op);
changed = true;
LLVM_DEBUG(logResultWithLine("success", "operation is trivially dead"));
@@ -505,8 +509,8 @@ bool GreedyPatternRewriteDriver::processWorklist() {
// Op results can be replaced with `foldResults`.
assert(foldResults.size() == op->getNumResults() &&
"folder produced incorrect number of results");
- OpBuilder::InsertionGuard g(*this);
- setInsertionPoint(op);
+ OpBuilder::InsertionGuard g(rewriter);
+ rewriter.setInsertionPoint(op);
SmallVector<Value> replacements;
bool materializationSucceeded = true;
for (auto [ofr, resultType] :
@@ -519,7 +523,7 @@ bool GreedyPatternRewriteDriver::processWorklist() {
}
// Materialize Attributes as SSA values.
Operation *constOp = op->getDialect()->materializeConstant(
- *this, ofr.get<Attribute>(), resultType, op->getLoc());
+ rewriter, ofr.get<Attribute>(), resultType, op->getLoc());
if (!constOp) {
// If materialization fails, cleanup any operations generated for
@@ -532,7 +536,7 @@ bool GreedyPatternRewriteDriver::processWorklist() {
replacementOps.insert(replacement.getDefiningOp());
}
for (Operation *op : replacementOps) {
- eraseOp(op);
+ rewriter.eraseOp(op);
}
materializationSucceeded = false;
@@ -547,7 +551,7 @@ bool GreedyPatternRewriteDriver::processWorklist() {
}
if (materializationSucceeded) {
- replaceOp(op, replacements);
+ rewriter.replaceOp(op, replacements);
changed = true;
LLVM_DEBUG(logSuccessfulFolding(dumpRootOp));
#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
@@ -608,7 +612,7 @@ bool GreedyPatternRewriteDriver::processWorklist() {
#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
LogicalResult matchResult =
- matcher.matchAndRewrite(op, *this, canApply, onFailure, onSuccess);
+ matcher.matchAndRewrite(op, rewriter, canApply, onFailure, onSuccess);
if (succeeded(matchResult)) {
LLVM_DEBUG(logResultWithLine("success", "pattern matched"));
@@ -664,8 +668,8 @@ void GreedyPatternRewriteDriver::notifyBlockErased(Block *block) {
config.listener->notifyBlockErased(block);
}
-void GreedyPatternRewriteDriver::notifyOperationInserted(Operation *op,
- InsertPoint previous) {
+void GreedyPatternRewriteDriver::notifyOperationInserted(
+ Operation *op, OpBuilder::InsertPoint previous) {
LLVM_DEBUG({
logger.startLine() << "** Insert : '" << op->getName() << "'(" << op
<< ")\n";
@@ -822,7 +826,7 @@ private:
LogicalResult RegionPatternRewriteDriver::simplify(bool *changed) && {
bool continueRewrites = false;
int64_t iteration = 0;
- MLIRContext *ctx = getContext();
+ MLIRContext *ctx = rewriter.getContext();
do {
// Check if the iteration limit was reached.
if (++iteration > config.maxIterations &&
@@ -834,7 +838,7 @@ LogicalResult RegionPatternRewriteDriver::simplify(bool *changed) && {
// `OperationFolder` CSE's constant ops (and may move them into parents
// regions to enable more aggressive CSE'ing).
- OperationFolder folder(getContext(), this);
+ OperationFolder folder(ctx, this);
auto insertKnownConstant = [&](Operation *op) {
// Check for existing constants when populating the worklist. This avoids
// accidentally reversing the constant order during processing.
@@ -872,7 +876,7 @@ LogicalResult RegionPatternRewriteDriver::simplify(bool *changed) && {
// After applying patterns, make sure that the CFG of each of the
// regions is kept up to date.
if (config.enableRegionSimplification)
- continueRewrites |= succeeded(simplifyRegions(*this, region));
+ continueRewrites |= succeeded(simplifyRegions(rewriter, region));
},
{&region}, iteration);
} while (continueRewrites);
diff --git a/mlir/python/mlir/ir.py b/mlir/python/mlir/ir.py
index 80c965b..a9ac765 100644
--- a/mlir/python/mlir/ir.py
+++ b/mlir/python/mlir/ir.py
@@ -68,7 +68,7 @@ def _si1Attr(x, context):
@register_attribute_builder("SI8Attr")
-def _i8Attr(x, context):
+def _si8Attr(x, context):
return IntegerAttr.get(IntegerType.get_signed(8, context=context), x)
@@ -93,7 +93,7 @@ def _ui1Attr(x, context):
@register_attribute_builder("UI8Attr")
-def _i8Attr(x, context):
+def _ui8Attr(x, context):
return IntegerAttr.get(IntegerType.get_unsigned(8, context=context), x)
diff --git a/mlir/test/Conversion/FuncToEmitC/func-to-emitc.mlir b/mlir/test/Conversion/FuncToEmitC/func-to-emitc.mlir
index 5c96cf1..5730f7a 100644
--- a/mlir/test/Conversion/FuncToEmitC/func-to-emitc.mlir
+++ b/mlir/test/Conversion/FuncToEmitC/func-to-emitc.mlir
@@ -58,3 +58,19 @@ func.func @call(%arg0: i32) -> i32 {
// CHECK-LABEL: emitc.func private @return_i32(i32) -> i32 attributes {specifiers = ["extern"]}
func.func private @return_i32(%arg0: i32) -> i32
+
+// -----
+
+// CHECK-LABEL: emitc.func private @return_void() attributes {specifiers = ["static"]}
+// CHECK-NEXT: emitc.return
+func.func private @return_void() {
+ return
+}
+
+// CHECK-LABEL: emitc.func @call()
+// CHECK-NEXT: emitc.call @return_void() : () -> ()
+// CHECK-NEXT: emitc.return
+func.func @call() {
+ call @return_void() : () -> ()
+ return
+}
diff --git a/mlir/test/Conversion/IndexToSPRIV/index-to-spirv.mlir b/mlir/test/Conversion/IndexToSPIRV/index-to-spirv.mlir
index 53dc896..53dc896 100644
--- a/mlir/test/Conversion/IndexToSPRIV/index-to-spirv.mlir
+++ b/mlir/test/Conversion/IndexToSPIRV/index-to-spirv.mlir
diff --git a/mlir/test/Conversion/SCFToEmitC/for.mlir b/mlir/test/Conversion/SCFToEmitC/for.mlir
index 7f90310a..7e59eac 100644
--- a/mlir/test/Conversion/SCFToEmitC/for.mlir
+++ b/mlir/test/Conversion/SCFToEmitC/for.mlir
@@ -49,17 +49,13 @@ func.func @for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> (f32, f32)
// CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32
// CHECK-NEXT: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
// CHECK-NEXT: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
-// CHECK-NEXT: %[[VAL_7:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
-// CHECK-NEXT: %[[VAL_8:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
-// CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_7]] : f32
-// CHECK-NEXT: emitc.assign %[[VAL_4]] : f32 to %[[VAL_8]] : f32
+// CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_5]] : f32
+// CHECK-NEXT: emitc.assign %[[VAL_4]] : f32 to %[[VAL_6]] : f32
// CHECK-NEXT: emitc.for %[[VAL_9:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
-// CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_7]], %[[VAL_8]] : f32
-// CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_7]] : f32
-// CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_8]] : f32
+// CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_5]], %[[VAL_6]] : f32
+// CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_5]] : f32
+// CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_6]] : f32
// CHECK-NEXT: }
-// CHECK-NEXT: emitc.assign %[[VAL_7]] : f32 to %[[VAL_5]] : f32
-// CHECK-NEXT: emitc.assign %[[VAL_8]] : f32 to %[[VAL_6]] : f32
// CHECK-NEXT: return %[[VAL_5]], %[[VAL_6]] : f32, f32
// CHECK-NEXT: }
@@ -78,19 +74,15 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32
// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> f32 {
// CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 1.000000e+00 : f32
// CHECK-NEXT: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
-// CHECK-NEXT: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
-// CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_5]] : f32
+// CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_4]] : f32
// CHECK-NEXT: emitc.for %[[VAL_6:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
// CHECK-NEXT: %[[VAL_7:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
-// CHECK-NEXT: %[[VAL_8:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
-// CHECK-NEXT: emitc.assign %[[VAL_5]] : f32 to %[[VAL_8]] : f32
+// CHECK-NEXT: emitc.assign %[[VAL_4]] : f32 to %[[VAL_7]] : f32
// CHECK-NEXT: emitc.for %[[VAL_9:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
-// CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_8]], %[[VAL_8]] : f32
-// CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_8]] : f32
+// CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_7]], %[[VAL_7]] : f32
+// CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_7]] : f32
// CHECK-NEXT: }
-// CHECK-NEXT: emitc.assign %[[VAL_8]] : f32 to %[[VAL_7]] : f32
-// CHECK-NEXT: emitc.assign %[[VAL_7]] : f32 to %[[VAL_5]] : f32
+// CHECK-NEXT: emitc.assign %[[VAL_7]] : f32 to %[[VAL_4]] : f32
// CHECK-NEXT: }
-// CHECK-NEXT: emitc.assign %[[VAL_5]] : f32 to %[[VAL_4]] : f32
// CHECK-NEXT: return %[[VAL_4]] : f32
// CHECK-NEXT: }
diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir
index 60f0ab4..e00b769 100644
--- a/mlir/test/Dialect/Arith/int-range-interface.mlir
+++ b/mlir/test/Dialect/Arith/int-range-interface.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-int-range-inference -canonicalize %s | FileCheck %s
+// RUN: mlir-opt -int-range-optimizations -canonicalize %s | FileCheck %s
// CHECK-LABEL: func @add_min_max
// CHECK: %[[c3:.*]] = arith.constant 3 : index
diff --git a/mlir/test/Dialect/Arith/int-range-opts.mlir b/mlir/test/Dialect/Arith/int-range-opts.mlir
index dd62a48..ea5969a 100644
--- a/mlir/test/Dialect/Arith/int-range-opts.mlir
+++ b/mlir/test/Dialect/Arith/int-range-opts.mlir
@@ -96,3 +96,39 @@ func.func @test() -> i8 {
return %1: i8
}
+// -----
+
+// CHECK-LABEL: func @trivial_rem
+// CHECK: [[val:%.+]] = test.with_bounds
+// CHECK: return [[val]]
+func.func @trivial_rem() -> i8 {
+ %c64 = arith.constant 64 : i8
+ %val = test.with_bounds { umin = 0 : ui8, umax = 63 : ui8, smin = 0 : si8, smax = 63 : si8 } : i8
+ %mod = arith.remsi %val, %c64 : i8
+ return %mod : i8
+}
+
+// -----
+
+// CHECK-LABEL: func @non_const_rhs
+// CHECK: [[mod:%.+]] = arith.remui
+// CHECK: return [[mod]]
+func.func @non_const_rhs() -> i8 {
+ %c64 = arith.constant 64 : i8
+ %val = test.with_bounds { umin = 0 : ui8, umax = 2 : ui8, smin = 0 : si8, smax = 2 : si8 } : i8
+ %rhs = test.with_bounds { umin = 63 : ui8, umax = 64 : ui8, smin = 63 : si8, smax = 64 : si8 } : i8
+ %mod = arith.remui %val, %rhs : i8
+ return %mod : i8
+}
+
+// -----
+
+// CHECK-LABEL: func @wraps
+// CHECK: [[mod:%.+]] = arith.remsi
+// CHECK: return [[mod]]
+func.func @wraps() -> i8 {
+ %c64 = arith.constant 64 : i8
+ %val = test.with_bounds { umin = 63 : ui8, umax = 65 : ui8, smin = 63 : si8, smax = 65 : si8 } : i8
+ %mod = arith.remsi %val, %c64 : i8
+ return %mod : i8
+}
diff --git a/mlir/test/Dialect/ArmSME/enable-arm-streaming-invalid.mlir b/mlir/test/Dialect/ArmSME/enable-arm-streaming-invalid.mlir
new file mode 100644
index 0000000..da70b63
--- /dev/null
+++ b/mlir/test/Dialect/ArmSME/enable-arm-streaming-invalid.mlir
@@ -0,0 +1,4 @@
+// RUN: mlir-opt %s -enable-arm-streaming="if-contains-scalable-vectors if-required-by-ops" -verify-diagnostics
+
+// expected-error@below {{enable-arm-streaming: `if-required-by-ops` and `if-contains-scalable-vectors` are mutually exclusive}}
+func.func @test() { return }
diff --git a/mlir/test/Dialect/ArmSME/enable-arm-streaming.mlir b/mlir/test/Dialect/ArmSME/enable-arm-streaming.mlir
index 6b58d8f..2011802 100644
--- a/mlir/test/Dialect/ArmSME/enable-arm-streaming.mlir
+++ b/mlir/test/Dialect/ArmSME/enable-arm-streaming.mlir
@@ -2,7 +2,8 @@
// RUN: mlir-opt %s -enable-arm-streaming=streaming-mode=streaming-locally -verify-diagnostics | FileCheck %s -check-prefix=CHECK-LOCALLY
// RUN: mlir-opt %s -enable-arm-streaming=streaming-mode=streaming-compatible -verify-diagnostics | FileCheck %s -check-prefix=CHECK-COMPATIBLE
// RUN: mlir-opt %s -enable-arm-streaming=za-mode=new-za -verify-diagnostics | FileCheck %s -check-prefix=CHECK-ENABLE-ZA
-// RUN: mlir-opt %s -enable-arm-streaming=only-if-required-by-ops -verify-diagnostics | FileCheck %s -check-prefix=IF-REQUIRED
+// RUN: mlir-opt %s -enable-arm-streaming=if-required-by-ops -verify-diagnostics | FileCheck %s -check-prefix=IF-REQUIRED
+// RUN: mlir-opt %s -enable-arm-streaming=if-contains-scalable-vectors -verify-diagnostics | FileCheck %s -check-prefix=IF-SCALABLE
// CHECK-LABEL: @arm_streaming
// CHECK-SAME: attributes {arm_streaming}
@@ -38,3 +39,17 @@ func.func @requires_arm_streaming() {
// IF-REQUIRED: @does_not_require_arm_streaming
// IF-REQUIRED-NOT: arm_streaming
func.func @does_not_require_arm_streaming() { return }
+
+// IF-SCALABLE-LABEL: @contains_scalable_vectors
+// IF-SCALABLE-SAME: attributes {arm_streaming}
+func.func @contains_scalable_vectors(%vec: vector<[4]xf32>) -> vector<[4]xf32> {
+ %0 = arith.addf %vec, %vec : vector<[4]xf32>
+ return %0 : vector<[4]xf32>
+}
+
+// IF-SCALABLE-LABEL: @no_scalable_vectors
+// IF-SCALABLE-NOT: arm_streaming
+func.func @no_scalable_vectors(%vec: vector<4xf32>) -> vector<4xf32> {
+ %0 = arith.addf %vec, %vec : vector<4xf32>
+ return %0 : vector<4xf32>
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-callop-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-callop-interface.mlir
index 6394715..a77442d 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-callop-interface.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-callop-interface.mlir
@@ -131,3 +131,22 @@ func.func @g(%arg0: memref<f32>) -> memref<f32> {
// CHECK-DYNAMIC-LABEL: func private @f
// CHECK-DYNAMIC-SAME: (memref<f32>) -> memref<f32>
// CHECK-DYNAMIC: call @f({{.*}}) : (memref<f32>) -> memref<f32>
+
+// -----
+
+func.func @func_call_indirect(%m: memref<?xf32>, %f: (memref<?xf32>) -> (memref<?xf32>)) {
+ %0 = func.call_indirect %f(%m) : (memref<?xf32>) -> (memref<?xf32>)
+ return
+}
+
+// CHECK-LABEL: func @func_call_indirect(
+// CHECK: %[[true:.*]] = arith.constant true
+// CHECK: %[[call:.*]] = call_indirect {{.*}} : (memref<?xf32>) -> memref<?xf32>
+// CHECK: %[[base_call:.*]], %{{.*}}, %{{.*}}, %{{.*}} = memref.extract_strided_metadata %[[call]]
+// CHECK: bufferization.dealloc (%[[base_call]] : {{.*}}) if (%[[true]])
+
+// CHECK-DYNAMIC-LABEL: func @func_call_indirect(
+// CHECK-DYNAMIC: %[[true:.*]] = arith.constant true
+// CHECK-DYNAMIC: %[[call:.*]] = call_indirect {{.*}} : (memref<?xf32>) -> memref<?xf32>
+// CHECK-DYNAMIC: %[[base_call:.*]], %{{.*}}, %{{.*}}, %{{.*}} = memref.extract_strided_metadata %[[call]]
+// CHECK-DYNAMIC: bufferization.dealloc (%[[base_call]] : {{.*}}) if (%[[true]])
diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index 980f7e5..a0917a2f 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-int-range-inference -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -int-range-optimizations -split-input-file %s | FileCheck %s
// CHECK-LABEL: func @launch_func
func.func @launch_func(%arg0 : index) {
diff --git a/mlir/test/Dialect/Index/int-range-inference.mlir b/mlir/test/Dialect/Index/int-range-inference.mlir
index 2784d5f..951624d 100644
--- a/mlir/test/Dialect/Index/int-range-inference.mlir
+++ b/mlir/test/Dialect/Index/int-range-inference.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-int-range-inference -canonicalize %s | FileCheck %s
+// RUN: mlir-opt -int-range-optimizations -canonicalize %s | FileCheck %s
// Most operations are covered by the `arith` tests, which use the same code
// Here, we add a few tests to ensure the "index can be 32- or 64-bit" handling
diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir
index 3fa696e..eb0dc01 100644
--- a/mlir/test/Dialect/SparseTensor/invalid.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid.mlir
@@ -1025,6 +1025,7 @@ func.func @sparse_print(%arg0: tensor<10x10xf64>) {
func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 2>) {
// expected-error@+1 {{'sparse_tensor.extract_iteration_space' expect larger level upper bound than lower bound}}
%l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 to 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 2>
+ -> !sparse_tensor.iter_space<#COO, lvls = 0 to 2>
return
}
@@ -1040,6 +1041,7 @@ func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse
func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) {
// expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}}
%l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0>
+ -> !sparse_tensor.iter_space<#COO, lvls = 1>
return
}
@@ -1054,7 +1056,7 @@ func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse
func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>) {
// expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}}
- %l1 = sparse_tensor.extract_iteration_space %sp lvls = 1 : tensor<4x8xf32, #COO>
+ %l1 = sparse_tensor.extract_iteration_space %sp lvls = 1 : tensor<4x8xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 1>
return
}
@@ -1077,6 +1079,7 @@ func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>) {
func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#CSR, lvls = 0>) {
// expected-error@+1 {{'sparse_tensor.extract_iteration_space' op mismatch in parent iterator encoding and iteration space encoding.}}
%l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#CSR, lvls = 0>
+ -> !sparse_tensor.iter_space<#COO, lvls = 1>
return
}
@@ -1092,5 +1095,63 @@ func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse
func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) {
// expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be used to extract an iteration space from a consecutive level.}}
%l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0>
+ -> !sparse_tensor.iter_space<#COO, lvls = 2>
return
}
+
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -> index {
+ %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 0>
+ // expected-error @+1 {{'sparse_tensor.iterate' op different number of region iter_args and yielded values: 2 != 1}}
+ %r1, %r2 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%si = %i, %sj = %j): !sparse_tensor.iter_space<#COO, lvls = 0> -> (index, index) {
+ sparse_tensor.yield %si : index
+ }
+ return %r1 : index
+}
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+// expected-note@+1 {{prior use here}}
+func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index) -> f32 {
+ %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 0>
+ // expected-error @+1 {{use of value '%i' expects different type than prior uses: 'f32' vs 'index'}}
+ %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%outer = %i): !sparse_tensor.iter_space<#COO, lvls = 0> -> f32 {
+ sparse_tensor.yield %outer : f32
+ }
+ return %r1 : f32
+}
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -> index {
+ %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 0>
+ // expected-error @+1 {{'sparse_tensor.iterate' op 0-th region iter_arg and 0-th yielded value have different type: 'index' != 'f32'}}
+ %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%si = %i): !sparse_tensor.iter_space<#COO, lvls = 0> -> index {
+ %y = arith.constant 1.0 : f32
+ sparse_tensor.yield %y : f32
+ }
+ return %r1 : index
+}
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
index d340712..bce0b41a 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
@@ -758,8 +758,37 @@ func.func @sparse_has_runtime() -> i1 {
func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>)
-> (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>) {
// Extracting the iteration space for the first level needs no parent iterator.
- %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO>
+ %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 0>
// Extracting the iteration space for the second level needs a parent iterator.
%l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0>
+ -> !sparse_tensor.iter_space<#COO, lvls = 1>
return %l1, %l2 : !sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>
}
+
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+// CHECK-LABEL: func.func @sparse_iterate(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32, #sparse{{[0-9]*}}>,
+// CHECK-SAME: %[[VAL_1:.*]]: index,
+// CHECK-SAME: %[[VAL_2:.*]]: index) -> index {
+// CHECK: %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] lvls = 0 : tensor<4x8xf32, #sparse{{[0-9]*}}>
+// CHECK: %[[VAL_4:.*]] = sparse_tensor.iterate %[[VAL_5:.*]] in %[[VAL_3]] at(%[[VAL_6:.*]]) iter_args(%[[VAL_7:.*]] = %[[VAL_1]]) : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0> -> (index) {
+// CHECK: sparse_tensor.yield %[[VAL_7]] : index
+// CHECK: }
+// CHECK: return %[[VAL_4]] : index
+// CHECK: }
+func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -> index {
+ %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 0>
+ %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%outer = %i): !sparse_tensor.iter_space<#COO, lvls = 0 to 1> -> index {
+ sparse_tensor.yield %outer : index
+ }
+ return %r1 : index
+}
diff --git a/mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir b/mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir
new file mode 100644
index 0000000..f70fab3
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
+
+#CSR = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : dense,
+ j : compressed
+ )
+}>
+
+// Make sure that pure instructions are hoisted outside the loop.
+//
+// CHECK: sparse_tensor.values
+// CHECK: sparse_tensor.positions
+// CHECK: sparse_tensor.coordinate
+// CHECK: sparse_tensor.iterate
+func.func @sparse_iterate(%sp : tensor<?x?xf64, #CSR>) {
+ %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<?x?xf64, #CSR>
+ -> !sparse_tensor.iter_space<#CSR, lvls = 0>
+ sparse_tensor.iterate %it1 in %l1 at (%crd) : !sparse_tensor.iter_space<#CSR, lvls = 0> {
+ %0 = sparse_tensor.values %sp : tensor<?x?xf64, #CSR> to memref<?xf64>
+ %1 = sparse_tensor.positions %sp { level = 1 : index } : tensor<?x?xf64, #CSR> to memref<?xindex>
+ %2 = sparse_tensor.coordinates %sp { level = 1 : index } : tensor<?x?xf64, #CSR> to memref<?xindex>
+ "test.op"(%0, %1, %2) : (memref<?xf64>, memref<?xindex>, memref<?xindex>) -> ()
+ }
+
+ return
+}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir
index 10ffed2..aabd9d2 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir
@@ -4,7 +4,7 @@
// RUN: -arm-sme-vector-legalization -canonicalize -cse \
// RUN: -convert-vector-to-arm-sme -arm-sme-outer-product-fusion \
// RUN: -allocate-arm-sme-tiles -convert-arm-sme-to-scf \
-// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \
+// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za if-required-by-ops" \
// RUN: -convert-vector-to-scf=full-unroll -convert-arm-sme-to-llvm \
// RUN: -test-lower-to-llvm | \
// RUN: %mcr_aarch64_cmd \
diff --git a/mlir/test/Interfaces/InferIntRangeInterface/infer-int-range-test-ops.mlir b/mlir/test/Interfaces/InferIntRangeInterface/infer-int-range-test-ops.mlir
index 2106eee..1ec3441 100644
--- a/mlir/test/Interfaces/InferIntRangeInterface/infer-int-range-test-ops.mlir
+++ b/mlir/test/Interfaces/InferIntRangeInterface/infer-int-range-test-ops.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-int-range-inference %s | FileCheck %s
+// RUN: mlir-opt -int-range-optimizations %s | FileCheck %s
// CHECK-LABEL: func @constant
// CHECK: %[[cst:.*]] = "test.constant"() <{value = 3 : index}
@@ -103,13 +103,11 @@ func.func @func_args_unbound(%arg0 : index) -> index {
// CHECK-LABEL: func @propagate_across_while_loop_false()
func.func @propagate_across_while_loop_false() -> index {
- // CHECK-DAG: %[[C0:.*]] = "test.constant"() <{value = 0
- // CHECK-DAG: %[[C1:.*]] = "test.constant"() <{value = 1
+ // CHECK: %[[C1:.*]] = "test.constant"() <{value = 1
%0 = test.with_bounds { umin = 0 : index, umax = 0 : index,
smin = 0 : index, smax = 0 : index } : index
%1 = scf.while : () -> index {
%false = arith.constant false
- // CHECK: scf.condition(%{{.*}}) %[[C0]]
scf.condition(%false) %0 : index
} do {
^bb0(%i1: index):
@@ -122,12 +120,10 @@ func.func @propagate_across_while_loop_false() -> index {
// CHECK-LABEL: func @propagate_across_while_loop
func.func @propagate_across_while_loop(%arg0 : i1) -> index {
- // CHECK-DAG: %[[C0:.*]] = "test.constant"() <{value = 0
- // CHECK-DAG: %[[C1:.*]] = "test.constant"() <{value = 1
+ // CHECK: %[[C1:.*]] = "test.constant"() <{value = 1
%0 = test.with_bounds { umin = 0 : index, umax = 0 : index,
smin = 0 : index, smax = 0 : index } : index
%1 = scf.while : () -> index {
- // CHECK: scf.condition(%{{.*}}) %[[C0]]
scf.condition(%arg0) %0 : index
} do {
^bb0(%i1: index):
diff --git a/mlir/test/Target/Cpp/for.mlir b/mlir/test/Target/Cpp/for.mlir
index 60988bc..af1d829 100644
--- a/mlir/test/Target/Cpp/for.mlir
+++ b/mlir/test/Target/Cpp/for.mlir
@@ -40,8 +40,6 @@ func.func @test_for_yield() {
%s0 = "emitc.constant"() <{value = 0 : i32}> : () -> i32
%p0 = "emitc.constant"() <{value = 1.0 : f32}> : () -> f32
- %0 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> i32
- %1 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
%2 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> i32
%3 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
emitc.assign %s0 : i32 to %2 : i32
@@ -53,8 +51,6 @@ func.func @test_for_yield() {
emitc.assign %pn : f32 to %3 : f32
emitc.yield
}
- emitc.assign %2 : i32 to %0 : i32
- emitc.assign %3 : f32 to %1 : f32
return
}
@@ -64,8 +60,6 @@ func.func @test_for_yield() {
// CPP-DEFAULT-NEXT: size_t [[STEP:[^ ]*]] = 1;
// CPP-DEFAULT-NEXT: int32_t [[S0:[^ ]*]] = 0;
// CPP-DEFAULT-NEXT: float [[P0:[^ ]*]] = 1.000000000e+00f;
-// CPP-DEFAULT-NEXT: int32_t [[SE:[^ ]*]];
-// CPP-DEFAULT-NEXT: float [[PE:[^ ]*]];
// CPP-DEFAULT-NEXT: int32_t [[SI:[^ ]*]];
// CPP-DEFAULT-NEXT: float [[PI:[^ ]*]];
// CPP-DEFAULT-NEXT: [[SI:[^ ]*]] = [[S0]];
@@ -76,8 +70,6 @@ func.func @test_for_yield() {
// CPP-DEFAULT-NEXT: [[SI]] = [[SN]];
// CPP-DEFAULT-NEXT: [[PI]] = [[PN]];
// CPP-DEFAULT-NEXT: }
-// CPP-DEFAULT-NEXT: [[SE]] = [[SI]];
-// CPP-DEFAULT-NEXT: [[PE]] = [[PI]];
// CPP-DEFAULT-NEXT: return;
// CPP-DECLTOP: void test_for_yield() {
@@ -86,8 +78,6 @@ func.func @test_for_yield() {
// CPP-DECLTOP-NEXT: size_t [[STEP:[^ ]*]];
// CPP-DECLTOP-NEXT: int32_t [[S0:[^ ]*]];
// CPP-DECLTOP-NEXT: float [[P0:[^ ]*]];
-// CPP-DECLTOP-NEXT: int32_t [[SE:[^ ]*]];
-// CPP-DECLTOP-NEXT: float [[PE:[^ ]*]];
// CPP-DECLTOP-NEXT: int32_t [[SI:[^ ]*]];
// CPP-DECLTOP-NEXT: float [[PI:[^ ]*]];
// CPP-DECLTOP-NEXT: int32_t [[SN:[^ ]*]];
@@ -99,8 +89,6 @@ func.func @test_for_yield() {
// CPP-DECLTOP-NEXT: [[P0]] = 1.000000000e+00f;
// CPP-DECLTOP-NEXT: ;
// CPP-DECLTOP-NEXT: ;
-// CPP-DECLTOP-NEXT: ;
-// CPP-DECLTOP-NEXT: ;
// CPP-DECLTOP-NEXT: [[SI:[^ ]*]] = [[S0]];
// CPP-DECLTOP-NEXT: [[PI:[^ ]*]] = [[P0]];
// CPP-DECLTOP-NEXT: for (size_t [[ITER:[^ ]*]] = [[START]]; [[ITER]] < [[STOP]]; [[ITER]] += [[STEP]]) {
@@ -109,8 +97,6 @@ func.func @test_for_yield() {
// CPP-DECLTOP-NEXT: [[SI]] = [[SN]];
// CPP-DECLTOP-NEXT: [[PI]] = [[PN]];
// CPP-DECLTOP-NEXT: }
-// CPP-DECLTOP-NEXT: [[SE]] = [[SI]];
-// CPP-DECLTOP-NEXT: [[PE]] = [[PI]];
// CPP-DECLTOP-NEXT: return;
func.func @test_for_yield_2() {
@@ -121,8 +107,6 @@ func.func @test_for_yield_2() {
%s0 = emitc.literal "0" : i32
%p0 = emitc.literal "M_PI" : f32
- %0 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> i32
- %1 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
%2 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> i32
%3 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
emitc.assign %s0 : i32 to %2 : i32
@@ -134,8 +118,6 @@ func.func @test_for_yield_2() {
emitc.assign %pn : f32 to %3 : f32
emitc.yield
}
- emitc.assign %2 : i32 to %0 : i32
- emitc.assign %3 : f32 to %1 : f32
return
}
diff --git a/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp b/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp
index d3dabaf..a220791 100644
--- a/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp
+++ b/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp
@@ -74,7 +74,7 @@ void buildTestLowerToArmSME(OpPassManager &pm,
// Enable streaming-mode and ZA.
pm.addPass(arm_sme::createEnableArmStreamingPass(
arm_sme::ArmStreamingMode::StreamingLocally, arm_sme::ArmZaMode::NewZA,
- /*onlyIfRequiredByOps=*/true));
+ /*ifRequiredByOps=*/true));
// Convert SCF to CF (required for ArmSME tile allocation).
pm.addPass(createConvertSCFToCFPass());
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 975a41a..66b1faf 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -24,7 +24,6 @@ add_mlir_library(MLIRTestTransforms
TestConstantFold.cpp
TestControlFlowSink.cpp
TestInlining.cpp
- TestIntRangeInference.cpp
TestMakeIsolatedFromAbove.cpp
${MLIRTestTransformsPDLSrc}
diff --git a/mlir/test/lib/Transforms/TestIntRangeInference.cpp b/mlir/test/lib/Transforms/TestIntRangeInference.cpp
deleted file mode 100644
index 5758f6a..0000000
--- a/mlir/test/lib/Transforms/TestIntRangeInference.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-//===- TestIntRangeInference.cpp - Create consts from range inference ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// TODO: This pass is needed to test integer range inference until that
-// functionality has been integrated into SCCP.
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
-#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
-#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
-#include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassRegistry.h"
-#include "mlir/Support/TypeID.h"
-#include "mlir/Transforms/FoldUtils.h"
-#include <optional>
-
-using namespace mlir;
-using namespace mlir::dataflow;
-
-/// Patterned after SCCP
-static LogicalResult replaceWithConstant(DataFlowSolver &solver, OpBuilder &b,
- OperationFolder &folder, Value value) {
- auto *maybeInferredRange =
- solver.lookupState<IntegerValueRangeLattice>(value);
- if (!maybeInferredRange || maybeInferredRange->getValue().isUninitialized())
- return failure();
- const ConstantIntRanges &inferredRange =
- maybeInferredRange->getValue().getValue();
- std::optional<APInt> maybeConstValue = inferredRange.getConstantValue();
- if (!maybeConstValue.has_value())
- return failure();
-
- Operation *maybeDefiningOp = value.getDefiningOp();
- Dialect *valueDialect =
- maybeDefiningOp ? maybeDefiningOp->getDialect()
- : value.getParentRegion()->getParentOp()->getDialect();
- Attribute constAttr = b.getIntegerAttr(value.getType(), *maybeConstValue);
- Value constant = folder.getOrCreateConstant(
- b.getInsertionBlock(), valueDialect, constAttr, value.getType());
- if (!constant)
- return failure();
-
- value.replaceAllUsesWith(constant);
- return success();
-}
-
-static void rewrite(DataFlowSolver &solver, MLIRContext *context,
- MutableArrayRef<Region> initialRegions) {
- SmallVector<Block *> worklist;
- auto addToWorklist = [&](MutableArrayRef<Region> regions) {
- for (Region &region : regions)
- for (Block &block : llvm::reverse(region))
- worklist.push_back(&block);
- };
-
- OpBuilder builder(context);
- OperationFolder folder(context);
-
- addToWorklist(initialRegions);
- while (!worklist.empty()) {
- Block *block = worklist.pop_back_val();
-
- for (Operation &op : llvm::make_early_inc_range(*block)) {
- builder.setInsertionPoint(&op);
-
- // Replace any result with constants.
- bool replacedAll = op.getNumResults() != 0;
- for (Value res : op.getResults())
- replacedAll &=
- succeeded(replaceWithConstant(solver, builder, folder, res));
-
- // If all of the results of the operation were replaced, try to erase
- // the operation completely.
- if (replacedAll && wouldOpBeTriviallyDead(&op)) {
- assert(op.use_empty() && "expected all uses to be replaced");
- op.erase();
- continue;
- }
-
- // Add any the regions of this operation to the worklist.
- addToWorklist(op.getRegions());
- }
-
- // Replace any block arguments with constants.
- builder.setInsertionPointToStart(block);
- for (BlockArgument arg : block->getArguments())
- (void)replaceWithConstant(solver, builder, folder, arg);
- }
-}
-
-namespace {
-struct TestIntRangeInference
- : PassWrapper<TestIntRangeInference, OperationPass<>> {
- MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestIntRangeInference)
-
- StringRef getArgument() const final { return "test-int-range-inference"; }
- StringRef getDescription() const final {
- return "Test integer range inference analysis";
- }
-
- void runOnOperation() override {
- Operation *op = getOperation();
- DataFlowSolver solver;
- solver.load<DeadCodeAnalysis>();
- solver.load<SparseConstantPropagation>();
- solver.load<IntegerRangeAnalysis>();
- if (failed(solver.initializeAndRun(op)))
- return signalPassFailure();
- rewrite(solver, op->getContext(), op->getRegions());
- }
-};
-} // end anonymous namespace
-
-namespace mlir {
-namespace test {
-void registerTestIntRangeInference() {
- PassRegistration<TestIntRangeInference>();
-}
-} // end namespace test
-} // end namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 0e8b161..d0de74d 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -32,21 +32,21 @@ using namespace mlir;
// Defined in the test directory, no public header.
namespace mlir {
-void registerConvertToTargetEnvPass();
void registerCloneTestPasses();
+void registerConvertToTargetEnvPass();
void registerLazyLoadingTestPasses();
+void registerLoopLikeInterfaceTestPasses();
void registerPassManagerTestPass();
void registerPrintSpirvAvailabilityPass();
-void registerLoopLikeInterfaceTestPasses();
+void registerRegionTestPasses();
void registerShapeFunctionTestPasses();
void registerSideEffectTestPasses();
void registerSliceAnalysisTestPass();
void registerSymbolTestPasses();
-void registerRegionTestPasses();
-void registerTestAffineDataCopyPass();
void registerTestAffineAccessAnalysisPass();
-void registerTestAffineReifyValueBoundsPass();
+void registerTestAffineDataCopyPass();
void registerTestAffineLoopUnswitchingPass();
+void registerTestAffineReifyValueBoundsPass();
void registerTestAffineWalk();
void registerTestBytecodeRoundtripPasses();
void registerTestDecomposeAffineOpPass();
@@ -56,10 +56,10 @@ void registerTestGpuMemoryPromotionPass();
void registerTestLoopPermutationPass();
void registerTestMatchers();
void registerTestOperationEqualPass();
+void registerTestPreserveUseListOrders();
void registerTestPrintDefUsePass();
void registerTestPrintInvalidPass();
void registerTestPrintNestingPass();
-void registerTestPreserveUseListOrders();
void registerTestReducer();
void registerTestSpirvEntryPointABIPass();
void registerTestSpirvModuleCombinerPass();
@@ -68,7 +68,6 @@ void registerTosaTestQuantUtilAPIPass();
void registerVectorizerTestPass();
namespace test {
-void registerTestCompositePass();
void registerCommutativityUtils();
void registerConvertCallOpPass();
void registerConvertFuncOpPass();
@@ -77,12 +76,15 @@ void registerMemRefBoundCheck();
void registerPatternsTestPass();
void registerSimpleParametricTilingPass();
void registerTestAffineLoopParametricTilingPass();
-void registerTestArithEmulateWideIntPass();
void registerTestAliasAnalysisPass();
+void registerTestArithEmulateWideIntPass();
void registerTestBuiltinAttributeInterfaces();
void registerTestBuiltinDistinctAttributes();
void registerTestCallGraphPass();
void registerTestCfAssertPass();
+void registerTestCFGLoopInfoPass();
+void registerTestComposeSubView();
+void registerTestCompositePass();
void registerTestConstantFold();
void registerTestControlFlowSink();
void registerTestDataLayoutPropagation();
@@ -97,10 +99,10 @@ void registerTestExpandMathPass();
void registerTestFooAnalysisPass();
void registerTestComposeSubView();
void registerTestMultiBuffering();
-void registerTestIntRangeInference();
void registerTestIRVisitorsPass();
void registerTestGenericIRVisitorsPass();
void registerTestInterfaces();
+void registerTestIRVisitorsPass();
void registerTestLastModifiedPass();
void registerTestLinalgDecomposeOps();
void registerTestLinalgDropUnitDims();
@@ -110,7 +112,6 @@ void registerTestLinalgTransforms();
void registerTestLivenessAnalysisPass();
void registerTestLivenessPass();
void registerTestLoopFusion();
-void registerTestCFGLoopInfoPass();
void registerTestLoopMappingPass();
void registerTestLoopUnrollingPass();
void registerTestLowerToArmNeon();
@@ -123,12 +124,14 @@ void registerTestMathPolynomialApproximationPass();
void registerTestMathToVCIXPass();
void registerTestMemRefDependenceCheck();
void registerTestMemRefStrideCalculation();
-void registerTestMeshSimplificationsPass();
void registerTestMeshReshardingSpmdizationPass();
-void registerTestOpLoweringPasses();
+void registerTestMeshSimplificationsPass();
+void registerTestMultiBuffering();
void registerTestNextAccessPass();
+void registerTestNVGPULowerings();
void registerTestOneToNTypeConversionPass();
void registerTestOpaqueLoc();
+void registerTestOpLoweringPasses();
void registerTestPadFusion();
void registerTestRecursiveTypesPass();
void registerTestSCFUpliftWhileToFor();
@@ -141,10 +144,9 @@ void registerTestTensorCopyInsertionPass();
void registerTestTensorTransforms();
void registerTestTopologicalSortAnalysisPass();
void registerTestTransformDialectEraseSchedulePass();
-void registerTestWrittenToPass();
void registerTestVectorLowerings();
void registerTestVectorReductionToSPIRVDotProd();
-void registerTestNVGPULowerings();
+void registerTestWrittenToPass();
#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
void registerTestDialectConversionPasses();
void registerTestPDLByteCodePass();
@@ -164,17 +166,17 @@ void registerTestTransformDialectExtension(DialectRegistry &);
void registerTestPasses() {
registerCloneTestPasses();
registerConvertToTargetEnvPass();
- registerPassManagerTestPass();
- registerPrintSpirvAvailabilityPass();
registerLazyLoadingTestPasses();
registerLoopLikeInterfaceTestPasses();
+ registerPassManagerTestPass();
+ registerPrintSpirvAvailabilityPass();
+ registerRegionTestPasses();
registerShapeFunctionTestPasses();
registerSideEffectTestPasses();
registerSliceAnalysisTestPass();
registerSymbolTestPasses();
- registerRegionTestPasses();
- registerTestAffineDataCopyPass();
registerTestAffineAccessAnalysisPass();
+ registerTestAffineDataCopyPass();
registerTestAffineLoopUnswitchingPass();
registerTestAffineReifyValueBoundsPass();
registerTestAffineWalk();
@@ -186,18 +188,17 @@ void registerTestPasses() {
registerTestLoopPermutationPass();
registerTestMatchers();
registerTestOperationEqualPass();
+ registerTestPreserveUseListOrders();
registerTestPrintDefUsePass();
registerTestPrintInvalidPass();
registerTestPrintNestingPass();
- registerTestPreserveUseListOrders();
registerTestReducer();
registerTestSpirvEntryPointABIPass();
registerTestSpirvModuleCombinerPass();
registerTestTraitsPass();
- registerVectorizerTestPass();
registerTosaTestQuantUtilAPIPass();
+ registerVectorizerTestPass();
- mlir::test::registerTestCompositePass();
mlir::test::registerCommutativityUtils();
mlir::test::registerConvertCallOpPass();
mlir::test::registerConvertFuncOpPass();
@@ -212,13 +213,16 @@ void registerTestPasses() {
mlir::test::registerTestBuiltinDistinctAttributes();
mlir::test::registerTestCallGraphPass();
mlir::test::registerTestCfAssertPass();
+ mlir::test::registerTestCFGLoopInfoPass();
+ mlir::test::registerTestComposeSubView();
+ mlir::test::registerTestCompositePass();
mlir::test::registerTestConstantFold();
mlir::test::registerTestControlFlowSink();
- mlir::test::registerTestDiagnosticsPass();
- mlir::test::registerTestDecomposeCallGraphTypes();
mlir::test::registerTestDataLayoutPropagation();
mlir::test::registerTestDataLayoutQuery();
mlir::test::registerTestDeadCodeAnalysisPass();
+ mlir::test::registerTestDecomposeCallGraphTypes();
+ mlir::test::registerTestDiagnosticsPass();
mlir::test::registerTestDominancePass();
mlir::test::registerTestDynamicPipelinePass();
mlir::test::registerTestEmulateNarrowTypePass();
@@ -226,10 +230,10 @@ void registerTestPasses() {
mlir::test::registerTestFooAnalysisPass();
mlir::test::registerTestComposeSubView();
mlir::test::registerTestMultiBuffering();
- mlir::test::registerTestIntRangeInference();
mlir::test::registerTestIRVisitorsPass();
mlir::test::registerTestGenericIRVisitorsPass();
mlir::test::registerTestInterfaces();
+ mlir::test::registerTestIRVisitorsPass();
mlir::test::registerTestLastModifiedPass();
mlir::test::registerTestLinalgDecomposeOps();
mlir::test::registerTestLinalgDropUnitDims();
@@ -239,7 +243,6 @@ void registerTestPasses() {
mlir::test::registerTestLivenessAnalysisPass();
mlir::test::registerTestLivenessPass();
mlir::test::registerTestLoopFusion();
- mlir::test::registerTestCFGLoopInfoPass();
mlir::test::registerTestLoopMappingPass();
mlir::test::registerTestLoopUnrollingPass();
mlir::test::registerTestLowerToArmNeon();
@@ -252,12 +255,14 @@ void registerTestPasses() {
mlir::test::registerTestMathToVCIXPass();
mlir::test::registerTestMemRefDependenceCheck();
mlir::test::registerTestMemRefStrideCalculation();
- mlir::test::registerTestOpLoweringPasses();
- mlir::test::registerTestMeshSimplificationsPass();
mlir::test::registerTestMeshReshardingSpmdizationPass();
+ mlir::test::registerTestMeshSimplificationsPass();
+ mlir::test::registerTestMultiBuffering();
mlir::test::registerTestNextAccessPass();
+ mlir::test::registerTestNVGPULowerings();
mlir::test::registerTestOneToNTypeConversionPass();
mlir::test::registerTestOpaqueLoc();
+ mlir::test::registerTestOpLoweringPasses();
mlir::test::registerTestPadFusion();
mlir::test::registerTestRecursiveTypesPass();
mlir::test::registerTestSCFUpliftWhileToFor();
@@ -272,7 +277,6 @@ void registerTestPasses() {
mlir::test::registerTestTransformDialectEraseSchedulePass();
mlir::test::registerTestVectorLowerings();
mlir::test::registerTestVectorReductionToSPIRVDotProd();
- mlir::test::registerTestNVGPULowerings();
mlir::test::registerTestWrittenToPass();
#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
mlir::test::registerTestDialectConversionPasses();
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/FPUtil/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/FPUtil/BUILD.bazel
index ff3b035..41b85d2 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/FPUtil/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/FPUtil/BUILD.bazel
@@ -28,6 +28,7 @@ libc_test(
deps = [
"//libc:__support_big_int",
"//libc:__support_fputil_dyadic_float",
+ "//libc:__support_macros_properties_types",
"//libc:__support_uint128",
"//libc/test/UnitTest:fp_test_helpers",
"//libc/utils/MPFRWrapper:mpfr_wrapper",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 0fc791e..0254e12 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -2903,7 +2903,9 @@ td_library(
]),
includes = ["include"],
deps = [
+ ":ControlFlowInterfacesTdFiles",
":InferTypeOpInterfaceTdFiles",
+ ":LoopLikeInterfaceTdFiles",
":OpBaseTdFiles",
":SideEffectInterfacesTdFiles",
],
@@ -3091,9 +3093,11 @@ cc_library(
":BufferizationInterfaces",
":BytecodeOpInterface",
":ComplexDialect",
+ ":ControlFlowInterfaces",
":DialectUtils",
":IR",
":InferTypeOpInterface",
+ ":LoopLikeInterface",
":SideEffectInterfaces",
":SparseTensorAttrDefsIncGen",
":SparseTensorEnums",
@@ -12713,14 +12717,15 @@ cc_library(
":ArithDialect",
":ArithPassIncGen",
":ArithUtils",
- ":BufferizationDialect",
":BufferizationInterfaces",
":BufferizationTransforms",
+ ":DialectUtils",
":FuncDialect",
":FuncTransforms",
":IR",
":MemRefDialect",
":Pass",
+ ":SideEffectInterfaces",
":Support",
":TensorDialect",
":TransformUtils",