296 files changed, 10191 insertions, 4186 deletions
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 8960b19..5cbc28f 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -781,11 +781,6 @@ public: uint64_t PseudoProbeLooseMatchedSampleCount{0}; /// the count of call matched samples uint64_t CallMatchedSampleCount{0}; - /// the number of stale functions that have matching number of blocks in - /// the profile - uint64_t NumStaleFuncsWithEqualBlockCount{0}; - /// the number of blocks that have matching size but a differing hash - uint64_t NumStaleBlocksWithEqualIcount{0}; } Stats; // Original binary execution count stats. diff --git a/bolt/include/bolt/Passes/MarkRAStates.h b/bolt/include/bolt/Passes/MarkRAStates.h index 675ab97..202f1dd 100644 --- a/bolt/include/bolt/Passes/MarkRAStates.h +++ b/bolt/include/bolt/Passes/MarkRAStates.h @@ -13,11 +13,16 @@ #define BOLT_PASSES_MARK_RA_STATES #include "bolt/Passes/BinaryPasses.h" +#include <mutex> namespace llvm { namespace bolt { class MarkRAStates : public BinaryFunctionPass { + // setIgnored() is not thread-safe, but the pass is running on functions in + // parallel. + std::mutex IgnoreMutex; + public: explicit MarkRAStates() : BinaryFunctionPass(false) {} diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index e1a1856..1d187de 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1508,12 +1508,6 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { if (NumAllStaleFunctions) { const float PctStale = NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f; - const float PctStaleFuncsWithEqualBlockCount = - (float)BC.Stats.NumStaleFuncsWithEqualBlockCount / - NumAllStaleFunctions * 100.0f; - const float PctStaleBlocksWithEqualIcount = - (float)BC.Stats.NumStaleBlocksWithEqualIcount / - BC.Stats.NumStaleBlocks * 100.0f; auto printErrorOrWarning = [&]() { if (PctStale > opts::StaleThreshold) BC.errs() << "BOLT-ERROR: "; @@ -1536,17 +1530,6 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { << "%) belong to functions with invalid" " (possibly stale) profile.\n"; } - BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount - << " stale function" - << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s") - << format(" (%.1f%% of all stale)", - PctStaleFuncsWithEqualBlockCount) - << " have matching block count.\n"; - BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount - << " stale block" - << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s") - << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount) - << " have matching icount.\n"; if (PctStale > opts::StaleThreshold) { return createFatalBOLTError( Twine("BOLT-ERROR: stale functions exceed specified threshold of ") + diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp index af6a5ca7..b262d66 100644 --- a/bolt/lib/Passes/MarkRAStates.cpp +++ b/bolt/lib/Passes/MarkRAStates.cpp @@ -43,10 +43,11 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) { // Not all functions have .cfi_negate_ra_state in them. But if one does, // we expect psign/pauth instructions to have the hasNegateRAState // annotation. 
- BF.setIgnored(); BC.outs() << "BOLT-INFO: inconsistent RAStates in function " << BF.getPrintName() << ": ptr sign/auth inst without .cfi_negate_ra_state\n"; + std::lock_guard<std::mutex> Lock(IgnoreMutex); + BF.setIgnored(); return false; } } @@ -67,6 +68,7 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) { BC.outs() << "BOLT-INFO: inconsistent RAStates in function " << BF.getPrintName() << ": ptr signing inst encountered in Signed RA state\n"; + std::lock_guard<std::mutex> Lock(IgnoreMutex); BF.setIgnored(); return false; } @@ -80,6 +82,7 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) { << BF.getPrintName() << ": ptr authenticating inst encountered in Unsigned RA " "state\n"; + std::lock_guard<std::mutex> Lock(IgnoreMutex); BF.setIgnored(); return false; } diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 086e47b..f0f87f9 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -350,9 +350,6 @@ bool YAMLProfileReader::parseFunctionProfile( << MismatchedCalls << " calls, and " << MismatchedEdges << " edges in profile did not match function " << BF << '\n'; - if (YamlBF.NumBasicBlocks != BF.size()) - ++BC.Stats.NumStaleFuncsWithEqualBlockCount; - if (!opts::InferStaleProfile) return false; ArrayRef<ProbeMatchSpec> ProbeMatchSpecs; diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index 21455db..c4b47a4 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -247,7 +247,7 @@ ClangTidyOptions ClangTidyOptions::getDefaults() { Options.WarningsAsErrors = ""; Options.HeaderFileExtensions = {"", "h", "hh", "hpp", "hxx"}; Options.ImplementationFileExtensions = {"c", "cc", "cpp", "cxx"}; - Options.HeaderFilterRegex = ""; + Options.HeaderFilterRegex = ".*"; Options.ExcludeHeaderFilterRegex = ""; Options.SystemHeaders = false; Options.FormatStyle = "none"; diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index 64157f5..1ae8756 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -93,7 +93,7 @@ Configuration files: WarningsAsErrors: '' HeaderFileExtensions: ['', 'h','hh','hpp','hxx'] ImplementationFileExtensions: ['c','cc','cpp','cxx'] - HeaderFilterRegex: '' + HeaderFilterRegex: '.*' FormatStyle: none InheritParentConfig: true User: user @@ -132,14 +132,16 @@ file, if any. static cl::opt<std::string> HeaderFilter("header-filter", desc(R"( Regular expression matching the names of the -headers to output diagnostics from. Diagnostics +headers to output diagnostics from. The default +value is '.*', i.e. diagnostics from all non-system +headers are displayed by default. Diagnostics from the main file of each translation unit are always displayed. Can be used together with -line-filter. This option overrides the 'HeaderFilterRegex' option in .clang-tidy file, if any. )"), - cl::init(""), + cl::init(".*"), cl::cat(ClangTidyCategory)); static cl::opt<std::string> ExcludeHeaderFilter("exclude-header-filter", @@ -379,9 +381,9 @@ static void printStats(const ClangTidyStats &Stats) { << " with check filters"; llvm::errs() << ").\n"; if (Stats.ErrorsIgnoredNonUserCode) - llvm::errs() << "Use -header-filter=.* to display errors from all " - "non-system headers. 
Use -system-headers to display " - "errors from system headers as well.\n"; + llvm::errs() << "Use -header-filter=.* or leave it as default to display " + "errors from all non-system headers. Use -system-headers " + "to display errors from system headers as well.\n"; } } diff --git a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp index 086c7f3..b30c83e 100644 --- a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp @@ -21,6 +21,11 @@ FixItHint changeVarDeclToReference(const VarDecl &Var, ASTContext &Context) { SourceLocation AmpLocation = Var.getLocation(); auto Token = utils::lexer::getPreviousToken( AmpLocation, Context.getSourceManager(), Context.getLangOpts()); + + // For parameter packs the '&' must go before the '...' token + if (Token.is(tok::ellipsis)) + return FixItHint::CreateInsertion(Token.getLocation(), "&"); + if (!Token.is(tok::unknown)) AmpLocation = Lexer::getLocForEndOfToken(Token.getLocation(), 0, Context.getSourceManager(), diff --git a/clang-tools-extra/clangd/support/Markup.cpp b/clang-tools-extra/clangd/support/Markup.cpp index 304917d..9ba993a 100644 --- a/clang-tools-extra/clangd/support/Markup.cpp +++ b/clang-tools-extra/clangd/support/Markup.cpp @@ -475,31 +475,61 @@ std::string Block::asPlainText() const { return llvm::StringRef(OS.str()).trim().str(); } +void Paragraph::renderNewlinesMarkdown(llvm::raw_ostream &OS, + llvm::StringRef ParagraphText) const { + llvm::StringRef Line, Rest; + + for (std::tie(Line, Rest) = ParagraphText.ltrim("\n").rtrim().split('\n'); + !(Line.empty() && Rest.empty()); + std::tie(Line, Rest) = Rest.split('\n')) { + + if (Line.empty()) { + // Blank lines are preserved in markdown. + OS << '\n'; + continue; + } + + OS << Line; + + if (!Rest.empty() && isHardLineBreakAfter(Line, Rest, /*IsMarkdown=*/true)) + // In markdown, 2 spaces before a line break forces a line break. + OS << " "; + OS << '\n'; + } +} + void Paragraph::renderEscapedMarkdown(llvm::raw_ostream &OS) const { bool NeedsSpace = false; bool HasChunks = false; + std::string ParagraphText; + ParagraphText.reserve(EstimatedStringSize); + llvm::raw_string_ostream ParagraphTextOS(ParagraphText); for (auto &C : Chunks) { if (C.SpaceBefore || NeedsSpace) - OS << " "; + ParagraphTextOS << " "; switch (C.Kind) { case ChunkKind::PlainText: - OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/true); + ParagraphTextOS << renderText(C.Contents, !HasChunks, + /*EscapeMarkdown=*/true); break; case ChunkKind::InlineCode: - OS << renderInlineBlock(C.Contents); + ParagraphTextOS << renderInlineBlock(C.Contents); break; case ChunkKind::Bold: - OS << renderText("**" + C.Contents + "**", !HasChunks, - /*EscapeMarkdown=*/true); + ParagraphTextOS << renderText("**" + C.Contents + "**", !HasChunks, + /*EscapeMarkdown=*/true); break; case ChunkKind::Emphasized: - OS << renderText("*" + C.Contents + "*", !HasChunks, - /*EscapeMarkdown=*/true); + ParagraphTextOS << renderText("*" + C.Contents + "*", !HasChunks, + /*EscapeMarkdown=*/true); break; } HasChunks = true; NeedsSpace = C.SpaceAfter; } + + renderNewlinesMarkdown(OS, ParagraphText); + // A paragraph in markdown is separated by a blank line. 
OS << "\n\n"; } @@ -507,28 +537,39 @@ void Paragraph::renderEscapedMarkdown(llvm::raw_ostream &OS) const { void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const { bool NeedsSpace = false; bool HasChunks = false; + std::string ParagraphText; + ParagraphText.reserve(EstimatedStringSize); + llvm::raw_string_ostream ParagraphTextOS(ParagraphText); for (auto &C : Chunks) { if (C.SpaceBefore || NeedsSpace) - OS << " "; + ParagraphTextOS << " "; switch (C.Kind) { case ChunkKind::PlainText: - OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false); + ParagraphTextOS << renderText(C.Contents, !HasChunks, + /*EscapeMarkdown=*/false); break; case ChunkKind::InlineCode: - OS << renderInlineBlock(C.Contents); + ParagraphTextOS << renderInlineBlock(C.Contents); break; case ChunkKind::Bold: - OS << "**" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false) - << "**"; + ParagraphTextOS << "**" + << renderText(C.Contents, !HasChunks, + /*EscapeMarkdown=*/false) + << "**"; break; case ChunkKind::Emphasized: - OS << "*" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false) - << "*"; + ParagraphTextOS << "*" + << renderText(C.Contents, !HasChunks, + /*EscapeMarkdown=*/false) + << "*"; break; } HasChunks = true; NeedsSpace = C.SpaceAfter; } + + renderNewlinesMarkdown(OS, ParagraphText); + // A paragraph in markdown is separated by a blank line. OS << "\n\n"; } @@ -537,8 +578,6 @@ std::unique_ptr<Block> Paragraph::clone() const { return std::make_unique<Paragraph>(*this); } -/// Choose a marker to delimit `Text` from a prioritized list of options. -/// This is more readable than escaping for plain-text. llvm::StringRef Paragraph::chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, llvm::StringRef Text) const { // Prefer a delimiter whose characters don't appear in the text. @@ -548,23 +587,36 @@ llvm::StringRef Paragraph::chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, return Options.front(); } -bool Paragraph::punctuationIndicatesLineBreak(llvm::StringRef Line) const { +bool Paragraph::punctuationIndicatesLineBreak(llvm::StringRef Line, + bool IsMarkdown) const { constexpr llvm::StringLiteral Punctuation = R"txt(.:,;!?)txt"; + if (!IsMarkdown && Line.ends_with(" ")) + return true; + Line = Line.rtrim(); return !Line.empty() && Punctuation.contains(Line.back()); } -bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest) const { +bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest, + bool IsMarkdown) const { + // Plaintext indicators: // '-'/'*' md list, '@'/'\' documentation command, '>' md blockquote, - // '#' headings, '`' code blocks, two spaces (markdown force newline) - constexpr llvm::StringLiteral LinebreakIndicators = R"txt(-*@\>#`)txt"; + // '#' headings, '`' code blocks + constexpr llvm::StringLiteral LinebreakIndicatorsPlainText = + R"txt(-*@\>#`)txt"; + // Markdown indicators: + // Only '@' and '\' documentation commands/escaped markdown syntax. 
+ constexpr llvm::StringLiteral LinebreakIndicatorsMarkdown = R"txt(@\)txt"; Rest = Rest.ltrim(" \t"); if (Rest.empty()) return false; - if (LinebreakIndicators.contains(Rest.front())) + if (IsMarkdown) + return LinebreakIndicatorsMarkdown.contains(Rest.front()); + + if (LinebreakIndicatorsPlainText.contains(Rest.front())) return true; if (llvm::isDigit(Rest.front())) { @@ -575,64 +627,18 @@ bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest) const { return false; } -bool Paragraph::isHardLineBreakAfter(llvm::StringRef Line, - llvm::StringRef Rest) const { - // In Markdown, 2 spaces before a line break forces a line break. - // Add a line break for plaintext in this case too. +bool Paragraph::isHardLineBreakAfter(llvm::StringRef Line, llvm::StringRef Rest, + bool IsMarkdown) const { // Should we also consider whether Line is short? - return Line.ends_with(" ") || punctuationIndicatesLineBreak(Line) || - isHardLineBreakIndicator(Rest); + return punctuationIndicatesLineBreak(Line, IsMarkdown) || + isHardLineBreakIndicator(Rest, IsMarkdown); } -void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { - bool NeedsSpace = false; - std::string ConcatenatedText; - ConcatenatedText.reserve(EstimatedStringSize); - - llvm::raw_string_ostream ConcatenatedOS(ConcatenatedText); - - for (auto &C : Chunks) { - - if (C.Kind == ChunkKind::PlainText) { - if (C.SpaceBefore || NeedsSpace) - ConcatenatedOS << ' '; - - ConcatenatedOS << C.Contents; - NeedsSpace = llvm::isSpace(C.Contents.back()) || C.SpaceAfter; - continue; - } - - if (C.SpaceBefore || NeedsSpace) - ConcatenatedOS << ' '; - llvm::StringRef Marker = ""; - if (C.Preserve && C.Kind == ChunkKind::InlineCode) - Marker = chooseMarker({"`", "'", "\""}, C.Contents); - else if (C.Kind == ChunkKind::Bold) - Marker = "**"; - else if (C.Kind == ChunkKind::Emphasized) - Marker = "*"; - ConcatenatedOS << Marker << C.Contents << Marker; - NeedsSpace = C.SpaceAfter; - } - - // We go through the contents line by line to handle the newlines - // and required spacing correctly. - // - // Newlines are added if: - // - the line ends with 2 spaces and a newline follows - // - the line ends with punctuation that indicates a line break (.:,;!?) - // - the next line starts with a hard line break indicator (-@>#`, or a digit - // followed by '.' or ')'), ignoring leading whitespace. - // - // Otherwise, newlines in the input are replaced with a single space. - // - // Multiple spaces are collapsed into a single space. - // - // Lines containing only whitespace are ignored. +void Paragraph::renderNewlinesPlaintext(llvm::raw_ostream &OS, + llvm::StringRef ParagraphText) const { llvm::StringRef Line, Rest; - for (std::tie(Line, Rest) = - llvm::StringRef(ConcatenatedText).trim().split('\n'); + for (std::tie(Line, Rest) = ParagraphText.trim().split('\n'); !(Line.empty() && Rest.empty()); std::tie(Line, Rest) = Rest.split('\n')) { @@ -653,7 +659,7 @@ void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { OS << canonicalizeSpaces(Line); - if (isHardLineBreakAfter(Line, Rest)) + if (isHardLineBreakAfter(Line, Rest, /*IsMarkdown=*/false)) OS << '\n'; else if (!Rest.empty()) // Since we removed any trailing whitespace from the input using trim(), @@ -661,6 +667,40 @@ void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { // Therefore, we can add a space without worrying about trailing spaces. 
OS << ' '; } +} + +void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { + bool NeedsSpace = false; + std::string ParagraphText; + ParagraphText.reserve(EstimatedStringSize); + + llvm::raw_string_ostream ParagraphTextOS(ParagraphText); + + for (auto &C : Chunks) { + + if (C.Kind == ChunkKind::PlainText) { + if (C.SpaceBefore || NeedsSpace) + ParagraphTextOS << ' '; + + ParagraphTextOS << C.Contents; + NeedsSpace = llvm::isSpace(C.Contents.back()) || C.SpaceAfter; + continue; + } + + if (C.SpaceBefore || NeedsSpace) + ParagraphTextOS << ' '; + llvm::StringRef Marker = ""; + if (C.Preserve && C.Kind == ChunkKind::InlineCode) + Marker = chooseMarker({"`", "'", "\""}, C.Contents); + else if (C.Kind == ChunkKind::Bold) + Marker = "**"; + else if (C.Kind == ChunkKind::Emphasized) + Marker = "*"; + ParagraphTextOS << Marker << C.Contents << Marker; + NeedsSpace = C.SpaceAfter; + } + + renderNewlinesPlaintext(OS, ParagraphText); // Paragraphs are separated by a blank line. OS << "\n\n"; diff --git a/clang-tools-extra/clangd/support/Markup.h b/clang-tools-extra/clangd/support/Markup.h index eea6328..219a7da 100644 --- a/clang-tools-extra/clangd/support/Markup.h +++ b/clang-tools-extra/clangd/support/Markup.h @@ -92,9 +92,84 @@ private: llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, llvm::StringRef Text) const; - bool punctuationIndicatesLineBreak(llvm::StringRef Line) const; - bool isHardLineBreakIndicator(llvm::StringRef Rest) const; - bool isHardLineBreakAfter(llvm::StringRef Line, llvm::StringRef Rest) const; + + /// Checks if the given line ends with punctuation that indicates a line break + /// (.:,;!?). + /// + /// If \p IsMarkdown is false, lines ending with 2 spaces are also considered + /// as indicating a line break. This is not needed for markdown because the + /// client renderer will handle this case. + bool punctuationIndicatesLineBreak(llvm::StringRef Line, + bool IsMarkdown) const; + + /// Checks if the given line starts with a hard line break indicator. + /// + /// If \p IsMarkdown is true, only '@' and '\' are considered as indicators. + /// Otherwise, '-', '*', '@', '\', '>', '#', '`' and a digit followed by '.' + /// or ')' are also considered as indicators. + bool isHardLineBreakIndicator(llvm::StringRef Rest, bool IsMarkdown) const; + + /// Checks if a hard line break should be added after the given line. + bool isHardLineBreakAfter(llvm::StringRef Line, llvm::StringRef Rest, + bool IsMarkdown) const; + + /// \brief Go through the contents line by line to handle the newlines + /// and required spacing correctly for markdown rendering. + /// + /// Newlines are added if: + /// - the line ends with punctuation that indicates a line break (.:,;!?) + /// - the next line starts with a hard line break indicator \\ (escaped + /// markdown/doxygen command) or @ (doxygen command) + /// + /// This newline handling is only used when the client requests markdown + /// for hover/signature help content. + /// Markdown does not add any newlines inside paragraphs unless the user + /// explicitly adds them. For hover/signature help content, we still want to + /// add newlines in some cases to improve readability, especially when doxygen + /// parsing is disabled or not implemented (like for signature help). + /// Therefore we add newlines in the above mentioned cases. + /// + /// In addition to that, we need to consider that the user can configure + /// clangd to treat documentation comments as plain text, while the client + /// requests markdown. 
+ /// In this case, all markdown syntax is escaped and will + /// not be rendered as expected by markdown. + /// Examples are lists starting with '-' or headings starting with '#'. + /// With the above next line heuristics, these cases are also covered by the + /// '\\' new line indicator. + /// + /// FIXME: The heuristic fails e.g. for lists starting with '*' because it is + /// also used for emphasis in markdown and should not be treated as a newline. + /// + /// \param OS The stream to render to. + /// \param ParagraphText The text of the paragraph to render. + void renderNewlinesMarkdown(llvm::raw_ostream &OS, + llvm::StringRef ParagraphText) const; + + /// \brief Go through the contents line by line to handle the newlines + /// and required spacing correctly for plain text rendering. + /// + /// Newlines are added if: + /// - the line ends with 2 spaces and a newline follows + /// - the line ends with punctuation that indicates a line break (.:,;!?) + /// - the next line starts with a hard line break indicator (-@>#`\\ or a + /// digit followed by '.' or ')'), ignoring leading whitespace. + /// + /// Otherwise, newlines in the input are replaced with a single space. + /// + /// Multiple spaces are collapsed into a single space. + /// + /// Lines containing only whitespace are ignored. + /// + /// This newline handling is only used when the client requests plain + /// text for hover/signature help content. + /// Therefore with this approach we mimic the behavior of markdown rendering + /// for these clients. + /// + /// \param OS The stream to render to. + /// \param ParagraphText The text of the paragraph to render. + void renderNewlinesPlaintext(llvm::raw_ostream &OS, + llvm::StringRef ParagraphText) const; }; /// Represents a sequence of one or more documents. Knows how to print them in a diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 718c1bc..eb858ff 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -4087,16 +4087,16 @@ Parameters: @brief brief doc -longer doc +longer doc @note this is a note -As you see, notes are "inlined". +As you see, notes are "inlined". @warning this is a warning -As well as warnings -@param a this is a param -@return it returns something -@retval 0 if successful +As well as warnings +@param a this is a param +@return it returns something +@retval 0 if successful @retval 1 if failed --- @@ -4166,9 +4166,9 @@ Parameters: @brief brief doc -longer doc -@param a this is a param -@param b does not exist +longer doc +@param a this is a param +@param b does not exist @return it returns something --- @@ -4315,19 +4315,19 @@ TEST(Hover, ParseDocumentation) { }, { "foo.\nbar", - "foo.\nbar", - "foo.\nbar", + "foo. \nbar", + "foo. \nbar", "foo.\nbar", }, { "foo. \nbar", - "foo. \nbar", - "foo. \nbar", + "foo. \nbar", + "foo. 
\nbar", "foo.\nbar", }, { "foo\n*bar", - "foo\n\\*bar", + "foo \n\\*bar", "foo\n*bar", "foo\n*bar", }, @@ -4354,6 +4354,24 @@ TEST(Hover, ParseDocumentation) { "\\`not\nparsed\\`", "`not\nparsed`", "`not parsed`", + }, + { + R"(@brief this is a typical use case +@param x this is x +\param y this is y +@return something)", + R"(@brief this is a typical use case +@param x this is x +\\param y this is y +@return something)", + R"(@brief this is a typical use case +@param x this is x +\param y this is y +@return something)", + R"(@brief this is a typical use case +@param x this is x +\param y this is y +@return something)", }}; for (const auto &C : Cases) { diff --git a/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp b/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp index b3185cc..676f7df 100644 --- a/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp +++ b/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp @@ -195,10 +195,10 @@ More description documentation)", normal text<i>this is an italic text</i> <code>this is a code block</code>)", R"(\<b>this is a bold text\</b> -normal text\<i>this is an italic text\</i> +normal text\<i>this is an italic text\</i> \<code>this is a code block\</code>)", R"(\<b>this is a bold text\</b> -normal text\<i>this is an italic text\</i> +normal text\<i>this is an italic text\</i> \<code>this is a code block\</code>)", "<b>this is a bold text</b> normal text<i>this is an italic text</i> " "<code>this is a code block</code>", @@ -712,10 +712,10 @@ TEST(SymbolDocumentation, MarkdownCodeSpans) { line \c span`)", R"(\`multi -line +line \\c span\`)", R"(`multi -line +line \c span`)", R"(`multi line \c span`)"}, diff --git a/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp b/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp index 5f91f31..af4782c0 100644 --- a/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp +++ b/clang-tools-extra/clangd/unittests/support/MarkupTests.cpp @@ -304,9 +304,9 @@ TEST(Paragraph, SeparationOfChunks) { P.appendSpace().appendCode("code").appendText(".\n newline"); EXPECT_EQ(P.asEscapedMarkdown(), - "after `foobar` bat`no` `space` text `code`.\n newline"); + "after `foobar` bat`no` `space` text `code`. \n newline"); EXPECT_EQ(P.asMarkdown(), - "after `foobar` bat`no` `space` text `code`.\n newline"); + "after `foobar` bat`no` `space` text `code`. 
\n newline"); EXPECT_EQ(P.asPlainText(), "after foobar batno space text code.\nnewline"); } @@ -371,21 +371,117 @@ TEST(Paragraph, SeparationOfChunks3) { EXPECT_EQ(P.asPlainText(), "after\nfoobar"); P.appendText("- bat\n"); - EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar\n\\- bat"); + EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar \n\\- bat"); EXPECT_EQ(P.asMarkdown(), "after \n foobar\n- bat"); EXPECT_EQ(P.asPlainText(), "after\nfoobar\n- bat"); P.appendText("- baz"); - EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar\n\\- bat\n\\- baz"); + EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar \n\\- bat \n\\- baz"); EXPECT_EQ(P.asMarkdown(), "after \n foobar\n- bat\n- baz"); EXPECT_EQ(P.asPlainText(), "after\nfoobar\n- bat\n- baz"); P.appendText(" faz "); - EXPECT_EQ(P.asEscapedMarkdown(), "after \n foobar\n\\- bat\n\\- baz faz"); + EXPECT_EQ(P.asEscapedMarkdown(), + "after \n foobar \n\\- bat \n\\- baz faz"); EXPECT_EQ(P.asMarkdown(), "after \n foobar\n- bat\n- baz faz"); EXPECT_EQ(P.asPlainText(), "after\nfoobar\n- bat\n- baz faz"); } +TEST(Paragraph, PunctuationLineBreaks) { + + struct { + std::string Text; + std::string EscapedMarkdown; + std::string Markdown; + std::string PlainText; + } Cases[] = { + {"Line ending with dot.\nForces a visual linebreak.", + "Line ending with dot. \nForces a visual linebreak.", + "Line ending with dot. \nForces a visual linebreak.", + "Line ending with dot.\nForces a visual linebreak."}, + {"Line ending with colon:\nForces a visual linebreak.", + "Line ending with colon: \nForces a visual linebreak.", + "Line ending with colon: \nForces a visual linebreak.", + "Line ending with colon:\nForces a visual linebreak."}, + {"Line ending with semicolon:\nForces a visual linebreak.", + "Line ending with semicolon: \nForces a visual linebreak.", + "Line ending with semicolon: \nForces a visual linebreak.", + "Line ending with semicolon:\nForces a visual linebreak."}, + {"Line ending with comma,\nForces a visual linebreak.", + "Line ending with comma, \nForces a visual linebreak.", + "Line ending with comma, \nForces a visual linebreak.", + "Line ending with comma,\nForces a visual linebreak."}, + {"Line ending with exclamation mark!\nForces a visual linebreak.", + "Line ending with exclamation mark! \nForces a visual linebreak.", + "Line ending with exclamation mark! \nForces a visual linebreak.", + "Line ending with exclamation mark!\nForces a visual linebreak."}, + {"Line ending with question mark?\nForces a visual linebreak.", + "Line ending with question mark? \nForces a visual linebreak.", + "Line ending with question mark? 
\nForces a visual linebreak.", + "Line ending with question mark?\nForces a visual linebreak."}, + }; + + for (const auto &C : Cases) { + Paragraph P; + P.appendText(C.Text); + EXPECT_EQ(P.asEscapedMarkdown(), C.EscapedMarkdown); + EXPECT_EQ(P.asMarkdown(), C.Markdown); + EXPECT_EQ(P.asPlainText(), C.PlainText); + } +} + +TEST(Paragraph, LineBreakIndicators) { + + struct { + std::string Text; + std::string EscapedMarkdown; + std::string Markdown; + std::string PlainText; + } Cases[] = { + {"Visual linebreak for\n- list items\n- and so on", + "Visual linebreak for \n\\- list items \n\\- and so on", + "Visual linebreak for\n- list items\n- and so on", + "Visual linebreak for\n- list items\n- and so on"}, + {"Visual linebreak for\n* list items\n* and so on", + "Visual linebreak for \n\\* list items \n\\* and so on", + "Visual linebreak for\n* list items\n* and so on", + "Visual linebreak for\n* list items\n* and so on"}, + {"Visual linebreak for\n@command any doxygen command\n\\other other " + "doxygen command", + "Visual linebreak for \n@command any doxygen command \n\\\\other " + "other doxygen command", + "Visual linebreak for \n@command any doxygen command \n\\other other " + "doxygen command", + "Visual linebreak for\n@command any doxygen command\n\\other other " + "doxygen command"}, + {"Visual linebreak for\n>blockquoute line 1\n> blockquoute line 2", + "Visual linebreak for \n\\>blockquoute line 1 \n\\> blockquoute line " + "2", + "Visual linebreak for\n>blockquoute line 1\n> blockquoute line 2", + "Visual linebreak for\n>blockquoute line 1\n> blockquoute line 2"}, + {"Visual linebreak for\n# Heading 1\ntext under heading\n## Heading " + "2\ntext under heading 2", + "Visual linebreak for \n\\# Heading 1\ntext under heading \n\\## " + "Heading 2\ntext under heading 2", + "Visual linebreak for\n# Heading 1\ntext under heading\n## Heading " + "2\ntext under heading 2", + "Visual linebreak for\n# Heading 1 text under heading\n## Heading 2 " + "text under heading 2"}, + {"Visual linebreak for\n`inline code`", + "Visual linebreak for \n\\`inline code\\`", + "Visual linebreak for\n`inline code`", + "Visual linebreak for\n`inline code`"}, + }; + + for (const auto &C : Cases) { + Paragraph P; + P.appendText(C.Text); + EXPECT_EQ(P.asEscapedMarkdown(), C.EscapedMarkdown); + EXPECT_EQ(P.asMarkdown(), C.Markdown); + EXPECT_EQ(P.asPlainText(), C.PlainText); + } +} + TEST(Paragraph, ExtraSpaces) { // Make sure spaces inside chunks are preserved for markdown // and dropped for plain text. diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 915b793..8f4be0d 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -70,6 +70,11 @@ Potentially Breaking Changes :doc:`bugprone-signed-char-misuse <clang-tidy/checks/bugprone/signed-char-misuse>` +- :program:`clang-tidy` now displays warnings from all non-system headers by + default. Previously, users had to explicitly opt-in to header warnings using + `-header-filter='.*'`. To disable warnings from non-system, set `-header-filter` + to an empty string. + Improvements to clangd ---------------------- @@ -132,6 +137,11 @@ Improvements to clang-tidy when run over C files. If ``-std`` is not specified, it defaults to ``c99-or-later``. +- :program:`clang-tidy` now displays warnings from all non-system headers by + default. Previously, users had to explicitly opt-in to header warnings using + `-header-filter='.*'`. 
To disable warnings from non-system, set `-header-filter` + to an empty string. + - :program:`clang-tidy` no longer attempts to analyze code from system headers by default, greatly improving performance. This behavior is disabled if the `SystemHeaders` option is enabled. @@ -407,7 +417,8 @@ Changes in existing checks - Improved :doc:`performance-unnecessary-value-param <clang-tidy/checks/performance/unnecessary-value-param>` by printing - the type of the diagnosed variable. + the type of the diagnosed variable and correctly generating fix-it hints for + parameter-pack arguments. - Improved :doc:`portability-template-virtual-member-function <clang-tidy/checks/portability/template-virtual-member-function>` check to diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst index bd2c40e..6ff82bf 100644 --- a/clang-tools-extra/docs/clang-tidy/index.rst +++ b/clang-tools-extra/docs/clang-tidy/index.rst @@ -215,7 +215,9 @@ An overview of all the command-line options: This option overrides the 'FormatStyle` option in .clang-tidy file, if any. --header-filter=<string> - Regular expression matching the names of the - headers to output diagnostics from. Diagnostics + headers to output diagnostics from. The default + value is '.*', i.e. diagnostics from all non-system + headers are displayed by default. Diagnostics from the main file of each translation unit are always displayed. Can be used together with -line-filter. @@ -338,7 +340,7 @@ An overview of all the command-line options: WarningsAsErrors: '' HeaderFileExtensions: ['', 'h','hh','hpp','hxx'] ImplementationFileExtensions: ['c','cc','cpp','cxx'] - HeaderFilterRegex: '' + HeaderFilterRegex: '.*' FormatStyle: none InheritParentConfig: true User: user diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp index 2949d7f..f6eb7c5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-internal-dependencies.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-no-internal-dependencies %t, -- -- -I %S/Inputs +// RUN: %check_clang_tidy %s abseil-no-internal-dependencies %t, -- -header-filter='' -- -I %S/Inputs // RUN: clang-tidy -checks='-*, abseil-no-internal-dependencies' -header-filter='.*' %s -- -I %S/Inputs 2>&1 | FileCheck %s #include "absl/strings/internal-file.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp index 78821c3..c8a5752 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/no-namespace.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-no-namespace %t -- -- -I %S/Inputs +// RUN: %check_clang_tidy %s abseil-no-namespace %t -- -header-filter='' -- -I %S/Inputs // RUN: clang-tidy -checks='-*, abseil-no-namespace' -header-filter='.*' %s -- -I %S/Inputs 2>&1 | FileCheck %s /// Warning will not be triggered on internal Abseil code that is included. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp index 0f36efe..b17e890 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/reserved-identifier.cpp @@ -1,8 +1,9 @@ -// RUN: %check_clang_tidy %s bugprone-reserved-identifier %t -- -- \ +// RUN: %check_clang_tidy %s bugprone-reserved-identifier %t -- \ +// RUN: -header-filter='' -- \ // RUN: -I%S/Inputs/reserved-identifier \ // RUN: -isystem %S/Inputs/reserved-identifier/system -// no warnings expected without -header-filter= +// no warnings expected with -header-filter='' #include "user-header.h" #include <system-header.h> diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp index edb11b9..5b30541 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp @@ -1,5 +1,5 @@ -// RUN: %check_clang_tidy %s google-upgrade-googletest-case %t -- -- -I%S/Inputs -// RUN: %check_clang_tidy -check-suffix=NOSUITE %s google-upgrade-googletest-case %t -- -- -DNOSUITE -I%S/Inputs/gtest/nosuite +// RUN: %check_clang_tidy %s google-upgrade-googletest-case %t -- -- -isystem%S/Inputs +// RUN: %check_clang_tidy -check-suffix=NOSUITE %s google-upgrade-googletest-case %t -- -- -DNOSUITE -isystem%S/Inputs/gtest/nosuite #include "gtest/gtest.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp index 2281c1a..371f3dd 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-replace-auto-ptr %t -- -- -I %S/Inputs/replace-auto-ptr +// RUN: %check_clang_tidy %s modernize-replace-auto-ptr %t -- -- -isystem %S/Inputs/replace-auto-ptr // CHECK-FIXES: #include <utility> diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp index 8288f39..5b8eca2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-use-using %t -- -- -fno-delayed-template-parsing -I %S/Inputs/use-using/ +// RUN: %check_clang_tidy %s modernize-use-using %t -- -- -fno-delayed-template-parsing -isystem %S/Inputs/use-using/ typedef int Type; // CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp index 688c79b..61758c5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp @@ -96,3 +96,34 @@ void lambdaNonConstAutoValue() { }; fn(ExpensiveToCopyType()); } + +template <typename... Args> +void ParameterPack(Args... 
args) { + // CHECK-MESSAGES: [[@LINE-1]]:28: warning: the parameter 'args' of type 'ExpensiveToCopyType' + // CHECK-FIXES: void ParameterPack(const Args&... args) { +} + +template <typename... Args> +void ParameterPackConst(Args const... args) { + // CHECK-MESSAGES: [[@LINE-1]]:39: warning: the const qualified parameter 'args' of type 'const ExpensiveToCopyType' + // CHECK-FIXES: void ParameterPackConst(Args const&... args) { +} + +template <typename... Args> +void ParameterPackWithParams(const ExpensiveToCopyType E1, ExpensiveToCopyType E2, Args... args) { + // CHECK-MESSAGES: [[@LINE-1]]:56: warning: the const qualified parameter 'E1' + // CHECK-MESSAGES: [[@LINE-2]]:80: warning: the parameter 'E2' + // CHECK-MESSAGES: [[@LINE-3]]:92: warning: the parameter 'args' + // CHECK-FIXES: void ParameterPackWithParams(const ExpensiveToCopyType& E1, const ExpensiveToCopyType& E2, const Args&... args) { +} + +template <typename... Args> +void PackWithNonExpensive(int x, Args... args) {} + +void instantiatedParameterPack() { + ExpensiveToCopyType E; + ParameterPack(E); + ParameterPackConst(E); + ParameterPackWithParams(E, E, E); + PackWithNonExpensive(5, 5); +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp index 223f077..c452f69 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/duplicate-include.cpp @@ -1,4 +1,6 @@ -// RUN: %check_clang_tidy %s readability-duplicate-include %t -- -- -isystem %S/Inputs/duplicate-include/system -I %S/Inputs/duplicate-include +// RUN: %check_clang_tidy %s readability-duplicate-include %t -- \ +// RUN: -header-filter='' \ +// RUN: -- -isystem %S/Inputs/duplicate-include/system -I %S/Inputs/duplicate-include int a; #include <string.h> diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp index 9180733..1d06df3 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp @@ -86,7 +86,9 @@ // RUN: readability-identifier-naming.LocalPointerPrefix: 'l_', \ // RUN: readability-identifier-naming.LocalConstantPointerCase: CamelCase, \ // RUN: readability-identifier-naming.LocalConstantPointerPrefix: 'lc_', \ -// RUN: }}' -- -fno-delayed-template-parsing -Dbad_macro \ +// RUN: }}' \ +// RUN: -header-filter='' \ +// RUN: -- -fno-delayed-template-parsing -Dbad_macro \ // RUN: -I%S/Inputs/identifier-naming \ // RUN: -isystem %S/Inputs/identifier-naming/system @@ -95,8 +97,7 @@ #include <system-header.h> #include <coroutines.h> #include "user-header.h" -// NO warnings or fixes expected from declarations within header files without -// the -header-filter= option +// NO warnings or fixes expected from declarations with the -header-filter='' option namespace FOO_NS { // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: invalid case style for namespace 'FOO_NS' [readability-identifier-naming] diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp new file mode 100644 index 0000000..489b302 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/default-header-filter.cpp @@ -0,0 +1,27 @@ + +// RUN: clang-tidy 
-checks='-*,google-explicit-constructor' --config='{}' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-DEFAULT %s +// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' -header-filter='' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-EMPTY %s +// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' -header-filter='.*' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-EXPLICIT %s +// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-NO-SYSTEM %s +// RUN: clang-tidy -checks='-*,google-explicit-constructor' --config='{}' -system-headers %s -- -I %S/Inputs/file-filter -isystem %S/Inputs/file-filter/system 2>&1 | FileCheck --check-prefix=CHECK-WITH-SYSTEM %s + +#include "header1.h" +// CHECK-DEFAULT: header1.h:1:12: warning: single-argument constructors must be marked explicit +// CHECK-EMPTY-NOT: header1.h:1:12: warning: +// CHECK-EXPLICIT: header1.h:1:12: warning: single-argument constructors must be marked explicit +// CHECK-NO-SYSTEM: header1.h:1:12: warning: single-argument constructors must be marked explicit +// CHECK-WITH-SYSTEM-DAG: header1.h:1:12: warning: single-argument constructors must be marked explicit + +#include <system-header.h> +// CHECK-DEFAULT-NOT: system-header.h:1:12: warning: +// CHECK-EMPTY-NOT: system-header.h:1:12: warning: +// CHECK-EXPLICIT-NOT: system-header.h:1:12: warning: +// CHECK-NO-SYSTEM-NOT: system-header.h:1:12: warning: +// CHECK-WITH-SYSTEM-DAG: system-header.h:1:12: warning: single-argument constructors must be marked explicit + +class A { A(int); }; +// CHECK-DEFAULT: :[[@LINE-1]]:11: warning: single-argument constructors must be marked explicit +// CHECK-EMPTY: :[[@LINE-2]]:11: warning: single-argument constructors must be marked explicit +// CHECK-EXPLICIT: :[[@LINE-3]]:11: warning: single-argument constructors must be marked explicit +// CHECK-NO-SYSTEM: :[[@LINE-4]]:11: warning: single-argument constructors must be marked explicit +// CHECK-WITH-SYSTEM: :[[@LINE-5]]:11: warning: single-argument constructors must be marked explicit diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp index d9ec104..485e9fb 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp @@ -66,7 +66,7 @@ class A { A(int); }; // CHECK4-NOT: warning: // CHECK4-QUIET-NOT: warning: -// CHECK: Use -header-filter=.* to display errors from all non-system headers. +// CHECK: Use -header-filter=.* or leave it as default to display errors from all non-system headers. // CHECK-QUIET-NOT: Suppressed // CHECK2-QUIET-NOT: Suppressed // CHECK3: Use -header-filter=.* {{.*}} diff --git a/clang/docs/AllocToken.rst b/clang/docs/AllocToken.rst index b65e18c..1a740e5 100644 --- a/clang/docs/AllocToken.rst +++ b/clang/docs/AllocToken.rst @@ -49,6 +49,39 @@ change or removal. These may (experimentally) be selected with ``-Xclang * ``increment``: This mode assigns a simple, incrementally increasing token ID to each allocation site. +The following command-line options affect generated token IDs: + +* ``-falloc-token-max=<N>`` + Configures the maximum number of tokens. 
No max by default (tokens bounded + by ``SIZE_MAX``). + +Querying Token IDs with ``__builtin_infer_alloc_token`` +======================================================= + +For use cases where the token ID must be known at compile time, Clang provides +a builtin function: + +.. code-block:: c + + size_t __builtin_infer_alloc_token(<args>, ...); + +This builtin returns the token ID inferred from its argument expressions, which +mirror arguments normally passed to any allocation function. The argument +expressions are **unevaluated**, so it can be used with expressions that would +have side effects without any runtime impact. + +For example, it can be used as follows: + +.. code-block:: c + + struct MyType { ... }; + void *__partition_alloc(size_t size, size_t partition); + #define partition_alloc(...) __partition_alloc(__VA_ARGS__, __builtin_infer_alloc_token(__VA_ARGS__)) + + void foo(void) { + MyType *x = partition_alloc(sizeof(*x)); + } + Allocation Token Instrumentation ================================ @@ -70,16 +103,6 @@ example: // Instrumented: ptr = __alloc_token_malloc(size, <token id>); -The following command-line options affect generated token IDs: - -* ``-falloc-token-max=<N>`` - Configures the maximum number of tokens. No max by default (tokens bounded - by ``SIZE_MAX``). - - .. code-block:: console - - % clang++ -fsanitize=alloc-token -falloc-token-max=512 example.cc - Runtime Interface ----------------- diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e6e33e7..add1582 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -281,6 +281,9 @@ Non-comprehensive list of changes in this release allocator-level heap organization strategies. A feature to instrument all allocation functions with a token ID can be enabled via the ``-fsanitize=alloc-token`` flag. +- A builtin ``__builtin_infer_alloc_token(<args>, ...)`` is provided to allow + compile-time querying of allocation token IDs, where the builtin arguments + mirror those normally passed to an allocation function. - Clang now rejects the invalid use of ``constexpr`` with ``auto`` and an explicit type in C. (#GH163090) diff --git a/clang/docs/ThreadSafetyAnalysis.rst b/clang/docs/ThreadSafetyAnalysis.rst index 853a8fa..d0f96f5 100644 --- a/clang/docs/ThreadSafetyAnalysis.rst +++ b/clang/docs/ThreadSafetyAnalysis.rst @@ -118,7 +118,7 @@ require exclusive access, while read operations require only shared access. At any given moment during program execution, a thread holds a specific set of capabilities (e.g. the set of mutexes that it has locked.) These act like keys or tokens that allow the thread to access a given resource. Just like physical -security keys, a thread cannot make copy of a capability, nor can it destroy +security keys, a thread cannot make a copy of a capability, nor can it destroy one. A thread can only release a capability to another thread, or acquire one from another thread. The annotations are deliberately agnostic about the exact mechanism used to acquire and release capabilities; it assumes that the @@ -131,7 +131,7 @@ by calculating an approximation of that set, called the *capability environment*. The capability environment is calculated for every program point, and describes the set of capabilities that are statically known to be held, or not held, at that particular point. 
This environment is a conservative -approximation of the full set of capabilities that will actually held by a +approximation of the full set of capabilities that will actually be held by a thread at run-time. @@ -369,7 +369,7 @@ thread-safe, but too complicated for the analysis to understand. Reasons for void unsafeIncrement() NO_THREAD_SAFETY_ANALYSIS { a++; } }; -Unlike the other attributes, NO_THREAD_SAFETY_ANALYSIS is not part of the +Unlike the other attributes, ``NO_THREAD_SAFETY_ANALYSIS`` is not part of the interface of a function, and should thus be placed on the function definition (in the ``.cc`` or ``.cpp`` file) rather than on the function declaration (in the header). @@ -509,7 +509,7 @@ ASSERT_CAPABILITY(...) and ASSERT_SHARED_CAPABILITY(...) *Previously:* ``ASSERT_EXCLUSIVE_LOCK``, ``ASSERT_SHARED_LOCK`` These are attributes on a function or method which asserts the calling thread -already holds the given capability, for example by performing a run-time test +already holds the given capability, for example, by performing a run-time test and terminating if the capability is not held. Presence of this annotation causes the analysis to assume the capability is held after calls to the annotated function. See :ref:`mutexheader`, below, for example uses. @@ -554,19 +554,19 @@ Negative Capabilities ===================== Thread Safety Analysis is designed to prevent both race conditions and -deadlock. The GUARDED_BY and REQUIRES attributes prevent race conditions, by +deadlock. The ``GUARDED_BY`` and ``REQUIRES`` attributes prevent race conditions, by ensuring that a capability is held before reading or writing to guarded data, -and the EXCLUDES attribute prevents deadlock, by making sure that a mutex is +and the ``EXCLUDES`` attribute prevents deadlock, by making sure that a mutex is *not* held. -However, EXCLUDES is an optional attribute, and does not provide the same -safety guarantee as REQUIRES. In particular: +However, ``EXCLUDES`` is an optional attribute, and does not provide the same +safety guarantee as ``REQUIRES``. In particular: * A function which acquires a capability does not have to exclude it. * A function which calls a function that excludes a capability does not - have transitively exclude that capability. + have to transitively exclude that capability. -As a result, EXCLUDES can easily produce false negatives: +As a result, ``EXCLUDES`` can easily produce false negatives: .. code-block:: c++ @@ -594,8 +594,8 @@ As a result, EXCLUDES can easily produce false negatives: }; -Negative requirements are an alternative EXCLUDES that provide -a stronger safety guarantee. A negative requirement uses the REQUIRES +Negative requirements are an alternative to ``EXCLUDES`` that provide +a stronger safety guarantee. A negative requirement uses the ``REQUIRES`` attribute, in conjunction with the ``!`` operator, to indicate that a capability should *not* be held. @@ -642,7 +642,7 @@ Frequently Asked Questions (A) Attributes are part of the formal interface of a function, and should always go in the header, where they are visible to anything that includes -the header. Attributes in the .cpp file are not visible outside of the +the header. Attributes in the ``.cpp`` file are not visible outside of the immediate translation unit, which leads to false negatives and false positives. 
@@ -684,7 +684,7 @@ Private Mutexes --------------- Good software engineering practice dictates that mutexes should be private -members, because the locking mechanism used by a thread-safe class is part of +members because the locking mechanism used by a thread-safe class is part of its internal implementation. However, private mutexes can sometimes leak into the public interface of a class. Thread safety attributes follow normal C++ access restrictions, so if ``mu`` diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index a2c2021..2b400b0 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5030,6 +5030,12 @@ def HLSLWaveActiveMax : LangBuiltin<"HLSL_LANG"> { let Prototype = "void (...)"; } +def HLSLWaveActiveMin : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_wave_active_min"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void (...)"; +} + def HLSLWaveActiveSum : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_wave_active_sum"]; let Attributes = [NoThrow, Const]; diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 0c85e28..500aa85 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1282,81 +1282,99 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def knotdi : X86Builtin<"unsigned long long int(unsigned long long int)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def cmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def cmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">; def cmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def cmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def cmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def cmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">; def cmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, 
RequiredVectorWidth<256>] in { def cmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def cmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def cmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">; def cmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def cmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def ucmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def ucmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">; def ucmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def ucmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def ucmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def ucmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">; def ucmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def ucmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", + Attributes = 
[NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def ucmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def ucmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">; def ucmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">; } diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h index e2e88d4..10028186d 100644 --- a/clang/include/clang/Frontend/TextDiagnostic.h +++ b/clang/include/clang/Frontend/TextDiagnostic.h @@ -16,10 +16,12 @@ #define LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H #include "clang/Frontend/DiagnosticRenderer.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/FormattedStream.h" namespace clang { +using llvm::formatted_raw_ostream; + /// Class to encapsulate the logic for formatting and printing a textual /// diagnostic message. /// @@ -33,7 +35,7 @@ namespace clang { /// DiagnosticClient is implemented through this class as is diagnostic /// printing coming out of libclang. class TextDiagnostic : public DiagnosticRenderer { - raw_ostream &OS; + formatted_raw_ostream OS; const Preprocessor *PP; public: @@ -47,7 +49,7 @@ public: unsigned End; enum llvm::raw_ostream::Colors Color; StyleRange(unsigned S, unsigned E, enum llvm::raw_ostream::Colors C) - : Start(S), End(E), Color(C){}; + : Start(S), End(E), Color(C) {}; }; /// Print the diagonstic level to a raw_ostream. diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 0d2316f..dad8efd0 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -7677,7 +7677,7 @@ private: /// [GNU] asm-clobbers: /// asm-string-literal /// asm-clobbers ',' asm-string-literal - /// \endverbatim + /// \endverbatim /// StmtResult ParseAsmStatement(bool &msAsm); diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8f23001..b3ab82d 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -859,7 +859,7 @@ static bool interp__builtin_carryop(InterpState &S, CodePtr OpPC, APSInt RHS = popToAPSInt(S.Stk, RHST); APSInt LHS = popToAPSInt(S.Stk, LHST); - if (CarryOutPtr.isDummy()) + if (CarryOutPtr.isDummy() || !CarryOutPtr.isBlockPointer()) return false; APSInt CarryOut; @@ -3296,6 +3296,60 @@ static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC, return true; } +static bool evalICmpImm(uint8_t Imm, const APSInt &A, const APSInt &B, + bool IsUnsigned) { + switch (Imm & 0x7) { + case 0x00: // _MM_CMPINT_EQ + return (A == B); + case 0x01: // _MM_CMPINT_LT + return IsUnsigned ? A.ult(B) : A.slt(B); + case 0x02: // _MM_CMPINT_LE + return IsUnsigned ? 
A.ule(B) : A.sle(B); + case 0x03: // _MM_CMPINT_FALSE + return false; + case 0x04: // _MM_CMPINT_NE + return (A != B); + case 0x05: // _MM_CMPINT_NLT + return IsUnsigned ? A.uge(B) : A.sge(B); + case 0x06: // _MM_CMPINT_NLE + return IsUnsigned ? A.ugt(B) : A.sgt(B); + case 0x07: // _MM_CMPINT_TRUE + return true; + default: + llvm_unreachable("Invalid Op"); + } +} + +static bool interp__builtin_ia32_cmp_mask(InterpState &S, CodePtr OpPC, + const CallExpr *Call, unsigned ID, + bool IsUnsigned) { + assert(Call->getNumArgs() == 4); + + APSInt Mask = popToAPSInt(S, Call->getArg(3)); + APSInt Opcode = popToAPSInt(S, Call->getArg(2)); + unsigned CmpOp = static_cast<unsigned>(Opcode.getZExtValue()); + const Pointer &RHS = S.Stk.pop<Pointer>(); + const Pointer &LHS = S.Stk.pop<Pointer>(); + + assert(LHS.getNumElems() == RHS.getNumElems()); + + APInt RetMask = APInt::getZero(LHS.getNumElems()); + unsigned VectorLen = LHS.getNumElems(); + PrimType ElemT = LHS.getFieldDesc()->getPrimType(); + + for (unsigned ElemNum = 0; ElemNum < VectorLen; ++ElemNum) { + APSInt A, B; + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + A = LHS.elem<T>(ElemNum).toAPSInt(); + B = RHS.elem<T>(ElemNum).toAPSInt(); + }); + RetMask.setBitVal(ElemNum, + Mask[ElemNum] && evalICmpImm(CmpOp, A, B, IsUnsigned)); + } + pushInteger(S, RetMask, Call->getType()); + return true; +} + static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC, const CallExpr *Call) { assert(Call->getNumArgs() == 1); @@ -4488,6 +4542,35 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_vec_set_v4di: return interp__builtin_vec_set(S, OpPC, Call, BuiltinID); + case X86::BI__builtin_ia32_cmpb128_mask: + case X86::BI__builtin_ia32_cmpw128_mask: + case X86::BI__builtin_ia32_cmpd128_mask: + case X86::BI__builtin_ia32_cmpq128_mask: + case X86::BI__builtin_ia32_cmpb256_mask: + case X86::BI__builtin_ia32_cmpw256_mask: + case X86::BI__builtin_ia32_cmpd256_mask: + case X86::BI__builtin_ia32_cmpq256_mask: + case X86::BI__builtin_ia32_cmpb512_mask: + case X86::BI__builtin_ia32_cmpw512_mask: + case X86::BI__builtin_ia32_cmpd512_mask: + case X86::BI__builtin_ia32_cmpq512_mask: + return interp__builtin_ia32_cmp_mask(S, OpPC, Call, BuiltinID, + /*IsUnsigned=*/false); + + case X86::BI__builtin_ia32_ucmpb128_mask: + case X86::BI__builtin_ia32_ucmpw128_mask: + case X86::BI__builtin_ia32_ucmpd128_mask: + case X86::BI__builtin_ia32_ucmpq128_mask: + case X86::BI__builtin_ia32_ucmpb256_mask: + case X86::BI__builtin_ia32_ucmpw256_mask: + case X86::BI__builtin_ia32_ucmpd256_mask: + case X86::BI__builtin_ia32_ucmpq256_mask: + case X86::BI__builtin_ia32_ucmpb512_mask: + case X86::BI__builtin_ia32_ucmpw512_mask: + case X86::BI__builtin_ia32_ucmpd512_mask: + case X86::BI__builtin_ia32_ucmpq512_mask: + return interp__builtin_ia32_cmp_mask(S, OpPC, Call, BuiltinID, + /*IsUnsigned=*/true); case X86::BI__builtin_ia32_pslldqi128_byteshift: case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 29ee089..d0404b9 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -15766,6 +15766,89 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, unsigned Idx = static_cast<unsigned>(IdxAPS.getZExtValue() & (N - 1)); return Success(Vec.getVectorElt(Idx).getInt(), E); } + + case clang::X86::BI__builtin_ia32_cmpb128_mask: + case clang::X86::BI__builtin_ia32_cmpw128_mask: + case 
clang::X86::BI__builtin_ia32_cmpd128_mask: + case clang::X86::BI__builtin_ia32_cmpq128_mask: + case clang::X86::BI__builtin_ia32_cmpb256_mask: + case clang::X86::BI__builtin_ia32_cmpw256_mask: + case clang::X86::BI__builtin_ia32_cmpd256_mask: + case clang::X86::BI__builtin_ia32_cmpq256_mask: + case clang::X86::BI__builtin_ia32_cmpb512_mask: + case clang::X86::BI__builtin_ia32_cmpw512_mask: + case clang::X86::BI__builtin_ia32_cmpd512_mask: + case clang::X86::BI__builtin_ia32_cmpq512_mask: + case clang::X86::BI__builtin_ia32_ucmpb128_mask: + case clang::X86::BI__builtin_ia32_ucmpw128_mask: + case clang::X86::BI__builtin_ia32_ucmpd128_mask: + case clang::X86::BI__builtin_ia32_ucmpq128_mask: + case clang::X86::BI__builtin_ia32_ucmpb256_mask: + case clang::X86::BI__builtin_ia32_ucmpw256_mask: + case clang::X86::BI__builtin_ia32_ucmpd256_mask: + case clang::X86::BI__builtin_ia32_ucmpq256_mask: + case clang::X86::BI__builtin_ia32_ucmpb512_mask: + case clang::X86::BI__builtin_ia32_ucmpw512_mask: + case clang::X86::BI__builtin_ia32_ucmpd512_mask: + case clang::X86::BI__builtin_ia32_ucmpq512_mask: { + assert(E->getNumArgs() == 4); + + bool IsUnsigned = + (BuiltinOp >= clang::X86::BI__builtin_ia32_ucmpb128_mask && + BuiltinOp <= clang::X86::BI__builtin_ia32_ucmpq512_mask); + + APValue LHS, RHS; + APSInt Mask, Opcode; + if (!EvaluateVector(E->getArg(0), LHS, Info) || + !EvaluateVector(E->getArg(1), RHS, Info) || + !EvaluateInteger(E->getArg(2), Opcode, Info) || + !EvaluateInteger(E->getArg(3), Mask, Info)) + return false; + + assert(LHS.getVectorLength() == RHS.getVectorLength()); + + unsigned VectorLen = LHS.getVectorLength(); + unsigned RetWidth = Mask.getBitWidth(); + + APSInt RetMask(llvm::APInt(RetWidth, 0), /*isUnsigned=*/true); + + for (unsigned ElemNum = 0; ElemNum < VectorLen; ++ElemNum) { + const APSInt &A = LHS.getVectorElt(ElemNum).getInt(); + const APSInt &B = RHS.getVectorElt(ElemNum).getInt(); + bool Result = false; + + switch (Opcode.getExtValue() & 0x7) { + case 0: // _MM_CMPINT_EQ + Result = (A == B); + break; + case 1: // _MM_CMPINT_LT + Result = IsUnsigned ? A.ult(B) : A.slt(B); + break; + case 2: // _MM_CMPINT_LE + Result = IsUnsigned ? A.ule(B) : A.sle(B); + break; + case 3: // _MM_CMPINT_FALSE + Result = false; + break; + case 4: // _MM_CMPINT_NE + Result = (A != B); + break; + case 5: // _MM_CMPINT_NLT (>=) + Result = IsUnsigned ? A.uge(B) : A.sge(B); + break; + case 6: // _MM_CMPINT_NLE (>) + Result = IsUnsigned ? 
A.ugt(B) : A.sgt(B); + break; + case 7: // _MM_CMPINT_TRUE + Result = true; + break; + } + + RetMask.setBitVal(ElemNum, Mask[ElemNum] && Result); + } + + return Success(APValue(RetMask), E); + } } } diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp index 0411bcc..8de1083 100644 --- a/clang/lib/Basic/Targets/BPF.cpp +++ b/clang/lib/Basic/Targets/BPF.cpp @@ -75,6 +75,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__BPF_FEATURE_GOTOL"); Builder.defineMacro("__BPF_FEATURE_ST"); Builder.defineMacro("__BPF_FEATURE_LOAD_ACQ_STORE_REL"); + Builder.defineMacro("__BPF_FEATURE_GOTOX"); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 50d585d..e5066fa 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -108,11 +108,11 @@ public: cir::LongDoubleType getLongDoubleTy(const llvm::fltSemantics &format) const { if (&format == &llvm::APFloat::IEEEdouble()) - return cir::LongDoubleType::get(getContext(), typeCache.DoubleTy); + return cir::LongDoubleType::get(getContext(), typeCache.doubleTy); if (&format == &llvm::APFloat::x87DoubleExtended()) - return cir::LongDoubleType::get(getContext(), typeCache.FP80Ty); + return cir::LongDoubleType::get(getContext(), typeCache.fP80Ty); if (&format == &llvm::APFloat::IEEEquad()) - return cir::LongDoubleType::get(getContext(), typeCache.FP128Ty); + return cir::LongDoubleType::get(getContext(), typeCache.fP128Ty); if (&format == &llvm::APFloat::PPCDoubleDouble()) llvm_unreachable("NYI: PPC double-double format for long double"); llvm_unreachable("Unsupported format for long double"); @@ -258,17 +258,17 @@ public: } } - cir::VoidType getVoidTy() { return typeCache.VoidTy; } + cir::VoidType getVoidTy() { return typeCache.voidTy; } - cir::IntType getSInt8Ty() { return typeCache.SInt8Ty; } - cir::IntType getSInt16Ty() { return typeCache.SInt16Ty; } - cir::IntType getSInt32Ty() { return typeCache.SInt32Ty; } - cir::IntType getSInt64Ty() { return typeCache.SInt64Ty; } + cir::IntType getSInt8Ty() { return typeCache.sInt8Ty; } + cir::IntType getSInt16Ty() { return typeCache.sInt16Ty; } + cir::IntType getSInt32Ty() { return typeCache.sInt32Ty; } + cir::IntType getSInt64Ty() { return typeCache.sInt64Ty; } - cir::IntType getUInt8Ty() { return typeCache.UInt8Ty; } - cir::IntType getUInt16Ty() { return typeCache.UInt16Ty; } - cir::IntType getUInt32Ty() { return typeCache.UInt32Ty; } - cir::IntType getUInt64Ty() { return typeCache.UInt64Ty; } + cir::IntType getUInt8Ty() { return typeCache.uInt8Ty; } + cir::IntType getUInt16Ty() { return typeCache.uInt16Ty; } + cir::IntType getUInt32Ty() { return typeCache.uInt32Ty; } + cir::IntType getUInt64Ty() { return typeCache.uInt64Ty; } cir::ConstantOp getConstInt(mlir::Location loc, llvm::APSInt intVal); @@ -280,21 +280,21 @@ public: llvm::APFloat fpVal); bool isInt8Ty(mlir::Type i) { - return i == typeCache.UInt8Ty || i == typeCache.SInt8Ty; + return i == typeCache.uInt8Ty || i == typeCache.sInt8Ty; } bool isInt16Ty(mlir::Type i) { - return i == typeCache.UInt16Ty || i == typeCache.SInt16Ty; + return i == typeCache.uInt16Ty || i == typeCache.sInt16Ty; } bool isInt32Ty(mlir::Type i) { - return i == typeCache.UInt32Ty || i == typeCache.SInt32Ty; + return i == typeCache.uInt32Ty || i == typeCache.sInt32Ty; } bool isInt64Ty(mlir::Type i) { - return i == typeCache.UInt64Ty || i == typeCache.SInt64Ty; + return i == typeCache.uInt64Ty || i == typeCache.sInt64Ty; } bool isInt(mlir::Type 
i) { return mlir::isa<cir::IntType>(i); } // Fetch the type representing a pointer to unsigned int8 values. - cir::PointerType getUInt8PtrTy() { return typeCache.UInt8PtrTy; } + cir::PointerType getUInt8PtrTy() { return typeCache.uInt8PtrTy; } /// Get a CIR anonymous record type. cir::RecordType getAnonRecordTy(llvm::ArrayRef<mlir::Type> members, diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index 5046e09..a829678 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -362,7 +362,7 @@ static Address applyNonVirtualAndVirtualOffset( // not bytes. So the pointer must be cast to a byte pointer and back. mlir::Value ptr = addr.getPointer(); - mlir::Type charPtrType = cgf.cgm.UInt8PtrTy; + mlir::Type charPtrType = cgf.cgm.uInt8PtrTy; mlir::Value charPtr = cgf.getBuilder().createBitcast(ptr, charPtrType); mlir::Value adjusted = cir::PtrStrideOp::create( cgf.getBuilder(), loc, charPtrType, charPtr, baseOffset); @@ -1105,7 +1105,7 @@ mlir::Value CIRGenFunction::getVTTParameter(GlobalDecl gd, bool forVirtualBase, // We're the complete constructor, so get the VTT by name. cir::GlobalOp vtt = cgm.getVTables().getAddrOfVTT(rd); return builder.createVTTAddrPoint( - loc, builder.getPointerTo(cgm.VoidPtrTy), + loc, builder.getPointerTo(cgm.voidPtrTy), mlir::FlatSymbolRefAttr::get(vtt.getSymNameAttr()), subVTTIndex); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp index 8723a6e..930ae55 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp @@ -55,7 +55,7 @@ cir::CallOp CIRGenFunction::emitCoroIDBuiltinCall(mlir::Location loc, if (!builtin) { fnOp = cgm.createCIRBuiltinFunction( loc, cgm.builtinCoroId, - cir::FuncType::get({int32Ty, VoidPtrTy, VoidPtrTy, VoidPtrTy}, int32Ty), + cir::FuncType::get({int32Ty, voidPtrTy, voidPtrTy, voidPtrTy}, int32Ty), /*FD=*/nullptr); assert(fnOp && "should always succeed"); } else { @@ -75,7 +75,7 @@ cir::CallOp CIRGenFunction::emitCoroAllocBuiltinCall(mlir::Location loc) { cir::FuncOp fnOp; if (!builtin) { fnOp = cgm.createCIRBuiltinFunction(loc, cgm.builtinCoroAlloc, - cir::FuncType::get({UInt32Ty}, boolTy), + cir::FuncType::get({uInt32Ty}, boolTy), /*fd=*/nullptr); assert(fnOp && "should always succeed"); } else { @@ -95,7 +95,7 @@ CIRGenFunction::emitCoroBeginBuiltinCall(mlir::Location loc, if (!builtin) { fnOp = cgm.createCIRBuiltinFunction( loc, cgm.builtinCoroBegin, - cir::FuncType::get({UInt32Ty, VoidPtrTy}, VoidPtrTy), + cir::FuncType::get({uInt32Ty, voidPtrTy}, voidPtrTy), /*fd=*/nullptr); assert(fnOp && "should always succeed"); } else { @@ -110,7 +110,7 @@ CIRGenFunction::emitCoroBeginBuiltinCall(mlir::Location loc, mlir::LogicalResult CIRGenFunction::emitCoroutineBody(const CoroutineBodyStmt &s) { mlir::Location openCurlyLoc = getLoc(s.getBeginLoc()); - cir::ConstantOp nullPtrCst = builder.getNullPtr(VoidPtrTy, openCurlyLoc); + cir::ConstantOp nullPtrCst = builder.getNullPtr(voidPtrTy, openCurlyLoc); auto fn = mlir::cast<cir::FuncOp>(curFn); fn.setCoroutine(true); diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index 5667273..aeea0ef 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -80,13 +80,13 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d, assert(!cir::MissingFeatures::openMP()); if (!didCallStackSave) { // Save the stack. 
- cir::PointerType defaultTy = AllocaInt8PtrTy; + cir::PointerType defaultTy = allocaInt8PtrTy; CharUnits align = CharUnits::fromQuantity( cgm.getDataLayout().getAlignment(defaultTy, false)); Address stack = createTempAlloca(defaultTy, align, loc, "saved_stack"); mlir::Value v = builder.createStackSave(loc, defaultTy); - assert(v.getType() == AllocaInt8PtrTy); + assert(v.getType() == allocaInt8PtrTy); builder.createStore(loc, v, stack); didCallStackSave = true; diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index df6ee56..5ccb431 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -2529,7 +2529,7 @@ CIRGenFunction::emitConditionalBlocks(const AbstractConditionalOperator *e, // If both arms are void, so be it. if (!yieldTy) - yieldTy = VoidTy; + yieldTy = voidTy; // Insert required yields. for (mlir::OpBuilder::InsertPoint &toInsert : insertPoints) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 8fe0d9b4..3d3030c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -490,7 +490,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy, for (uint64_t i = 0; i != numInitElements; ++i) { // Advance to the next element. if (i > 0) { - one = builder.getConstantInt(loc, cgf.PtrDiffTy, i); + one = builder.getConstantInt(loc, cgf.ptrDiffTy, i); element = builder.createPtrStride(loc, begin, one); } @@ -512,7 +512,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy, cgf.getTypes().isZeroInitializable(elementType))) { // Advance to the start of the rest of the array. if (numInitElements) { - one = builder.getConstantInt(loc, cgf.PtrDiffTy, 1); + one = builder.getConstantInt(loc, cgf.ptrDiffTy, 1); element = cir::PtrStrideOp::create(builder, loc, cirElementPtrType, element, one); } @@ -526,7 +526,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy, // Compute the end of array cir::ConstantOp numArrayElementsConst = builder.getConstInt( - loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), numArrayElements); + loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), numArrayElements); mlir::Value end = cir::PtrStrideOp::create(builder, loc, cirElementPtrType, begin, numArrayElementsConst); @@ -563,7 +563,7 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy, // Advance pointer and store them to temporary variable cir::ConstantOp one = builder.getConstInt( - loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), 1); + loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), 1); auto nextElement = cir::PtrStrideOp::create( builder, loc, cirElementPtrType, currentElement, one); cgf.emitStoreThroughLValue(RValue::get(nextElement), tmpLV); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index 7a35382..9dd9b6d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -257,12 +257,12 @@ static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e, if (!e->isArray()) { CharUnits typeSize = cgf.getContext().getTypeSizeInChars(type); sizeWithoutCookie = cgf.getBuilder().getConstant( - loc, cir::IntAttr::get(cgf.SizeTy, typeSize.getQuantity())); + loc, cir::IntAttr::get(cgf.sizeTy, typeSize.getQuantity())); return sizeWithoutCookie; } // The width of size_t. 
- unsigned sizeWidth = cgf.cgm.getDataLayout().getTypeSizeInBits(cgf.SizeTy); + unsigned sizeWidth = cgf.cgm.getDataLayout().getTypeSizeInBits(cgf.sizeTy); // The number of elements can be have an arbitrary integer type; // essentially, we need to multiply it by a constant factor, add a diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index 928e5aa..6af87a0 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -46,7 +46,7 @@ namespace { class ConstExprEmitter; static mlir::TypedAttr computePadding(CIRGenModule &cgm, CharUnits size) { - mlir::Type eltTy = cgm.UCharTy; + mlir::Type eltTy = cgm.uCharTy; clang::CharUnits::QuantityType arSize = size.getQuantity(); CIRGenBuilderTy &bld = cgm.getBuilder(); if (size > CharUnits::One()) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index db6878d..119314f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -762,9 +762,9 @@ public: // FIXME(cir): For now lets pretend we shouldn't use the conversion // intrinsics and insert a cast here unconditionally. src = builder.createCast(cgf.getLoc(loc), cir::CastKind::floating, src, - cgf.FloatTy); + cgf.floatTy); srcType = cgf.getContext().FloatTy; - mlirSrcType = cgf.FloatTy; + mlirSrcType = cgf.floatTy; } } @@ -1738,7 +1738,7 @@ mlir::Value ScalarExprEmitter::emitSub(const BinOpInfo &ops) { // // See more in `EmitSub` in CGExprScalar.cpp. assert(!cir::MissingFeatures::llvmLoweringPtrDiffConsidersPointee()); - return cir::PtrDiffOp::create(builder, cgf.getLoc(ops.loc), cgf.PtrDiffTy, + return cir::PtrDiffOp::create(builder, cgf.getLoc(ops.loc), cgf.ptrDiffTy, ops.lhs, ops.rhs); } @@ -2220,7 +2220,7 @@ mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr( "sizeof operator for VariableArrayType", e->getStmtClassName()); return builder.getConstant( - loc, cir::IntAttr::get(cgf.cgm.UInt64Ty, + loc, cir::IntAttr::get(cgf.cgm.uInt64Ty, llvm::APSInt(llvm::APInt(64, 1), true))); } } else if (e->getKind() == UETT_OpenMPRequiredSimdAlign) { @@ -2228,12 +2228,12 @@ mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr( e->getSourceRange(), "sizeof operator for OpenMpRequiredSimdAlign", e->getStmtClassName()); return builder.getConstant( - loc, cir::IntAttr::get(cgf.cgm.UInt64Ty, + loc, cir::IntAttr::get(cgf.cgm.uInt64Ty, llvm::APSInt(llvm::APInt(64, 1), true))); } return builder.getConstant( - loc, cir::IntAttr::get(cgf.cgm.UInt64Ty, + loc, cir::IntAttr::get(cgf.cgm.uInt64Ty, e->EvaluateKnownConstInt(cgf.getContext()))); } @@ -2329,14 +2329,14 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( mlir::Value lhs = Visit(lhsExpr); if (!lhs) { - lhs = builder.getNullValue(cgf.VoidTy, loc); + lhs = builder.getNullValue(cgf.voidTy, loc); lhsIsVoid = true; } mlir::Value rhs = Visit(rhsExpr); if (lhsIsVoid) { assert(!rhs && "lhs and rhs types must match"); - rhs = builder.getNullValue(cgf.VoidTy, loc); + rhs = builder.getNullValue(cgf.voidTy, loc); } return builder.createSelect(loc, condV, lhs, rhs); @@ -2381,7 +2381,7 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( if (!insertPoints.empty()) { // If both arms are void, so be it. if (!yieldTy) - yieldTy = cgf.VoidTy; + yieldTy = cgf.voidTy; // Insert required yields. 
for (mlir::OpBuilder::InsertPoint &toInsert : insertPoints) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 58feb36..71ff20a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -1008,7 +1008,7 @@ CIRGenFunction::emitArrayLength(const clang::ArrayType *origArrayType, if (isa<VariableArrayType>(arrayType)) { assert(cir::MissingFeatures::vlas()); cgm.errorNYI(*currSrcLoc, "VLAs"); - return builder.getConstInt(*currSrcLoc, SizeTy, 0); + return builder.getConstInt(*currSrcLoc, sizeTy, 0); } uint64_t countFromCLAs = 1; @@ -1037,7 +1037,7 @@ CIRGenFunction::emitArrayLength(const clang::ArrayType *origArrayType, } baseType = eltType; - return builder.getConstInt(*currSrcLoc, SizeTy, countFromCLAs); + return builder.getConstInt(*currSrcLoc, sizeTy, countFromCLAs); } mlir::Value CIRGenFunction::emitAlignmentAssumption( @@ -1074,7 +1074,7 @@ CIRGenFunction::getVLASize(const VariableArrayType *type) { elementType = type->getElementType(); mlir::Value vlaSize = vlaSizeMap[type->getSizeExpr()]; assert(vlaSize && "no size for VLA!"); - assert(vlaSize.getType() == SizeTy); + assert(vlaSize.getType() == sizeTy); if (!numElements) { numElements = vlaSize; @@ -1188,7 +1188,7 @@ void CIRGenFunction::emitVariablyModifiedType(QualType type) { // Always zexting here would be wrong if it weren't // undefined behavior to have a negative bound. // FIXME: What about when size's type is larger than size_t? - entry = builder.createIntCast(size, SizeTy); + entry = builder.createIntCast(size, sizeTy); } } type = vat->getElementType(); diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp index 88fedf1..f603f5ec 100644 --- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp @@ -1846,13 +1846,13 @@ mlir::Value CIRGenItaniumCXXABI::getVirtualBaseClassOffset( const CXXRecordDecl *classDecl, const CXXRecordDecl *baseClassDecl) { CIRGenBuilderTy &builder = cgf.getBuilder(); mlir::Value vtablePtr = cgf.getVTablePtr(loc, thisAddr, classDecl); - mlir::Value vtableBytePtr = builder.createBitcast(vtablePtr, cgm.UInt8PtrTy); + mlir::Value vtableBytePtr = builder.createBitcast(vtablePtr, cgm.uInt8PtrTy); CharUnits vbaseOffsetOffset = cgm.getItaniumVTableContext().getVirtualBaseOffsetOffset(classDecl, baseClassDecl); mlir::Value offsetVal = builder.getSInt64(vbaseOffsetOffset.getQuantity(), loc); - auto vbaseOffsetPtr = cir::PtrStrideOp::create(builder, loc, cgm.UInt8PtrTy, + auto vbaseOffsetPtr = cir::PtrStrideOp::create(builder, loc, cgm.uInt8PtrTy, vtableBytePtr, offsetVal); mlir::Value vbaseOffset; @@ -1861,9 +1861,9 @@ mlir::Value CIRGenItaniumCXXABI::getVirtualBaseClassOffset( cgm.errorNYI(loc, "getVirtualBaseClassOffset: relative layout"); } else { mlir::Value offsetPtr = builder.createBitcast( - vbaseOffsetPtr, builder.getPointerTo(cgm.PtrDiffTy)); + vbaseOffsetPtr, builder.getPointerTo(cgm.ptrDiffTy)); vbaseOffset = builder.createLoad( - loc, Address(offsetPtr, cgm.PtrDiffTy, cgf.getPointerAlign())); + loc, Address(offsetPtr, cgm.ptrDiffTy, cgf.getPointerAlign())); } return vbaseOffset; } @@ -2244,7 +2244,7 @@ Address CIRGenItaniumCXXABI::initializeArrayCookie(CIRGenFunction &cgf, // Write the number of elements into the appropriate slot. 
Address numElementsPtr = - cookiePtr.withElementType(cgf.getBuilder(), cgf.SizeTy); + cookiePtr.withElementType(cgf.getBuilder(), cgf.sizeTy); cgf.getBuilder().createStore(loc, numElements, numElementsPtr); // Finally, compute a pointer to the actual data buffer by skipping diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 46adfe2..9f9b2db 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -67,28 +67,28 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, abi(createCXXABI(*this)), genTypes(*this), vtables(*this) { // Initialize cached types - VoidTy = cir::VoidType::get(&getMLIRContext()); - VoidPtrTy = cir::PointerType::get(VoidTy); - SInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/true); - SInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/true); - SInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/true); - SInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/true); - SInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/true); - UInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/false); - UInt8PtrTy = cir::PointerType::get(UInt8Ty); + voidTy = cir::VoidType::get(&getMLIRContext()); + voidPtrTy = cir::PointerType::get(voidTy); + sInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/true); + sInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/true); + sInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/true); + sInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/true); + sInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/true); + uInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/false); + uInt8PtrTy = cir::PointerType::get(uInt8Ty); cirAllocaAddressSpace = getTargetCIRGenInfo().getCIRAllocaAddressSpace(); - UInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/false); - UInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/false); - UInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/false); - UInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/false); - FP16Ty = cir::FP16Type::get(&getMLIRContext()); - BFloat16Ty = cir::BF16Type::get(&getMLIRContext()); - FloatTy = cir::SingleType::get(&getMLIRContext()); - DoubleTy = cir::DoubleType::get(&getMLIRContext()); - FP80Ty = cir::FP80Type::get(&getMLIRContext()); - FP128Ty = cir::FP128Type::get(&getMLIRContext()); - - AllocaInt8PtrTy = cir::PointerType::get(UInt8Ty, cirAllocaAddressSpace); + uInt16Ty = cir::IntType::get(&getMLIRContext(), 16, /*isSigned=*/false); + uInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/false); + uInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/false); + uInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/false); + fP16Ty = cir::FP16Type::get(&getMLIRContext()); + bFloat16Ty = cir::BF16Type::get(&getMLIRContext()); + floatTy = cir::SingleType::get(&getMLIRContext()); + doubleTy = cir::DoubleType::get(&getMLIRContext()); + fP80Ty = cir::FP80Type::get(&getMLIRContext()); + fP128Ty = cir::FP128Type::get(&getMLIRContext()); + + allocaInt8PtrTy = cir::PointerType::get(uInt8Ty, cirAllocaAddressSpace); PointerAlignInBytes = astContext @@ -97,16 +97,16 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, .getQuantity(); const unsigned charSize = astContext.getTargetInfo().getCharWidth(); - UCharTy = cir::IntType::get(&getMLIRContext(), charSize, 
/*isSigned=*/false); + uCharTy = cir::IntType::get(&getMLIRContext(), charSize, /*isSigned=*/false); // TODO(CIR): Should be updated once TypeSizeInfoAttr is upstreamed const unsigned sizeTypeSize = astContext.getTypeSize(astContext.getSignedSizeType()); SizeSizeInBytes = astContext.toCharUnitsFromBits(sizeTypeSize).getQuantity(); // In CIRGenTypeCache, UIntPtrTy and SizeType are fields of the same union - UIntPtrTy = + uIntPtrTy = cir::IntType::get(&getMLIRContext(), sizeTypeSize, /*isSigned=*/false); - PtrDiffTy = + ptrDiffTy = cir::IntType::get(&getMLIRContext(), sizeTypeSize, /*isSigned=*/true); std::optional<cir::SourceLanguage> sourceLanguage = getCIRSourceLanguage(); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp index be063033..890f8a6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp @@ -617,11 +617,11 @@ void OpenACCRecipeBuilderBase::createReductionRecipeCombiner( if (const auto *cat = cgf.getContext().getAsConstantArrayType(origType)) { // If we're in an array, we have to emit the combiner for each element of // the array. - auto itrTy = mlir::cast<cir::IntType>(cgf.PtrDiffTy); + auto itrTy = mlir::cast<cir::IntType>(cgf.ptrDiffTy); auto itrPtrTy = cir::PointerType::get(itrTy); mlir::Value zero = - builder.getConstInt(loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), 0); + builder.getConstInt(loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), 0); mlir::Value itr = cir::AllocaOp::create(builder, loc, itrPtrTy, itrTy, "itr", cgf.cgm.getSize(cgf.getPointerAlign())); @@ -633,7 +633,7 @@ void OpenACCRecipeBuilderBase::createReductionRecipeCombiner( [&](mlir::OpBuilder &b, mlir::Location loc) { auto loadItr = cir::LoadOp::create(builder, loc, {itr}); mlir::Value arraySize = builder.getConstInt( - loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), cat->getZExtSize()); + loc, mlir::cast<cir::IntType>(cgf.ptrDiffTy), cat->getZExtSize()); auto cmp = builder.createCompare(loc, cir::CmpOpKind::lt, loadItr, arraySize); builder.createCondition(cmp); diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h index ff5842c..0f63e91 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h +++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h @@ -26,47 +26,47 @@ struct CIRGenTypeCache { CIRGenTypeCache() {} // ClangIR void type - cir::VoidType VoidTy; + cir::VoidType voidTy; // ClangIR signed integral types of common sizes - cir::IntType SInt8Ty; - cir::IntType SInt16Ty; - cir::IntType SInt32Ty; - cir::IntType SInt64Ty; - cir::IntType SInt128Ty; + cir::IntType sInt8Ty; + cir::IntType sInt16Ty; + cir::IntType sInt32Ty; + cir::IntType sInt64Ty; + cir::IntType sInt128Ty; // ClangIR unsigned integral type of common sizes - cir::IntType UInt8Ty; - cir::IntType UInt16Ty; - cir::IntType UInt32Ty; - cir::IntType UInt64Ty; - cir::IntType UInt128Ty; + cir::IntType uInt8Ty; + cir::IntType uInt16Ty; + cir::IntType uInt32Ty; + cir::IntType uInt64Ty; + cir::IntType uInt128Ty; // ClangIR floating-point types with fixed formats - cir::FP16Type FP16Ty; - cir::BF16Type BFloat16Ty; - cir::SingleType FloatTy; - cir::DoubleType DoubleTy; - cir::FP80Type FP80Ty; - cir::FP128Type FP128Ty; + cir::FP16Type fP16Ty; + cir::BF16Type bFloat16Ty; + cir::SingleType floatTy; + cir::DoubleType doubleTy; + cir::FP80Type fP80Ty; + cir::FP128Type fP128Ty; /// ClangIR char - mlir::Type UCharTy; + mlir::Type uCharTy; /// intptr_t, size_t, and ptrdiff_t, which we assume are the same size. 
union { - mlir::Type UIntPtrTy; - mlir::Type SizeTy; + mlir::Type uIntPtrTy; + mlir::Type sizeTy; }; - mlir::Type PtrDiffTy; + mlir::Type ptrDiffTy; /// void* in address space 0 - cir::PointerType VoidPtrTy; - cir::PointerType UInt8PtrTy; + cir::PointerType voidPtrTy; + cir::PointerType uInt8PtrTy; /// void* in alloca address space - cir::PointerType AllocaInt8PtrTy; + cir::PointerType allocaInt8PtrTy; /// The size and alignment of a pointer into the generic address space. union { diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp index d1b91d0..03618d4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp @@ -71,7 +71,7 @@ mlir::Type CIRGenTypes::convertFunctionTypeInternal(QualType qft) { if (!isFuncTypeConvertible(ft)) { cgm.errorNYI(SourceLocation(), "function type involving an incomplete type", qft); - return cir::FuncType::get(SmallVector<mlir::Type, 1>{}, cgm.VoidTy); + return cir::FuncType::get(SmallVector<mlir::Type, 1>{}, cgm.voidTy); } const CIRGenFunctionInfo *fi; @@ -298,7 +298,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { switch (cast<BuiltinType>(ty)->getKind()) { // void case BuiltinType::Void: - resultType = cgm.VoidTy; + resultType = cgm.voidTy; break; // bool @@ -338,42 +338,42 @@ mlir::Type CIRGenTypes::convertType(QualType type) { // Floating-point types case BuiltinType::Float16: - resultType = cgm.FP16Ty; + resultType = cgm.fP16Ty; break; case BuiltinType::Half: if (astContext.getLangOpts().NativeHalfType || !astContext.getTargetInfo().useFP16ConversionIntrinsics()) { - resultType = cgm.FP16Ty; + resultType = cgm.fP16Ty; } else { cgm.errorNYI(SourceLocation(), "processing of built-in type", type); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; } break; case BuiltinType::BFloat16: - resultType = cgm.BFloat16Ty; + resultType = cgm.bFloat16Ty; break; case BuiltinType::Float: assert(&astContext.getFloatTypeSemantics(type) == &llvm::APFloat::IEEEsingle() && "ClangIR NYI: 'float' in a format other than IEEE 32-bit"); - resultType = cgm.FloatTy; + resultType = cgm.floatTy; break; case BuiltinType::Double: assert(&astContext.getFloatTypeSemantics(type) == &llvm::APFloat::IEEEdouble() && "ClangIR NYI: 'double' in a format other than IEEE 64-bit"); - resultType = cgm.DoubleTy; + resultType = cgm.doubleTy; break; case BuiltinType::LongDouble: resultType = builder.getLongDoubleTy(astContext.getFloatTypeSemantics(type)); break; case BuiltinType::Float128: - resultType = cgm.FP128Ty; + resultType = cgm.fP128Ty; break; case BuiltinType::Ibm128: cgm.errorNYI(SourceLocation(), "processing of built-in type", type); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; break; case BuiltinType::NullPtr: @@ -386,7 +386,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { default: cgm.errorNYI(SourceLocation(), "processing of built-in type", type); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; break; } break; @@ -439,7 +439,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { // int X[] -> [0 x int], unless the element type is not sized. If it is // unsized (e.g. an incomplete record) just use [0 x i8]. 
if (!cir::isSized(elemTy)) { - elemTy = cgm.SInt8Ty; + elemTy = cgm.sInt8Ty; } resultType = cir::ArrayType::get(elemTy, 0); @@ -454,7 +454,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { // i8 just to have a concrete type" if (!cir::isSized(elemTy)) { cgm.errorNYI(SourceLocation(), "arrays of undefined struct type", type); - resultType = cgm.UInt32Ty; + resultType = cgm.uInt32Ty; break; } @@ -477,7 +477,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { // Return a placeholder 'i32' type. This can be changed later when the // type is defined (see UpdateCompletedType), but is likely to be the // "right" answer. - resultType = cgm.UInt32Ty; + resultType = cgm.uInt32Ty; break; } @@ -490,7 +490,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { const auto *bitIntTy = cast<BitIntType>(type); if (bitIntTy->getNumBits() > cir::IntType::maxBitwidth()) { cgm.errorNYI(SourceLocation(), "large _BitInt type", type); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; } else { resultType = cir::IntType::get(&getMLIRContext(), bitIntTy->getNumBits(), bitIntTy->isSigned()); @@ -515,7 +515,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { default: cgm.errorNYI(SourceLocation(), "processing of type", type->getTypeClassName()); - resultType = cgm.SInt32Ty; + resultType = cgm.sInt32Ty; break; } diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 2d2ef42..7ba03ce 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -286,14 +286,14 @@ void cir::ConditionOp::getSuccessorRegions( // Parent is a loop: condition may branch to the body or to the parent op. if (auto loopOp = dyn_cast<LoopOpInterface>(getOperation()->getParentOp())) { regions.emplace_back(&loopOp.getBody(), loopOp.getBody().getArguments()); - regions.emplace_back(loopOp->getResults()); + regions.emplace_back(getOperation(), loopOp->getResults()); } assert(!cir::MissingFeatures::awaitOp()); } MutableOperandRange -cir::ConditionOp::getMutableSuccessorOperands(RegionBranchPoint point) { +cir::ConditionOp::getMutableSuccessorOperands(RegionSuccessor point) { // No values are yielded to the successor region. return MutableOperandRange(getOperation(), 0, 0); } @@ -989,7 +989,8 @@ void cir::IfOp::getSuccessorRegions(mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { // The `then` and the `else` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } @@ -1039,7 +1040,7 @@ void cir::ScopeOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { // The only region always branch back to the parent operation. 
if (!point.isParent()) { - regions.push_back(RegionSuccessor(getODSResults(0))); + regions.push_back(RegionSuccessor(getOperation(), getODSResults(0))); return; } @@ -1124,7 +1125,8 @@ Block *cir::BrCondOp::getSuccessorForOperands(ArrayRef<Attribute> operands) { void cir::CaseOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { if (!point.isParent()) { - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } regions.push_back(RegionSuccessor(&getCaseRegion())); @@ -1188,7 +1190,8 @@ static void printSwitchOp(OpAsmPrinter &p, cir::SwitchOp op, void cir::SwitchOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ion) { if (!point.isParent()) { - region.push_back(RegionSuccessor()); + region.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } @@ -1402,7 +1405,8 @@ void cir::GlobalOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { // The `ctor` and `dtor` regions always branch back to the parent operation. if (!point.isParent()) { - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } @@ -1961,7 +1965,7 @@ void cir::TernaryOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { // The `true` and the `false` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(RegionSuccessor(this->getODSResults(0))); + regions.push_back(RegionSuccessor(getOperation(), this->getODSResults(0))); return; } @@ -2978,7 +2982,8 @@ void cir::TryOp::getSuccessorRegions( llvm::SmallVectorImpl<mlir::RegionSuccessor> ®ions) { // The `try` and the `catchers` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(mlir::RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp index 21c96fe..ca7554e 100644 --- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp +++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp @@ -606,10 +606,12 @@ public: // `cir.try_call`. llvm::SmallVector<cir::CallOp, 4> callsToRewrite; tryOp.getTryRegion().walk([&](CallOp op) { + if (op.getNothrow()) + return; + // Only grab calls within immediate closest TryOp scope. if (op->getParentOfType<cir::TryOp>() != tryOp) return; - assert(!cir::MissingFeatures::opCallExceptionAttr()); callsToRewrite.push_back(op); }); diff --git a/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp b/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp index 0ce5017..6de51f12 100644 --- a/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp +++ b/clang/lib/CIR/Interfaces/CIRLoopOpInterface.cpp @@ -17,7 +17,7 @@ namespace cir { void LoopOpInterface::getLoopOpSuccessorRegions( LoopOpInterface op, mlir::RegionBranchPoint point, llvm::SmallVectorImpl<mlir::RegionSuccessor> ®ions) { - assert(point.isParent() || point.getRegionOrNull()); + assert(point.isParent() || point.getTerminatorPredecessorOrNull()); // Branching to first region: go to condition or body (do-while). 
if (point.isParent()) { @@ -25,15 +25,18 @@ void LoopOpInterface::getLoopOpSuccessorRegions( return; } + mlir::Region *parentRegion = + point.getTerminatorPredecessorOrNull()->getParentRegion(); + // Branching from condition: go to body or exit. - if (&op.getCond() == point.getRegionOrNull()) { - regions.emplace_back(mlir::RegionSuccessor(op->getResults())); + if (&op.getCond() == parentRegion) { + regions.emplace_back(mlir::RegionSuccessor(op, op->getResults())); regions.emplace_back(&op.getBody(), op.getBody().getArguments()); return; } // Branching from body: go to step (for) or condition. - if (&op.getBody() == point.getRegionOrNull()) { + if (&op.getBody() == parentRegion) { // FIXME(cir): Should we consider break/continue statements here? mlir::Region *afterBody = (op.maybeGetStep() ? op.maybeGetStep() : &op.getCond()); @@ -42,7 +45,7 @@ void LoopOpInterface::getLoopOpSuccessorRegions( } // Branching from step: go to condition. - if (op.maybeGetStep() == point.getRegionOrNull()) { + if (op.maybeGetStep() == parentRegion) { regions.emplace_back(&op.getCond(), op.getCond().getArguments()); return; } diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index aefc262..3c31314 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -800,16 +800,6 @@ static void addSanitizers(const Triple &TargetTriple, MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles, PB.getVirtualFileSystemPtr())); } - - if (LangOpts.Sanitize.has(SanitizerKind::AllocToken)) { - if (Level == OptimizationLevel::O0) { - // The default pass builder only infers libcall function attrs when - // optimizing, so we insert it here because we need it for accurate - // memory allocation function detection. - MPM.addPass(InferFunctionAttrsPass()); - } - MPM.addPass(AllocTokenPass(getAllocTokenOptions(LangOpts, CodeGenOpts))); - } }; if (ClSanitizeOnOptimizerEarlyEP) { PB.registerOptimizerEarlyEPCallback( @@ -852,6 +842,23 @@ static void addSanitizers(const Triple &TargetTriple, } } +static void addAllocTokenPass(const Triple &TargetTriple, + const CodeGenOptions &CodeGenOpts, + const LangOptions &LangOpts, PassBuilder &PB) { + PB.registerOptimizerLastEPCallback([&](ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase) { + if (Level == OptimizationLevel::O0 && + LangOpts.Sanitize.has(SanitizerKind::AllocToken)) { + // The default pass builder only infers libcall function attrs when + // optimizing, so we insert it here because we need it for accurate + // memory allocation function detection with -fsanitize=alloc-token. 
+ MPM.addPass(InferFunctionAttrsPass()); + } + MPM.addPass(AllocTokenPass(getAllocTokenOptions(LangOpts, CodeGenOpts))); + }); +} + void EmitAssemblyHelper::RunOptimizationPipeline( BackendAction Action, std::unique_ptr<raw_pwrite_stream> &OS, std::unique_ptr<llvm::ToolOutputFile> &ThinLinkOS, BackendConsumer *BC) { @@ -1106,6 +1113,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (!IsThinLTOPostLink) { addSanitizers(TargetTriple, CodeGenOpts, LangOpts, PB); addKCFIPass(TargetTriple, LangOpts, PB); + addAllocTokenPass(TargetTriple, CodeGenOpts, LangOpts, PB); } if (std::optional<GCOVOptions> Options = diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index fd14cd6..b81e0d0 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4506,6 +4506,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(AI); } + case Builtin::BI__builtin_infer_alloc_token: { + llvm::MDNode *MDN = buildAllocToken(E); + llvm::Value *MDV = MetadataAsValue::get(getLLVMContext(), MDN); + llvm::Function *F = + CGM.getIntrinsic(llvm::Intrinsic::alloc_token_id, {IntPtrTy}); + llvm::CallBase *TokenID = Builder.CreateCall(F, MDV); + return RValue::get(TokenID); + } + case Builtin::BIbzero: case Builtin::BI__builtin_bzero: { Address Dest = EmitPointerWithAlignment(E->getArg(0)); diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 301d577..01f2161 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -2297,9 +2297,13 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr, CGM.getABIInfo().getOptimalVectorMemoryType(VecTy, getLangOpts()); if (!ClangVecTy->isPackedVectorBoolType(getContext()) && VecTy != NewVecTy) { - SmallVector<int, 16> Mask(NewVecTy->getNumElements(), -1); + SmallVector<int, 16> Mask(NewVecTy->getNumElements(), + VecTy->getNumElements()); std::iota(Mask.begin(), Mask.begin() + VecTy->getNumElements(), 0); - Value = Builder.CreateShuffleVector(Value, Mask, "extractVec"); + // Use undef instead of poison for the padding lanes, to make sure no + // padding bits are poisoned, which may break coercion. 
+ Value = Builder.CreateShuffleVector(Value, llvm::UndefValue::get(VecTy), + Mask, "extractVec"); SrcTy = NewVecTy; } if (Addr.getElementType() != SrcTy) diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 384bd59..fbf4a57 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -206,7 +206,7 @@ static Intrinsic::ID getWaveActiveSumIntrinsic(llvm::Triple::ArchType Arch, } } -// Return wave active sum that corresponds to the QT scalar type +// Return wave active max that corresponds to the QT scalar type static Intrinsic::ID getWaveActiveMaxIntrinsic(llvm::Triple::ArchType Arch, CGHLSLRuntime &RT, QualType QT) { switch (Arch) { @@ -225,6 +225,25 @@ static Intrinsic::ID getWaveActiveMaxIntrinsic(llvm::Triple::ArchType Arch, } } +// Return wave active min that corresponds to the QT scalar type +static Intrinsic::ID getWaveActiveMinIntrinsic(llvm::Triple::ArchType Arch, + CGHLSLRuntime &RT, QualType QT) { + switch (Arch) { + case llvm::Triple::spirv: + if (QT->isUnsignedIntegerType()) + return Intrinsic::spv_wave_reduce_umin; + return Intrinsic::spv_wave_reduce_min; + case llvm::Triple::dxil: { + if (QT->isUnsignedIntegerType()) + return Intrinsic::dx_wave_reduce_umin; + return Intrinsic::dx_wave_reduce_min; + } + default: + llvm_unreachable("Intrinsic WaveActiveMin" + " not supported by target architecture"); + } +} + // Returns the mangled name for a builtin function that the SPIR-V backend // will expand into a spec Constant. static std::string getSpecConstantFunctionName(clang::QualType SpecConstantType, @@ -742,6 +761,17 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, &CGM.getModule(), IID, {OpExpr->getType()}), ArrayRef{OpExpr}, "hlsl.wave.active.max"); } + case Builtin::BI__builtin_hlsl_wave_active_min: { + // Due to the use of variadic arguments, explicitly retreive argument + Value *OpExpr = EmitScalarExpr(E->getArg(0)); + Intrinsic::ID IID = getWaveActiveMinIntrinsic( + getTarget().getTriple().getArch(), CGM.getHLSLRuntime(), + E->getArg(0)->getType()); + + return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( + &CGM.getModule(), IID, {OpExpr->getType()}), + ArrayRef{OpExpr}, "hlsl.wave.active.min"); + } case Builtin::BI__builtin_hlsl_wave_get_lane_index: { // We don't define a SPIR-V intrinsic, instead it is a SPIR-V built-in // defined in SPIRVBuiltins.td. So instead we manually get the matching name diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 31c2f3f..507cc03 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -483,7 +483,8 @@ SanitizerMask Fuchsia::getSupportedSanitizers() const { Res |= SanitizerKind::Leak; Res |= SanitizerKind::Scudo; Res |= SanitizerKind::Thread; - if (getTriple().getArch() == llvm::Triple::x86_64) { + if (getTriple().getArch() == llvm::Triple::x86_64 || + getTriple().getArch() == llvm::Triple::x86) { Res |= SanitizerKind::SafeStack; } return Res; @@ -496,6 +497,7 @@ SanitizerMask Fuchsia::getDefaultSanitizers() const { case llvm::Triple::riscv64: Res |= SanitizerKind::ShadowCallStack; break; + case llvm::Triple::x86: case llvm::Triple::x86_64: Res |= SanitizerKind::SafeStack; break; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 1d0dfd0b..021d8c6 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2674,8 +2674,11 @@ private: } // *a or &a or &&a. 
- if (PreviousNotConst->is(TT_PointerOrReference)) + if (PreviousNotConst->is(TT_PointerOrReference) || + PreviousNotConst->endsSequence(tok::coloncolon, + TT_PointerOrReference)) { return true; + } // MyClass a; if (PreviousNotConst->isTypeName(LangOpts)) diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index 5888571..f5add2a 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -17,7 +17,6 @@ #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Locale.h" -#include "llvm/Support/raw_ostream.h" #include <algorithm> #include <optional> @@ -662,7 +661,7 @@ void TextDiagnostic::emitDiagnosticMessage( FullSourceLoc Loc, PresumedLoc PLoc, DiagnosticsEngine::Level Level, StringRef Message, ArrayRef<clang::CharSourceRange> Ranges, DiagOrStoredDiag D) { - uint64_t StartOfLocationInfo = OS.tell(); + uint64_t StartOfLocationInfo = OS.getColumn(); // Emit the location of this particular diagnostic. if (Loc.isValid()) @@ -675,8 +674,11 @@ void TextDiagnostic::emitDiagnosticMessage( printDiagnosticLevel(OS, Level, DiagOpts.ShowColors); printDiagnosticMessage(OS, /*IsSupplemental*/ Level == DiagnosticsEngine::Note, - Message, OS.tell() - StartOfLocationInfo, + Message, OS.getColumn() - StartOfLocationInfo, DiagOpts.MessageLength, DiagOpts.ShowColors); + // We use a formatted ostream, which does its own buffering. Flush here + // so we keep the proper order of output. + OS.flush(); } /*static*/ void @@ -1485,7 +1487,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine, if (CharStyle != Styles.end()) { if (!CurrentColor || (CurrentColor && *CurrentColor != CharStyle->Color)) { - OS.changeColor(CharStyle->Color, false); + OS.changeColor(CharStyle->Color); CurrentColor = CharStyle->Color; } } else if (CurrentColor) { diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 0fcfe37..263a107 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -2385,22 +2385,19 @@ _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) (__mmask32) __U); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_test_epi8_mask (__m128i __A, __m128i __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_test_epi8_mask(__m128i __A, __m128i __B) { return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_test_epi8_mask (__m256i __A, __m256i __B) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_test_epi8_mask(__m256i __A, __m256i __B) { return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B), _mm256_setzero_si256()); } @@ -2439,9 +2436,8 @@ _mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) _mm256_setzero_si256()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_testn_epi8_mask (__m128i __A, __m128i __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_testn_epi8_mask(__m128i __A, __m128i __B) { return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); } diff --git 
a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index d973371..a918af3 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -2598,6 +2598,129 @@ _HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_max) __attribute__((convergent)) double4 WaveActiveMax(double4); //===----------------------------------------------------------------------===// +// WaveActiveMin builtins +//===----------------------------------------------------------------------===// + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) half WaveActiveMin(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) half2 WaveActiveMin(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) half3 WaveActiveMin(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) half4 WaveActiveMin(half4); + +#ifdef __HLSL_ENABLE_16_BIT +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int16_t WaveActiveMin(int16_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int16_t2 WaveActiveMin(int16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int16_t3 WaveActiveMin(int16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int16_t4 WaveActiveMin(int16_t4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint16_t WaveActiveMin(uint16_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint16_t2 WaveActiveMin(uint16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint16_t3 WaveActiveMin(uint16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint16_t4 WaveActiveMin(uint16_t4); +#endif + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int WaveActiveMin(int); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int2 WaveActiveMin(int2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int3 WaveActiveMin(int3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int4 WaveActiveMin(int4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint WaveActiveMin(uint); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint2 WaveActiveMin(uint2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint3 WaveActiveMin(uint3); +_HLSL_AVAILABILITY(shadermodel, 6.0) 
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint4 WaveActiveMin(uint4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int64_t WaveActiveMin(int64_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int64_t2 WaveActiveMin(int64_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int64_t3 WaveActiveMin(int64_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) int64_t4 WaveActiveMin(int64_t4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint64_t WaveActiveMin(uint64_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint64_t2 WaveActiveMin(uint64_t2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint64_t3 WaveActiveMin(uint64_t3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) uint64_t4 WaveActiveMin(uint64_t4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) float WaveActiveMin(float); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) float2 WaveActiveMin(float2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) float3 WaveActiveMin(float3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) float4 WaveActiveMin(float4); + +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) double WaveActiveMin(double); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) double2 WaveActiveMin(double2); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) double3 WaveActiveMin(double3); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_active_min) +__attribute__((convergent)) double4 WaveActiveMin(double4); + +//===----------------------------------------------------------------------===// // WaveActiveSum builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 96d5142..94a490a 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -3279,6 +3279,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { break; } case Builtin::BI__builtin_hlsl_wave_active_max: + case Builtin::BI__builtin_hlsl_wave_active_min: case Builtin::BI__builtin_hlsl_wave_active_sum: { if (SemaRef.checkArgCount(TheCall, 1)) return true; diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index e9093b2..a90f636 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -1856,7 +1856,8 @@ namespace InitParam { #endif -namespace SAddOverflowInt { 
+namespace NonBlockPointerStore { int a; void foo(void) { a *= __builtin_sadd_overflow(1, 2, 0); } + void foo2(void) { a *= __builtin_addc(1, 2, 0, 0); } } diff --git a/clang/test/CIR/CodeGen/try-catch.cpp b/clang/test/CIR/CodeGen/try-catch.cpp index 1e4d2a6..27e3d8e 100644 --- a/clang/test/CIR/CodeGen/try-catch.cpp +++ b/clang/test/CIR/CodeGen/try-catch.cpp @@ -164,3 +164,33 @@ void try_catch_with_alloca() { // OGCG: %[[TMP_B:.*]] = load i32, ptr %[[B_ADDR]], align 4 // OGCG: %[[RESULT:.*]] = add nsw i32 %[[TMP_A]], %[[TMP_B]] // OGCG: store i32 %[[RESULT]], ptr %[[C_ADDR]], align 4 + +void function_with_noexcept() noexcept; + +void calling_noexcept_function_inside_try_block() { + try { + function_with_noexcept(); + } catch (...) { + } +} + +// CIR: cir.scope { +// CIR: cir.try { +// CIR: cir.call @_Z22function_with_noexceptv() nothrow : () -> () +// CIR: cir.yield +// CIR: } +// CIR: } + +// LLVM: br label %[[LABEL_1:.*]] +// LLVM: [[LABEL_1]]: +// LLVM: br label %[[LABEL_2:.*]] +// LLVM: [[LABEL_2]]: +// LLVM: call void @_Z22function_with_noexceptv() +// LLVM: br label %[[LABEL_3:.*]] +// LLVM: [[LABEL_3]]: +// LLVM: br label %[[LABEL_4:.*]] +// LLVM: [[LABEL_4]]: +// LLVM: ret void + +// OGCG: call void @_Z22function_with_noexceptv() +// OGCG: ret void diff --git a/clang/test/CodeGen/AArch64/ext-vector-coercion.c b/clang/test/CodeGen/AArch64/ext-vector-coercion.c new file mode 100644 index 0000000..354980a --- /dev/null +++ b/clang/test/CodeGen/AArch64/ext-vector-coercion.c @@ -0,0 +1,42 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -fenable-matrix -triple arm64-apple-macosx %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +typedef float float3 __attribute__((ext_vector_type(3))); +struct Vec3 { + union { + struct { + float x; + float y; + float z; + }; + float vec __attribute__((ext_vector_type(3))); + }; +}; + +// CHECK-LABEL: define i128 @add( +// CHECK-SAME: i128 [[A_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_VEC3:%.*]], align 16 +// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_VEC3]], align 16 +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: store i128 [[A_COERCE]], ptr [[COERCE_DIVE]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr [[TMP0]], align 16 +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: [[LOADVECN1:%.*]] = load <4 x float>, ptr [[TMP1]], align 16 +// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x float> [[LOADVECN1]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[ADD:%.*]] = fadd <3 x float> [[EXTRACTVEC]], [[EXTRACTVEC2]] +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <3 x float> [[ADD]], <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC3]], ptr [[TMP2]], align 16 +// CHECK-NEXT: [[COERCE_DIVE4:%.*]] = getelementptr inbounds nuw [[STRUCT_VEC3]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i128, ptr [[COERCE_DIVE4]], align 16 +// 
CHECK-NEXT: ret i128 [[TMP3]] +// +struct Vec3 add(struct Vec3 a) { + struct Vec3 res; + res.vec = a.vec + a.vec; + return res; +} + diff --git a/clang/test/CodeGen/AArch64/neon-across.c b/clang/test/CodeGen/AArch64/neon-across.c index d365975..aa0387d 100644 --- a/clang/test/CodeGen/AArch64/neon-across.c +++ b/clang/test/CodeGen/AArch64/neon-across.c @@ -49,7 +49,7 @@ uint32_t test_vaddlv_u16(uint16x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16 @@ -60,7 +60,7 @@ int16_t test_vaddlvq_s8(int8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: ret i32 [[VADDLV_I]] @@ -70,7 +70,7 @@ int32_t test_vaddlvq_s16(int16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i64 [[VADDLVQ_S32_I]] @@ -80,7 +80,7 @@ int64_t test_vaddlvq_s32(int32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16 @@ -91,7 +91,7 @@ uint16_t test_vaddlvq_u8(uint8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: ret i32 [[VADDLV_I]] @@ -101,7 +101,7 @@ uint32_t test_vaddlvq_u16(uint16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i64 [[VADDLVQ_U32_I]] @@ -155,7 +155,7 @@ uint16_t test_vmaxv_u16(uint16x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 @@ -166,7 +166,7 @@ int8_t test_vmaxvq_s8(int8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16 @@ -177,7 +177,7 @@ int16_t 
test_vmaxvq_s16(int16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMAXVQ_S32_I]] @@ -187,7 +187,7 @@ int32_t test_vmaxvq_s32(int32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8 @@ -198,7 +198,7 @@ uint8_t test_vmaxvq_u8(uint8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16 @@ -209,7 +209,7 @@ uint16_t test_vmaxvq_u16(uint16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMAXVQ_U32_I]] @@ -263,7 +263,7 @@ uint16_t test_vminv_u16(uint16x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 @@ -274,7 +274,7 @@ int8_t test_vminvq_s8(int8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16 @@ -285,7 +285,7 @@ int16_t test_vminvq_s16(int16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_s32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMINVQ_S32_I]] @@ -295,7 +295,7 @@ int32_t test_vminvq_s32(int32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8 @@ -306,7 +306,7 @@ uint8_t test_vminvq_u8(uint8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16 @@ 
-317,7 +317,7 @@ uint16_t test_vminvq_u16(uint16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_u32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VMINVQ_U32_I]] @@ -371,7 +371,7 @@ uint16_t test_vaddv_u16(uint16x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 @@ -382,7 +382,7 @@ int8_t test_vaddvq_s8(int8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16 @@ -393,7 +393,7 @@ int16_t test_vaddvq_s16(int16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VADDVQ_S32_I]] @@ -403,7 +403,7 @@ int32_t test_vaddvq_s32(int32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u8 -// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8 @@ -414,7 +414,7 @@ uint8_t test_vaddvq_u8(uint8x16_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u16 -// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A]]) // CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16 @@ -425,7 +425,7 @@ uint16_t test_vaddvq_u16(uint16x8_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u32 -// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A]]) // CHECK-NEXT: ret i32 [[VADDVQ_U32_I]] @@ -435,7 +435,7 @@ uint32_t test_vaddvq_u32(uint32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[A]]) // CHECK-NEXT: ret float [[VMAXVQ_F32_I]] @@ -445,7 +445,7 @@ float32_t test_vmaxvq_f32(float32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminvq_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[A]]) // CHECK-NEXT: 
ret float [[VMINVQ_F32_I]] @@ -455,7 +455,7 @@ float32_t test_vminvq_f32(float32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[A]]) // CHECK-NEXT: ret float [[VMAXNMVQ_F32_I]] @@ -465,7 +465,7 @@ float32_t test_vmaxnmvq_f32(float32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[A]]) // CHECK-NEXT: ret float [[VMINNMVQ_F32_I]] diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 116d86f..febef46 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -645,6 +645,21 @@ __mmask16 test_mm_cmp_epi8_mask(__m128i __a, __m128i __b) { return (__mmask16)_mm_cmp_epi8_mask(__a, __b, 0); } +TEST_CONSTEXPR(_mm_cmpeq_epi8_mask( + ((__m128i)(__v16qi){5, 3, 7, 2, 9, 3, 7, 1, 5, 4, 8, 2, 9, 6, 7, 5}), + ((__m128i)(__v16qi){5, 2, 7, 3, 9, 4, 6, 1, 5, 3, 8, 1, 9, 5, 7, 5}) +) == (__mmask16)0xd595); + +TEST_CONSTEXPR(_mm_cmplt_epi8_mask( + ((__m128i)(__v16qi){1, 5, 3, 7, 2, 8, 4, 6, 9, 5, 3, 11, 2, 6, 15, 8}), + ((__m128i)(__v16qi){2, 4, 6, 8, 3, 5, 7, 9, 4, 6, 8, 10, 5, 7, 9, 11}) +) == (__mmask16)0xb6dd); + +TEST_CONSTEXPR(_mm_cmple_epi8_mask( + ((__m128i)(__v16qi){1, 3, 5, 7, 2, 6, 6, 8, 1, 3, 9, 7, 2, 4, 6, 10}), + ((__m128i)(__v16qi){2, 3, 4, 7, 3, 4, 5, 8, 2, 3, 4, 7, 3, 4, 5, 8}) +) == (__mmask16)0x3b9b); + __mmask16 test_mm_mask_cmp_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { // CHECK-LABEL: test_mm_mask_cmp_epi8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} @@ -2894,6 +2909,12 @@ __mmask16 test_mm_test_epi8_mask(__m128i __A, __m128i __B) { return _mm_test_epi8_mask(__A, __B); } +TEST_CONSTEXPR(_mm_test_epi8_mask( + (__m128i)(__v16qi){1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + (__m128i)(__v16qi){1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} +) +== (__mmask16)0xfffb); + __mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_test_epi8_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} @@ -2901,6 +2922,12 @@ __mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return _mm_mask_test_epi8_mask(__U, __A, __B); } +TEST_CONSTEXPR(_mm_mask_test_epi8_mask( + 0xFFFF, + (__m128i)(__v16qi){1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + (__m128i)(__v16qi){1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} +) +== (__mmask16)0xfffb); __mmask32 test_mm256_test_epi8_mask(__m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_test_epi8_mask @@ -2908,6 +2935,11 @@ __mmask32 test_mm256_test_epi8_mask(__m256i __A, __m256i __B) { // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}} return _mm256_test_epi8_mask(__A, __B); } +TEST_CONSTEXPR(_mm256_test_epi8_mask( + (__m256i)(__v32qi){1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + (__m256i)(__v32qi){1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} +) +== 
(__mmask32)0xfffbfffb); __mmask32 test_mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_test_epi8_mask @@ -2954,6 +2986,12 @@ __mmask16 test_mm_testn_epi8_mask(__m128i __A, __m128i __B) { return _mm_testn_epi8_mask(__A, __B); } +TEST_CONSTEXPR(_mm_testn_epi8_mask( + (__m128i)(__v16qi){1, 2, 77, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 16, 16}, + (__m128i)(__v16qi){2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15} +) +== (__mmask16)0xe001); + __mmask16 test_mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_testn_epi8_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} diff --git a/clang/test/CodeGen/lto-newpm-pipeline.c b/clang/test/CodeGen/lto-newpm-pipeline.c index ea9784a..dceaaf1 100644 --- a/clang/test/CodeGen/lto-newpm-pipeline.c +++ b/clang/test/CodeGen/lto-newpm-pipeline.c @@ -32,10 +32,12 @@ // CHECK-FULL-O0-NEXT: Running pass: AlwaysInlinerPass // CHECK-FULL-O0-NEXT: Running analysis: ProfileSummaryAnalysis // CHECK-FULL-O0-NEXT: Running pass: CoroConditionalWrapper +// CHECK-FULL-O0-NEXT: Running pass: AllocTokenPass +// CHECK-FULL-O0-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis +// CHECK-FULL-O0-NEXT: Running analysis: TargetLibraryAnalysis // CHECK-FULL-O0-NEXT: Running pass: CanonicalizeAliasesPass // CHECK-FULL-O0-NEXT: Running pass: NameAnonGlobalPass // CHECK-FULL-O0-NEXT: Running pass: AnnotationRemarksPass -// CHECK-FULL-O0-NEXT: Running analysis: TargetLibraryAnalysis // CHECK-FULL-O0-NEXT: Running pass: VerifierPass // CHECK-FULL-O0-NEXT: Running pass: BitcodeWriterPass @@ -46,10 +48,12 @@ // CHECK-THIN-O0-NEXT: Running pass: AlwaysInlinerPass // CHECK-THIN-O0-NEXT: Running analysis: ProfileSummaryAnalysis // CHECK-THIN-O0-NEXT: Running pass: CoroConditionalWrapper +// CHECK-THIN-O0-NEXT: Running pass: AllocTokenPass +// CHECK-THIN-O0-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis +// CHECK-THIN-O0-NEXT: Running analysis: TargetLibraryAnalysis // CHECK-THIN-O0-NEXT: Running pass: CanonicalizeAliasesPass // CHECK-THIN-O0-NEXT: Running pass: NameAnonGlobalPass // CHECK-THIN-O0-NEXT: Running pass: AnnotationRemarksPass -// CHECK-THIN-O0-NEXT: Running analysis: TargetLibraryAnalysis // CHECK-THIN-O0-NEXT: Running pass: VerifierPass // CHECK-THIN-O0-NEXT: Running pass: ThinLTOBitcodeWriterPass diff --git a/clang/test/CodeGenCXX/alloc-token-builtin.cpp b/clang/test/CodeGenCXX/alloc-token-builtin.cpp new file mode 100644 index 0000000..adadf7b --- /dev/null +++ b/clang/test/CodeGenCXX/alloc-token-builtin.cpp @@ -0,0 +1,97 @@ +// To test IR generation of the builtin without evaluating the LLVM intrinsic, +// we set the mode to a stateful mode, which prohibits constant evaluation. 
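The new test below exercises __builtin_infer_alloc_token, whose result can be forwarded as an extra argument to a token-aware allocator (as test_builtin_unsequenced_unevaluated later does with my_malloc). A minimal caller-side sketch of that pattern, reusing the my_malloc(size, token) signature declared in the test; the Node type and make_node helper are illustrative only and not part of the patch:

extern "C" void *my_malloc(unsigned long size, unsigned long token);

struct Node {
  int value;
  Node *next; // pointer member, so the inferred token class differs from a pointer-free type
};

Node *make_node() {
  // The builtin appears to infer the allocated type from its argument
  // expression (sizeof(Node) here) and yields a token the allocator can use
  // to partition allocations by type class.
  return static_cast<Node *>(
      my_malloc(sizeof(Node), __builtin_infer_alloc_token(sizeof(Node))));
}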
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -Werror -std=c++20 -emit-llvm -falloc-token-mode=random -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-CODEGEN +// RUN: %clang_cc1 -triple x86_64-linux-gnu -Werror -std=c++20 -emit-llvm -falloc-token-max=2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LOWER + +extern "C" void *my_malloc(unsigned long, unsigned long); + +struct NoPtr { + int x; + long y; +}; + +struct WithPtr { + int a; + char *buf; +}; + +int unevaluated_fn(); + +// CHECK-LABEL: @_Z16test_builtin_intv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_INT:[0-9]+]]) +// CHECK-LOWER: ret i64 0 +unsigned long test_builtin_int() { + return __builtin_infer_alloc_token(sizeof(1)); +} + +// CHECK-LABEL: @_Z16test_builtin_ptrv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_PTR:[0-9]+]]) +// CHECK-LOWER: ret i64 1 +unsigned long test_builtin_ptr() { + return __builtin_infer_alloc_token(sizeof(int *)); +} + +// CHECK-LABEL: @_Z25test_builtin_struct_noptrv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_NOPTR:[0-9]+]]) +// CHECK-LOWER: ret i64 0 +unsigned long test_builtin_struct_noptr() { + return __builtin_infer_alloc_token(sizeof(NoPtr)); +} + +// CHECK-LABEL: @_Z25test_builtin_struct_w_ptrv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_WITHPTR:[0-9]+]]) +// CHECK-LOWER: ret i64 1 +unsigned long test_builtin_struct_w_ptr() { + return __builtin_infer_alloc_token(sizeof(WithPtr), 123); +} + +// CHECK-LABEL: @_Z24test_builtin_unevaluatedv( +// CHECK-NOT: call{{.*}}unevaluated_fn +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_INT:[0-9]+]]) +// CHECK-LOWER: ret i64 0 +unsigned long test_builtin_unevaluated() { + return __builtin_infer_alloc_token(sizeof(int) * unevaluated_fn()); +} + +// CHECK-LABEL: @_Z36test_builtin_unsequenced_unevaluatedi( +// CHECK: add nsw +// CHECK-NOT: add nsw +// CHECK-CODEGEN: %[[REG:[0-9]+]] = call i64 @llvm.alloc.token.id.i64(metadata ![[META_UNKNOWN:[0-9]+]]) +// CHECK-CODEGEN: call{{.*}}@my_malloc({{.*}}, i64 noundef %[[REG]]) +// CHECK-LOWER: call{{.*}}@my_malloc({{.*}}, i64 noundef 0) +void test_builtin_unsequenced_unevaluated(int x) { + my_malloc(++x, __builtin_infer_alloc_token(++x)); +} + +// CHECK-LABEL: @_Z20test_builtin_unknownv( +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_UNKNOWN:[0-9]+]]) +// CHECK-LOWER: ret i64 0 +unsigned long test_builtin_unknown() { + return __builtin_infer_alloc_token(4096); +} + +// Test template instantiation. 
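The CHECK-CODEGEN lines in this test match two-operand metadata nodes such as !{"int", i1 false}, where the string is the inferred type name and the i1 flag records whether the type contains a pointer. A rough sketch of how such a node passed to an @llvm.alloc.token.id call could be decoded; the helper name is made up and the operand layout is read off the CHECK lines rather than any documented API:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include <utility>
using namespace llvm;

static std::pair<StringRef, bool> decodeAllocTokenMD(const CallInst &CI) {
  auto *MD =
      cast<MDNode>(cast<MetadataAsValue>(CI.getArgOperand(0))->getMetadata());
  // The "unknown" case in this test is an empty node (!{}), so guard for it.
  if (MD->getNumOperands() < 2)
    return {StringRef(), false};
  StringRef TypeName = cast<MDString>(MD->getOperand(0))->getString();
  bool ContainsPointer =
      mdconst::extract<ConstantInt>(MD->getOperand(1))->isOne();
  return {TypeName, ContainsPointer};
}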
+template <typename T> +constexpr unsigned long get_token() { + return __builtin_infer_alloc_token(sizeof(T)); +} + +// CHECK-LABEL: @_Z13get_token_intv() +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_INT]]) +// CHECK-LOWER: ret i64 0 +unsigned long get_token_int() { + return get_token<int>(); +} + +// CHECK-LABEL: @_Z13get_token_ptrv() +// CHECK-CODEGEN: call i64 @llvm.alloc.token.id.i64(metadata ![[META_PTR]]) +// CHECK-LOWER: ret i64 1 +unsigned long get_token_ptr() { + return get_token<int *>(); +} + +// CHECK-CODEGEN: ![[META_INT]] = !{!"int", i1 false} +// CHECK-CODEGEN: ![[META_PTR]] = !{!"int *", i1 true} +// CHECK-CODEGEN: ![[META_NOPTR]] = !{!"NoPtr", i1 false} +// CHECK-CODEGEN: ![[META_WITHPTR]] = !{!"WithPtr", i1 true} +// CHECK-CODEGEN: ![[META_UNKNOWN]] = !{} diff --git a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp index 2e7531b..4be1cb3 100644 --- a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp +++ b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp @@ -19,7 +19,7 @@ using i4x3x3 = _BitInt(4) __attribute__((matrix_type(3, 3))); // CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4 // CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i8>, ptr [[A]], align 4 // CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVECN]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4 // CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 // CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVECN2]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> @@ -38,7 +38,7 @@ i8x3 v1(i8x3 a) { // CHECK-SAME: <3 x i32> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i32>, align 16 -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16 // CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 // CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVECN]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> @@ -57,7 +57,7 @@ i32x3 v2(i32x3 a) { // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i512>, align 256 // CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256 // CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVECN]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256 // CHECK-NEXT: [[LOADVECN1:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 // CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVECN1]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> @@ -80,7 +80,7 @@ i512x3 v3(i512x3 a) { // CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4 // CHECK-NEXT: 
[[LOADVECN:%.*]] = load <4 x i4>, ptr [[A]], align 4 // CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i4> [[LOADVECN]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i4> [[A1]], <3 x i4> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i4> [[A1]], <3 x i4> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: store <4 x i4> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4 // CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 4 // CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i4> [[LOADVECN2]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2> diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl new file mode 100644 index 0000000..1194f84 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ +// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV + +// Test basic lowering to runtime function call. + +// CHECK-LABEL: test_int +int test_int(int expr) { + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.min.i32([[TY]] %[[#]]) + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.min.i32([[TY]] %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveActiveMin(expr); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.min.i32([[TY]]) #[[#attr:]] +// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.min.i32([[TY]]) #[[#attr:]] + +// CHECK-LABEL: test_uint64_t +uint64_t test_uint64_t(uint64_t expr) { + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.umin.i64([[TY]] %[[#]]) + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.umin.i64([[TY]] %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveActiveMin(expr); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.umin.i64([[TY]]) #[[#attr:]] +// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.umin.i64([[TY]]) #[[#attr:]] + +// Test basic lowering to runtime function call with array and float value. 
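For readers unfamiliar with the wave intrinsic being tested: WaveActiveMin reduces the per-lane input to the minimum over all active lanes and returns that value to every lane. A scalar C++ reference model of that behavior, purely for illustration; the lane-value container and helper name are assumptions, not part of the patch:

#include <algorithm>
#include <cassert>
#include <vector>

// Models the reduction only; real wave execution happens per lane on the GPU.
float waveActiveMinReference(const std::vector<float> &activeLaneValues) {
  assert(!activeLaneValues.empty() &&
         "a wave op always has at least one active lane");
  return *std::min_element(activeLaneValues.begin(), activeLaneValues.end());
}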
+ +// CHECK-LABEL: test_floatv4 +float4 test_floatv4(float4 expr) { + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.min.v4f32([[TY1]] %[[#]] + // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.min.v4f32([[TY1]] %[[#]]) + // CHECK: ret [[TY1]] %[[RET1]] + return WaveActiveMin(expr); +} + +// CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.reduce.min.v4f32([[TY1]]) #[[#attr]] +// CHECK-SPIRV: declare [[TY1]] @llvm.spv.wave.reduce.min.v4f32([[TY1]]) #[[#attr]] + +// CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}} + diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index e76aa81..0017169 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -12,7 +12,7 @@ typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META10:![0-9]+]] !kernel_arg_type_qual [[META11:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 -// CHECK-NEXT: [[EXTRACTVEC1_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[EXTRACTVEC1_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> <float undef, float poison, float poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC1_I]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12:![0-9]+]] // CHECK-NEXT: ret void // @@ -24,7 +24,7 @@ void kernel foo(global float3 *a, global float3 *b) { // CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META11]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12]] -// CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> <float undef, float poison, float poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: store <4 x float> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 16, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // @@ -60,7 +60,7 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { // CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META18:![0-9]+]] !kernel_arg_type_qual [[META11]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[CHAR_TBAA12]] -// 
CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> <i16 undef, i16 poison, i16 poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: store <4 x i16> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 8, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // @@ -71,8 +71,8 @@ void kernel char8_to_short3(global short3 *a, global char8 *b) { // CHECK-LABEL: define dso_local spir_func void @from_char3( // CHECK-SAME: <3 x i8> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i8> [[TMP0]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] // CHECK-NEXT: ret void // void from_char3(char3 a, global int *out) { @@ -82,8 +82,8 @@ void from_char3(char3 a, global int *out) { // CHECK-LABEL: define dso_local spir_func void @from_short3( // CHECK-SAME: <3 x i16> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA19:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA19:![0-9]+]] // CHECK-NEXT: ret void // void from_short3(short3 a, global long *out) { @@ -94,7 +94,8 @@ void from_short3(short3 a, global long *out) { // CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[A]] to <4 x i8> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[ASTYPE]], <3 x i8> <i8 undef, i8 poison, i8 poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // @@ -106,7 +107,8 @@ void scalar_to_char3(int a, global char3 *out) { // CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16> -// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2> 
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[ASTYPE]], <3 x i16> <i16 undef, i16 poison, i16 poison>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // diff --git a/clang/include/clang/Driver/aarch64-mlr-for-calls-only.c b/clang/test/Driver/aarch64-mlr-for-calls-only.c index e71a4cd..e71a4cd 100644 --- a/clang/include/clang/Driver/aarch64-mlr-for-calls-only.c +++ b/clang/test/Driver/aarch64-mlr-for-calls-only.c diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c index d0fec18..99e5018 100644 --- a/clang/test/Driver/fuchsia.c +++ b/clang/test/Driver/fuchsia.c @@ -130,6 +130,11 @@ // RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ // RUN: -fuse-ld=ld \ // RUN: | FileCheck %s -check-prefix=CHECK-SAFESTACK +// RUN: %clang -### %s --target=x86_64-unknown-fuchsia -m32 \ +// RUN: -fsanitize=safe-stack 2>&1 \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: -fuse-ld=ld \ +// RUN: | FileCheck %s -check-prefix=CHECK-SAFESTACK // CHECK-SAFESTACK: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK-SAFESTACK: "-fsanitize=safe-stack" // CHECK-SAFESTACK-NOT: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}x86_64-unknown-fuchsia{{/|\\\\}}libclang_rt.safestack.a" diff --git a/clang/test/Frontend/diag-wrap-colors.cpp b/clang/test/Frontend/diag-wrap-colors.cpp new file mode 100644 index 0000000..e3dccb1 --- /dev/null +++ b/clang/test/Frontend/diag-wrap-colors.cpp @@ -0,0 +1,6 @@ +// RUN: not %clang_cc1 %s -fmessage-length=50 -fcolor-diagnostics -fno-show-source-location -o - 2>&1 | FileCheck %s + +struct F { + float a : 10; +}; +// CHECK: bit-field 'a' has non-integral type 'float' diff --git a/clang/test/Preprocessor/bpf-predefined-macros.c b/clang/test/Preprocessor/bpf-predefined-macros.c index cd8a2ec..a9ae8c5 100644 --- a/clang/test/Preprocessor/bpf-predefined-macros.c +++ b/clang/test/Preprocessor/bpf-predefined-macros.c @@ -70,6 +70,9 @@ int u; #ifdef __BPF_FEATURE_LOAD_ACQ_STORE_REL int v; #endif +#ifdef __BPF_FEATURE_GOTOX +int w; +#endif // CHECK: int b; // CHECK: int c; @@ -110,6 +113,7 @@ int v; // CPU_V4: int u; // CPU_V4: int v; +// CPU_V4: int w; // CPU_GENERIC: int g; diff --git a/clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl b/clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl new file mode 100644 index 0000000..3b12faf --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/WaveActiveMin.hlsl @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +int test_too_few_arg() { + return __builtin_hlsl_wave_active_min(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +float2 test_too_many_arg(float2 p0) { + return __builtin_hlsl_wave_active_min(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +bool test_expr_bool_type_check(bool p0) { + return __builtin_hlsl_wave_active_min(p0); + // expected-error@-1 {{invalid operand of type 'bool'}} +} + +bool2 test_expr_bool_vec_type_check(bool2 p0) { + return __builtin_hlsl_wave_active_min(p0); + // expected-error@-1 {{invalid operand of type 'bool2' (aka 'vector<bool, 2>')}} +} + +struct S { float f; }; + +S test_expr_struct_type_check(S p0) { + return __builtin_hlsl_wave_active_min(p0); + // expected-error@-1 {{invalid operand of type 'S' where a scalar or vector is required}} +} + diff 
--git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index ca99940..c046142 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1119,6 +1119,11 @@ TEST_F(TokenAnnotatorTest, UnderstandsOverloadedOperators) { EXPECT_TOKEN(Tokens[8], tok::amp, TT_PointerOrReference); EXPECT_TOKEN(Tokens[12], tok::amp, TT_PointerOrReference); + Tokens = annotate("::foo::bar& ::foo::bar::operator=(::foo::bar& other);"); + ASSERT_EQ(Tokens.size(), 22u) << Tokens; + EXPECT_TOKEN(Tokens[6], tok::identifier, TT_FunctionDeclarationName); + EXPECT_TOKEN(Tokens[17], tok::amp, TT_PointerOrReference); + Tokens = annotate("SomeLoooooooooooooooooType::Awaitable\n" "SomeLoooooooooooooooooType::operator co_await();"); ASSERT_EQ(Tokens.size(), 11u) << Tokens; @@ -3484,6 +3489,10 @@ TEST_F(TokenAnnotatorTest, StartOfName) { ASSERT_EQ(Tokens.size(), 8u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::identifier, TT_Unknown); // Not StartOfName + Tokens = annotate("int* ::foo::bar;"); + ASSERT_EQ(Tokens.size(), 8u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::identifier, TT_StartOfName); + auto Style = getLLVMStyle(); Style.StatementAttributeLikeMacros.push_back("emit"); Tokens = annotate("emit foo = 0;", Style); diff --git a/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp b/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp index 73b5967..d121292 100644 --- a/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp +++ b/compiler-rt/lib/nsan/tests/NSanUnitTest.cpp @@ -43,8 +43,8 @@ template <typename FT, auto next> void TestFT() { ASSERT_EQ(GetULPDiff<FT>(-X, -Y), 3); // Values with larger differences. - static constexpr const __uint128_t MantissaSize = - __uint128_t{1} << FTInfo<FT>::kMantissaBits; + static constexpr const __sanitizer::u64 MantissaSize = + __sanitizer::u64{1} << FTInfo<FT>::kMantissaBits; ASSERT_EQ(GetULPDiff<FT>(1.0, next(2.0, 1.0)), MantissaSize - 1); ASSERT_EQ(GetULPDiff<FT>(1.0, 2.0), MantissaSize); ASSERT_EQ(GetULPDiff<FT>(1.0, next(2.0, 3.0)), MantissaSize + 1); @@ -57,6 +57,11 @@ TEST(NSanTest, Double) { TestFT<double, static_cast<double (*)(double, double)>(nextafter)>(); } -TEST(NSanTest, Float128) { TestFT<__float128, nextafterf128>(); } +TEST(NSanTest, Float128) { + // Very basic tests. FIXME: improve when we have nextafter<__float128>. 
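The __float128 assertions below only sanity-check GetULPDiff at and around zero. For context, the quantity being tested is the ULP distance between two floating-point values, which for a fixed-width format can be computed by reinterpreting the values as sign-magnitude integers and mapping them onto a single monotonic integer line. A self-contained sketch for the 64-bit double case; this illustrates the idea and is not nsan's GetULPDiff implementation:

#include <cstdint>
#include <cstring>

static uint64_t toOrderedBits(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  // Map the sign-magnitude IEEE-754 ordering onto an unsigned ordering so
  // that adjacent representable doubles differ by exactly one.
  return (bits & (1ull << 63)) ? ~bits : (bits | (1ull << 63));
}

static uint64_t ulpDiff(double a, double b) {
  // Equal values, including +0.0 and -0.0, have distance 0, matching the
  // new __float128 assertions below.
  if (a == b)
    return 0;
  uint64_t ua = toOrderedBits(a), ub = toOrderedBits(b);
  return ua > ub ? ua - ub : ub - ua;
}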
+ ASSERT_EQ(GetULPDiff<__float128>(0.0, 0.0), 0); + ASSERT_EQ(GetULPDiff<__float128>(-0.0, 0.0), 0); + ASSERT_NE(GetULPDiff<__float128>(-0.01, 0.01), kMaxULPDiff); +} } // end namespace __nsan diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp index 62ab055..7fa5e01 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp @@ -259,7 +259,9 @@ void InitializePlatform() { ThreadEventCallbacks callbacks = { .create = ThreadCreateCallback, + .start = nullptr, .terminate = ThreadTerminateCallback, + .destroy = nullptr, }; InstallPthreadIntrospectionHook(callbacks); #endif diff --git a/compiler-rt/test/fuzzer/fuzzer-ubsan.test b/compiler-rt/test/fuzzer/fuzzer-ubsan.test index d22339d..6bc2c38 100644 --- a/compiler-rt/test/fuzzer/fuzzer-ubsan.test +++ b/compiler-rt/test/fuzzer/fuzzer-ubsan.test @@ -1,6 +1,3 @@ -// This test currently fails to compile on green.lab.llvm.org (arm) -// XFAIL: system-darwin && target=arm{{.*}} - RUN: %cpp_compiler -fsanitize=undefined -fno-sanitize-recover=all %S/SignedIntOverflowTest.cpp -o %t-SignedIntOverflowTest-Ubsan RUN: not %run %t-SignedIntOverflowTest-Ubsan 2>&1 | FileCheck %s CHECK: runtime error: signed integer overflow: 2147483647 + 1 cannot be represented in type 'int' diff --git a/compiler-rt/test/fuzzer/reduce_inputs.test b/compiler-rt/test/fuzzer/reduce_inputs.test index e65f572..d296fa4 100644 --- a/compiler-rt/test/fuzzer/reduce_inputs.test +++ b/compiler-rt/test/fuzzer/reduce_inputs.test @@ -12,5 +12,5 @@ RUN: %run %t-ShrinkControlFlowSimpleTest -runs=0 %t/C 2>&1 | FileCheck %s --chec COUNT: seed corpus: files: 4 # a bit longer test -RUN: %run %t-ShrinkControlFlowTest -exit_on_item=0eb8e4ed029b774d80f2b66408203801cb982a60 -seed=42 -runs=1000000 2>&1 | FileCheck %s +RUN: %run %t-ShrinkControlFlowTest -exit_on_item=0eb8e4ed029b774d80f2b66408203801cb982a60 -seed=42 -runs=10000000 2>&1 | FileCheck %s diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index c3cd119b..3407dd0 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -211,6 +211,8 @@ struct IntrinsicLibrary { mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genBesselJn(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genBesselYn(mlir::Type, @@ -459,7 +461,21 @@ struct IntrinsicLibrary { mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>); void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); 
+ void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genTransfer(mlir::Type, diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 553cbd5..bb97069 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -599,7 +599,7 @@ public: NODE(parser, OmpInitClause) NODE(OmpInitClause, Modifier) NODE(parser, OmpInitializerClause) - NODE(parser, OmpInitializerProc) + NODE(parser, OmpInitializerExpression) NODE(parser, OmpInReductionClause) NODE(OmpInReductionClause, Modifier) NODE(parser, OmpInteropPreference) @@ -677,6 +677,10 @@ public: NODE_ENUM(OmpSeverityClause, Severity) NODE(parser, OmpStepComplexModifier) NODE(parser, OmpStepSimpleModifier) + NODE(parser, OmpStylizedDeclaration) + NODE(parser, OmpStylizedExpression) + NODE(parser, OmpStylizedInstance) + NODE(OmpStylizedInstance, Instance) NODE(parser, OmpTaskDependenceType) NODE_ENUM(OmpTaskDependenceType, Value) NODE(parser, OmpTaskReductionClause) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index f761332..49db091 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -25,6 +25,13 @@ namespace Fortran::parser::omp { +template <typename T> constexpr auto addr_if(std::optional<T> &x) { + return x ? &*x : nullptr; +} +template <typename T> constexpr auto addr_if(const std::optional<T> &x) { + return x ? 
&*x : nullptr; +} + namespace detail { using D = llvm::omp::Directive; @@ -133,9 +140,24 @@ template <typename T> OmpDirectiveName GetOmpDirectiveName(const T &x) { } const OmpObjectList *GetOmpObjectList(const OmpClause &clause); + +template <typename T> +const T *GetFirstArgument(const OmpDirectiveSpecification &spec) { + for (const OmpArgument &arg : spec.Arguments().v) { + if (auto *t{std::get_if<T>(&arg.u)}) { + return t; + } + } + return nullptr; +} + const BlockConstruct *GetFortranBlockConstruct( const ExecutionPartConstruct &epc); +const OmpCombinerExpression *GetCombinerExpr( + const OmpReductionSpecifier &rspec); +const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init); + } // namespace Fortran::parser::omp #endif // FORTRAN_PARSER_OPENMP_UTILS_H diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 2cf6fae..c3a8c2e 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -24,7 +24,9 @@ #include "provenance.h" #include "flang/Common/idioms.h" #include "flang/Common/indirection.h" +#include "flang/Common/reference.h" #include "flang/Support/Fortran.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/Frontend/OpenACC/ACC.h.inc" #include "llvm/Frontend/OpenMP/OMP.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" @@ -3510,6 +3512,8 @@ struct OmpDirectiveName { // type-name list item struct OmpTypeName { + CharBlock source; + mutable const semantics::DeclTypeSpec *declTypeSpec{nullptr}; UNION_CLASS_BOILERPLATE(OmpTypeName); std::variant<TypeSpec, DeclarationTypeSpec> u; }; @@ -3538,6 +3542,39 @@ struct OmpObjectList { WRAPPER_CLASS_BOILERPLATE(OmpObjectList, std::list<OmpObject>); }; +struct OmpStylizedDeclaration { + COPY_AND_ASSIGN_BOILERPLATE(OmpStylizedDeclaration); + // Since "Reference" isn't handled by parse-tree-visitor, add EmptyTrait, + // and visit the members by hand when needed. + using EmptyTrait = std::true_type; + common::Reference<const OmpTypeName> type; + EntityDecl var; +}; + +struct OmpStylizedInstance { + struct Instance { + UNION_CLASS_BOILERPLATE(Instance); + std::variant<AssignmentStmt, CallStmt, common::Indirection<Expr>> u; + }; + TUPLE_CLASS_BOILERPLATE(OmpStylizedInstance); + std::tuple<std::list<OmpStylizedDeclaration>, Instance> t; +}; + +class ParseState; + +// Ref: [5.2:76], [6.0:185] +// +struct OmpStylizedExpression { + CharBlock source; + // Pointer to a temporary copy of the ParseState that is used to create + // additional parse subtrees for the stylized expression. This is only + // used internally during parsing and conveys no information to the + // consumers of the AST. 
+ const ParseState *state{nullptr}; + WRAPPER_CLASS_BOILERPLATE( + OmpStylizedExpression, std::list<OmpStylizedInstance>); +}; + // Ref: [4.5:201-207], [5.0:293-299], [5.1:325-331], [5.2:124] // // reduction-identifier -> @@ -3555,9 +3592,22 @@ struct OmpReductionIdentifier { // combiner-expression -> // since 4.5 // assignment-statement | // function-reference -struct OmpCombinerExpression { - UNION_CLASS_BOILERPLATE(OmpCombinerExpression); - std::variant<AssignmentStmt, FunctionReference> u; +struct OmpCombinerExpression : public OmpStylizedExpression { + INHERITED_WRAPPER_CLASS_BOILERPLATE( + OmpCombinerExpression, OmpStylizedExpression); + static llvm::ArrayRef<CharBlock> Variables(); +}; + +// Ref: [4.5:222:7-8], [5.0:305:28-29], [5.1:337:20-21], [5.2:127:6-8], +// [6.0:242:3-5] +// +// initializer-expression -> // since 4.5 +// OMP_PRIV = expression | +// subroutine-name(argument-list) +struct OmpInitializerExpression : public OmpStylizedExpression { + INHERITED_WRAPPER_CLASS_BOILERPLATE( + OmpInitializerExpression, OmpStylizedExpression); + static llvm::ArrayRef<CharBlock> Variables(); }; inline namespace arguments { @@ -4558,16 +4608,9 @@ struct OmpInReductionClause { std::tuple<MODIFIERS(), OmpObjectList> t; }; -// declare-reduction -> DECLARE REDUCTION (reduction-identifier : type-list -// : combiner) [initializer-clause] -struct OmpInitializerProc { - TUPLE_CLASS_BOILERPLATE(OmpInitializerProc); - std::tuple<ProcedureDesignator, std::list<ActualArgSpec>> t; -}; // Initialization for declare reduction construct struct OmpInitializerClause { - UNION_CLASS_BOILERPLATE(OmpInitializerClause); - std::variant<OmpInitializerProc, AssignmentStmt> u; + WRAPPER_CLASS_BOILERPLATE(OmpInitializerClause, OmpInitializerExpression); }; // Ref: [4.5:199-201], [5.0:288-290], [5.1:321-322], [5.2:115-117] diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index 04a0639..cb27d544 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -830,6 +830,8 @@ public: OmpUseDevicePtr, OmpUseDeviceAddr, OmpIsDevicePtr, OmpHasDeviceAddr, // OpenMP data-copying attribute OmpCopyIn, OmpCopyPrivate, + // OpenMP special variables + OmpInVar, OmpOrigVar, OmpOutVar, OmpPrivVar, // OpenMP miscellaneous flags OmpCommonBlock, OmpReduction, OmpInReduction, OmpAligned, OmpNontemporal, OmpAllocate, OmpDeclarativeAllocateDirective, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 39bac81..53fe9c0 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -50,6 +50,7 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -358,6 +359,14 @@ static constexpr IntrinsicHandler handlers[]{ &I::genBarrierInit, {{{"barrier", asAddr}, {"count", asValue}}}, /*isElemental=*/false}, + {"barrier_try_wait", + &I::genBarrierTryWait, + {{{"barrier", asAddr}, {"token", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait_sleep", + &I::genBarrierTryWaitSleep, + {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, + /*isElemental=*/false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -1036,10 +1045,87 @@ static constexpr IntrinsicHandler handlers[]{ {"dst", 
asAddr}, {"nbytes", asValue}}}, /*isElemental=*/false}, + {"tma_bulk_ldc4", + &I::genTMABulkLoadC4, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc8", + &I::genTMABulkLoadC8, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi4", + &I::genTMABulkLoadI4, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi8", + &I::genTMABulkLoadI8, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr2", + &I::genTMABulkLoadR2, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr4", + &I::genTMABulkLoadR4, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr8", + &I::genTMABulkLoadR8, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, {"tma_bulk_s2g", &I::genTMABulkS2G, {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, /*isElemental=*/false}, + {"tma_bulk_store_c4", + &I::genTMABulkStoreC4, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c8", + &I::genTMABulkStoreC8, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i4", + &I::genTMABulkStoreI4, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i8", + &I::genTMABulkStoreI8, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r2", + &I::genTMABulkStoreR2, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r4", + &I::genTMABulkStoreR4, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r8", + &I::genTMABulkStoreR8, + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, {"tma_bulk_wait_group", &I::genTMABulkWaitGroup, {{}}, @@ -3282,6 +3368,57 @@ void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); } +// BARRIER_TRY_WAIT (CUDA) +mlir::Value +IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); + fir::StoreOp::create(builder, loc, zero, res); + mlir::Value ns = + builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); + mlir::Value load = fir::LoadOp::create(builder, loc, res); + auto whileOp = mlir::scf::WhileOp::create( + builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); + mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); + mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(beforeBlock); + mlir::Value condition = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); + mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); + mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); + 
afterBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(afterBlock); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + mlir::Value ret = + mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], ns}, {}, + ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " + "selp.b32 %0, 1, 0, p;", + {}) + .getResult(0); + mlir::scf::YieldOp::create(builder, loc, ret); + builder.setInsertionPointAfter(whileOp); + return whileOp.getResult(0); +} + +// BARRIER_TRY_WAIT_SLEEP (CUDA) +mlir::Value +IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + return mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, + ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " + "selp.b32 %0, 1, 0, p;", + {}) + .getResult(0); +} + // BESSEL_JN fir::ExtendedValue IntrinsicLibrary::genBesselJn(mlir::Type resultType, @@ -9218,6 +9355,93 @@ void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); } +static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value barrier, mlir::Value src, + mlir::Value dst, mlir::Value nelem, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + barrier = builder.createConvert(loc, llvmPtrTy, barrier); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, + "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " + "[%1], %2, [%3];", + {}); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {barrier, size}, {}, + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); +} + +// TMA_BULK_LOADC4 +void IntrinsicLibrary::genTMABulkLoadC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADC8 +void IntrinsicLibrary::genTMABulkLoadC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI4 +void IntrinsicLibrary::genTMABulkLoadI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI8 +void IntrinsicLibrary::genTMABulkLoadI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + 
fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR2 +void IntrinsicLibrary::genTMABulkLoadR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR4 +void IntrinsicLibrary::genTMABulkLoadR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR8 +void IntrinsicLibrary::genTMABulkLoadR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + // TMA_BULK_S2G (CUDA) void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 3); @@ -9227,6 +9451,97 @@ void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { mlir::NVVM::NVVMMemorySpace::Global); mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( builder, loc, dst, src, fir::getBase(args[2]), {}, {}); + + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value src, mlir::Value dst, mlir::Value count, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); + src = convertPtrToNVVMSpace(builder, loc, src, + mlir::NVVM::NVVMMemorySpace::Shared); + dst = convertPtrToNVVMSpace(builder, loc, dst, + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, + size, {}, {}); + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +// TMA_BULK_STORE_C4 (CUDA) +void IntrinsicLibrary::genTMABulkStoreC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_C8 (CUDA) +void IntrinsicLibrary::genTMABulkStoreC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I4 (CUDA) +void IntrinsicLibrary::genTMABulkStoreI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I8 (CUDA) +void 
IntrinsicLibrary::genTMABulkStoreI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R2 (CUDA) +void IntrinsicLibrary::genTMABulkStoreR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R4 (CUDA) +void IntrinsicLibrary::genTMABulkStoreR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R8 (CUDA) +void IntrinsicLibrary::genTMABulkStoreR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); } // TMA_BULK_WAIT_GROUP (CUDA) diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index 0776346..8ca2869 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -143,7 +143,8 @@ public: llvm::SmallVector<mlir::Type> operandsTypes; for (auto arg : gpuLaunchFunc.getKernelOperands()) operandsTypes.push_back(arg.getType()); - auto fctTy = mlir::FunctionType::get(&context, operandsTypes, {}); + auto fctTy = mlir::FunctionType::get(&context, operandsTypes, + gpuLaunchFunc.getResultTypes()); if (!hasPortableSignature(fctTy, op)) convertCallOp(gpuLaunchFunc, fctTy); } else if (auto addr = mlir::dyn_cast<fir::AddrOfOp>(op)) { @@ -520,10 +521,14 @@ public: llvm::SmallVector<mlir::Value, 1> newCallResults; // TODO propagate/update call argument and result attributes. if constexpr (std::is_same_v<std::decay_t<A>, mlir::gpu::LaunchFuncOp>) { + mlir::Value asyncToken = callOp.getAsyncToken(); auto newCall = A::create(*rewriter, loc, callOp.getKernel(), callOp.getGridSizeOperandValues(), callOp.getBlockSizeOperandValues(), - callOp.getDynamicSharedMemorySize(), newOpers); + callOp.getDynamicSharedMemorySize(), newOpers, + asyncToken ? asyncToken.getType() : nullptr, + callOp.getAsyncDependencies(), + /*clusterSize=*/std::nullopt); if (callOp.getClusterSizeX()) newCall.getClusterSizeXMutable().assign(callOp.getClusterSizeX()); if (callOp.getClusterSizeY()) diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index d0164f3..4f97aca 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -4484,7 +4484,7 @@ void fir::IfOp::getSuccessorRegions( llvm::SmallVectorImpl<mlir::RegionSuccessor> ®ions) { // The `then` and the `else` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(mlir::RegionSuccessor(getResults())); + regions.push_back(mlir::RegionSuccessor(getOperation(), getResults())); return; } @@ -4494,7 +4494,8 @@ void fir::IfOp::getSuccessorRegions( // Don't consider the else region if it is empty. 
mlir::Region *elseRegion = &this->getElseRegion();
   if (elseRegion->empty())
-    regions.push_back(mlir::RegionSuccessor());
+    regions.push_back(
+        mlir::RegionSuccessor(getOperation(), getOperation()->getResults()));
   else
     regions.push_back(mlir::RegionSuccessor(elseRegion));
 }
@@ -4513,7 +4514,7 @@ void fir::IfOp::getEntrySuccessorRegions(
     if (!getElseRegion().empty())
       regions.emplace_back(&getElseRegion());
     else
-      regions.emplace_back(getResults());
+      regions.emplace_back(getOperation(), getOperation()->getResults());
   }
 }
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index d1e081c..4159d2e 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -275,6 +275,13 @@ struct SpecificModifierParser {
 
 // --- Iterator helpers -----------------------------------------------
 
+static EntityDecl MakeEntityDecl(ObjectName &&name) {
+  return EntityDecl(
+      /*ObjectName=*/std::move(name), std::optional<ArraySpec>{},
+      std::optional<CoarraySpec>{}, std::optional<CharLength>{},
+      std::optional<Initialization>{});
+}
+
 // [5.0:47:17-18] In an iterator-specifier, if the iterator-type is not
 // specified then the type of that iterator is default integer.
 // [5.0:49:14] The iterator-type must be an integer type.
@@ -282,11 +289,7 @@ static std::list<EntityDecl> makeEntityList(std::list<ObjectName> &&names) {
   std::list<EntityDecl> entities;
 
   for (auto iter = names.begin(), end = names.end(); iter != end; ++iter) {
-    EntityDecl entityDecl(
-        /*ObjectName=*/std::move(*iter), std::optional<ArraySpec>{},
-        std::optional<CoarraySpec>{}, std::optional<CharLength>{},
-        std::optional<Initialization>{});
-    entities.push_back(std::move(entityDecl));
+    entities.push_back(MakeEntityDecl(std::move(*iter)));
   }
   return entities;
 }
@@ -306,6 +309,217 @@ static TypeDeclarationStmt makeIterSpecDecl(std::list<ObjectName> &&names) {
       makeEntityList(std::move(names)));
 }
 
+// --- Stylized expression handling -----------------------------------
+
+// OpenMP has a concept of an "OpenMP stylized expression". Syntactically
+// it looks like a typical Fortran expression (or statement), except:
+// - the only variables allowed in it are OpenMP special variables; the
+//   exact set of these variables depends on the specific case of the
+//   stylized expression
+// - the special OpenMP variables present may assume one or more types,
+//   and the expression should be semantically valid for each type.
+//
+// The stylized expression can be thought of as a template, which will be
+// instantiated for each type provided somewhere in the context in which
+// the stylized expression appears.
+//
+// AST nodes:
+// - OmpStylizedExpression: contains the source string for the expression,
+//   plus the list of instances (OmpStylizedInstance).
+// - OmpStylizedInstance: corresponds to the instantiation of the stylized
+//   expression for a specific type. The way that the type is specified is
+//   by creating declarations (OmpStylizedDeclaration) for the special
+//   variables. Together with the AST tree corresponding to the stylized
+//   expression, the instantiation has enough information for semantic
+//   analysis. Each instance has its own scope, and the special variables
+//   have their own Symbols (local to the scope).
+// - OmpStylizedDeclaration: encapsulates the information that the visitors
+//   in resolve-names can use to "emulate" a declaration for a special
+//   variable and allow name resolution in the instantiation AST to work. 
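+//
+// As an illustration (hypothetical user code, not taken from any test in
+// this patch), consider:
+//
+//   !$omp declare reduction(add : integer(4), real(4) : omp_out = omp_out + omp_in)
+//
+// Here the combiner "omp_out = omp_out + omp_in" is a stylized expression
+// whose special variables are omp_in and omp_out. Since the type list names
+// two types, the combiner is instantiated twice: once with omp_in/omp_out
+// declared as INTEGER(4), and once with them declared as REAL(4); each
+// instance is analyzed in its own scope.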
+//
+// Implementation specifics:
+// The semantic analysis stores "evaluate::Expr" in each AST node rooted
+// in parser::Expr (in the typedExpr member). The evaluate::Expr is specific
+// to a given type, and so to allow different types for a given expression,
+// for each type a separate copy of the parser::Expr subtree is created.
+// Normally, AST nodes are non-copyable (copy-ctor is deleted), so to create
+// several copies of a subtree, the same source string is parsed several
+// times. The ParseState member in OmpStylizedExpression is the parser state
+// immediately before the stylized expression.
+//
+// Initially, when OmpStylizedExpression is first created, the expression is
+// parsed as if it were actual code, but this parsing is only done to
+// establish where the stylized expression ends (in the source). The source
+// and the initial parser state are stored in the object, and the instance
+// list is empty.
+// Once the parsing of the containing OmpDirectiveSpecification completes,
+// a post-processing "parser" (OmpStylizedInstanceCreator) executes. This
+// post-processor examines the directive specification to see if it expects
+// any stylized expressions to be contained in it, and then instantiates
+// them for each such directive.
+
+template <typename A> struct NeverParser {
+  using resultType = A;
+  std::optional<resultType> Parse(ParseState &state) const {
+    // Always fail, but without any messages.
+    return std::nullopt;
+  }
+};
+
+template <typename A> constexpr auto never() { return NeverParser<A>{}; }
+
+// Parser for optional<T> which always succeeds and returns std::nullopt.
+// It's only needed to produce "std::optional<CallStmt::Chevrons>" in
+// CallStmt.
+template <typename A, typename B = void> struct NullParser;
+template <typename B> struct NullParser<std::optional<B>> {
+  using resultType = std::optional<B>;
+  std::optional<resultType> Parse(ParseState &) const {
+    return resultType{std::nullopt};
+  }
+};
+
+template <typename A> constexpr auto null() { return NullParser<A>{}; }
+
+// OmpStylizedDeclaration and OmpStylizedInstance are helper classes, and
+// don't correspond to anything in the source. Their parsers should still
+// exist, but they should never be executed.
+TYPE_PARSER(construct<OmpStylizedDeclaration>(never<OmpStylizedDeclaration>()))
+TYPE_PARSER(construct<OmpStylizedInstance>(never<OmpStylizedInstance>()))
+
+TYPE_PARSER( //
+    construct<OmpStylizedInstance::Instance>(Parser<AssignmentStmt>{}) ||
+    construct<OmpStylizedInstance::Instance>(
+        sourced(construct<CallStmt>(Parser<ProcedureDesignator>{},
+            null<std::optional<CallStmt::Chevrons>>(),
+            parenthesized(optionalList(actualArgSpec))))) ||
+    construct<OmpStylizedInstance::Instance>(indirect(expr)))
+
+struct OmpStylizedExpressionParser {
+  using resultType = OmpStylizedExpression;
+
+  std::optional<resultType> Parse(ParseState &state) const {
+    auto *saved{new ParseState(state)};
+    auto getSource{verbatim(Parser<OmpStylizedInstance::Instance>{} >> ok)};
+    if (auto &&ok{getSource.Parse(state)}) {
+      OmpStylizedExpression result{std::list<OmpStylizedInstance>{}};
+      result.source = ok->source;
+      result.state = saved;
+      // result.v remains empty
+      return std::move(result);
+    }
+    delete saved;
+    return std::nullopt;
+  }
+};
+
+static void Instantiate(OmpStylizedExpression &ose,
+    llvm::ArrayRef<const OmpTypeName *> types, llvm::ArrayRef<CharBlock> vars) {
+  // 1. For each var in the vars list, declare it with the corresponding
+  //    type from types.
+  // 2. 
Run the parser to get the AST for the stylized expression. + // 3. Create OmpStylizedInstance and append it to the list in ose. + assert(types.size() == vars.size() && "List size mismatch"); + // A ParseState object is irreversibly modified during parsing (in + // particular, it cannot be rewound to an earlier position in the source). + // Because of that we need to create a local copy for each instantiation. + // If rewinding was possible, we could just use the current one, and we + // wouldn't need to save it in the AST node. + ParseState state{DEREF(ose.state)}; + + std::list<OmpStylizedDeclaration> decls; + for (auto [type, var] : llvm::zip_equal(types, vars)) { + decls.emplace_back(OmpStylizedDeclaration{ + common::Reference(*type), MakeEntityDecl(Name{var})}); + } + + if (auto &&instance{Parser<OmpStylizedInstance::Instance>{}.Parse(state)}) { + ose.v.emplace_back( + OmpStylizedInstance{std::move(decls), std::move(*instance)}); + } +} + +static void InstantiateForTypes(OmpStylizedExpression &ose, + const OmpTypeNameList &typeNames, llvm::ArrayRef<CharBlock> vars) { + // For each type in the type list, declare all variables in vars with + // that type, and complete the instantiation. + for (const OmpTypeName &t : typeNames.v) { + std::vector<const OmpTypeName *> types(vars.size(), &t); + Instantiate(ose, types, vars); + } +} + +static void InstantiateDeclareReduction(OmpDirectiveSpecification &spec) { + // There can be arguments/clauses that don't make sense, that analysis + // is left until semantic checks. Tolerate any unexpected stuff. + auto *rspec{GetFirstArgument<OmpReductionSpecifier>(spec)}; + if (!rspec) { + return; + } + + const OmpTypeNameList *typeNames{nullptr}; + + if (auto *cexpr{ + const_cast<OmpCombinerExpression *>(GetCombinerExpr(*rspec))}) { + typeNames = &std::get<OmpTypeNameList>(rspec->t); + + InstantiateForTypes(*cexpr, *typeNames, OmpCombinerExpression::Variables()); + delete cexpr->state; + cexpr->state = nullptr; + } else { + // If there are no types, there is nothing else to do. 
+ return; + } + + for (const OmpClause &clause : spec.Clauses().v) { + llvm::omp::Clause id{clause.Id()}; + if (id == llvm::omp::Clause::OMPC_initializer) { + if (auto *iexpr{const_cast<OmpInitializerExpression *>( + GetInitializerExpr(clause))}) { + InstantiateForTypes( + *iexpr, *typeNames, OmpInitializerExpression::Variables()); + delete iexpr->state; + iexpr->state = nullptr; + } + } + } +} + +static void InstantiateStylizedDirective(OmpDirectiveSpecification &spec) { + const OmpDirectiveName &dirName{spec.DirName()}; + if (dirName.v == llvm::omp::Directive::OMPD_declare_reduction) { + InstantiateDeclareReduction(spec); + } +} + +template <typename P, + typename = std::enable_if_t< + std::is_same_v<typename P::resultType, OmpDirectiveSpecification>>> +struct OmpStylizedInstanceCreator { + using resultType = OmpDirectiveSpecification; + constexpr OmpStylizedInstanceCreator(P p) : parser_(p) {} + + std::optional<resultType> Parse(ParseState &state) const { + if (auto &&spec{parser_.Parse(state)}) { + InstantiateStylizedDirective(*spec); + return std::move(spec); + } + return std::nullopt; + } + +private: + const P parser_; +}; + +template <typename P> +OmpStylizedInstanceCreator(P) -> OmpStylizedInstanceCreator<P>; + +// --- Parsers for types ---------------------------------------------- + +TYPE_PARSER( // + sourced(construct<OmpTypeName>(Parser<DeclarationTypeSpec>{})) || + sourced(construct<OmpTypeName>(Parser<TypeSpec>{}))) + // --- Parsers for arguments ------------------------------------------ // At the moment these are only directive arguments. This is needed for @@ -366,10 +580,6 @@ struct OmpArgumentListParser { } }; -TYPE_PARSER( // - construct<OmpTypeName>(Parser<DeclarationTypeSpec>{}) || - construct<OmpTypeName>(Parser<TypeSpec>{})) - // 2.15.3.6 REDUCTION (reduction-identifier: variable-name-list) TYPE_PARSER(construct<OmpReductionIdentifier>(Parser<DefinedOperator>{}) || construct<OmpReductionIdentifier>(Parser<ProcedureDesignator>{})) @@ -1065,7 +1275,8 @@ TYPE_PARSER(construct<OmpOtherwiseClause>( TYPE_PARSER(construct<OmpWhenClause>( maybe(nonemptyList(Parser<OmpWhenClause::Modifier>{}) / ":"), - maybe(indirect(Parser<OmpDirectiveSpecification>{})))) + maybe(indirect( + OmpStylizedInstanceCreator(Parser<OmpDirectiveSpecification>{}))))) // OMP 5.2 12.6.1 grainsize([ prescriptiveness :] scalar-integer-expression) TYPE_PARSER(construct<OmpGrainsizeClause>( @@ -1777,12 +1988,7 @@ TYPE_PARSER( Parser<OpenMPInteropConstruct>{})) / endOfLine) -TYPE_PARSER(construct<OmpInitializerProc>(Parser<ProcedureDesignator>{}, - parenthesized(many(maybe(","_tok) >> Parser<ActualArgSpec>{})))) - -TYPE_PARSER(construct<OmpInitializerClause>( - construct<OmpInitializerClause>(assignmentStmt) || - construct<OmpInitializerClause>(Parser<OmpInitializerProc>{}))) +TYPE_PARSER(construct<OmpInitializerClause>(Parser<OmpInitializerExpression>{})) // OpenMP 5.2: 7.5.4 Declare Variant directive TYPE_PARSER(sourced(construct<OmpDeclareVariantDirective>( @@ -1794,7 +2000,7 @@ TYPE_PARSER(sourced(construct<OmpDeclareVariantDirective>( TYPE_PARSER(sourced(construct<OpenMPDeclareReductionConstruct>( predicated(Parser<OmpDirectiveName>{}, IsDirective(llvm::omp::Directive::OMPD_declare_reduction)) >= - Parser<OmpDirectiveSpecification>{}))) + OmpStylizedInstanceCreator(Parser<OmpDirectiveSpecification>{})))) // 2.10.6 Declare Target Construct TYPE_PARSER(sourced(construct<OpenMPDeclareTargetConstruct>( @@ -1832,8 +2038,8 @@ TYPE_PARSER(sourced(construct<OpenMPDeclareMapperConstruct>( 
IsDirective(llvm::omp::Directive::OMPD_declare_mapper)) >=
         Parser<OmpDirectiveSpecification>{})))
 
-TYPE_PARSER(construct<OmpCombinerExpression>(Parser<AssignmentStmt>{}) ||
-    construct<OmpCombinerExpression>(Parser<FunctionReference>{}))
+TYPE_PARSER(construct<OmpCombinerExpression>(OmpStylizedExpressionParser{}))
+TYPE_PARSER(construct<OmpInitializerExpression>(OmpStylizedExpressionParser{}))
 
 TYPE_PARSER(sourced(construct<OpenMPCriticalConstruct>(
     OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical})))
diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp
index 937a17f..95ad3f6 100644
--- a/flang/lib/Parser/openmp-utils.cpp
+++ b/flang/lib/Parser/openmp-utils.cpp
@@ -74,4 +74,16 @@ const BlockConstruct *GetFortranBlockConstruct(
   return nullptr;
 }
 
+const OmpCombinerExpression *GetCombinerExpr(
+    const OmpReductionSpecifier &rspec) {
+  return addr_if(std::get<std::optional<OmpCombinerExpression>>(rspec.t));
+}
+
+const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init) {
+  if (auto *wrapped{std::get_if<OmpClause::Initializer>(&init.u)}) {
+    return &wrapped->v.v;
+  }
+  return nullptr;
+}
+
 } // namespace Fortran::parser::omp
diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp
index 8cbaa39..ad0016e 100644
--- a/flang/lib/Parser/parse-tree.cpp
+++ b/flang/lib/Parser/parse-tree.cpp
@@ -11,6 +11,7 @@
 #include "flang/Common/indirection.h"
 #include "flang/Parser/tools.h"
 #include "flang/Parser/user-state.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Frontend/OpenMP/OMP.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
@@ -430,4 +431,30 @@ const OmpClauseList &OmpDirectiveSpecification::Clauses() const {
   }
   return empty;
 }
+
+static bool InitCharBlocksFromStrings(llvm::MutableArrayRef<CharBlock> blocks,
+    llvm::ArrayRef<std::string> strings) {
+  for (auto [i, n] : llvm::enumerate(strings)) {
+    blocks[i] = CharBlock(n);
+  }
+  return true;
+}
+
+// The names should have static storage duration. Keep these names
+// in a single place. 
+llvm::ArrayRef<CharBlock> OmpCombinerExpression::Variables() { + static std::string names[]{"omp_in", "omp_out"}; + static CharBlock vars[std::size(names)]; + + [[maybe_unused]] static bool init = InitCharBlocksFromStrings(vars, names); + return vars; +} + +llvm::ArrayRef<CharBlock> OmpInitializerExpression::Variables() { + static std::string names[]{"omp_orig", "omp_priv"}; + static CharBlock vars[std::size(names)]; + + [[maybe_unused]] static bool init = InitCharBlocksFromStrings(vars, names); + return vars; +} } // namespace Fortran::parser diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 20a8d2a..9b38cfc 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2095,15 +2095,13 @@ public: // OpenMP Clauses & Directives void Unparse(const OmpArgumentList &x) { Walk(x.v, ", "); } + void Unparse(const OmpTypeNameList &x) { Walk(x.v, ", "); } void Unparse(const OmpBaseVariantNames &x) { Walk(std::get<0>(x.t)); // OmpObject Put(":"); Walk(std::get<1>(x.t)); // OmpObject } - void Unparse(const OmpTypeNameList &x) { // - Walk(x.v, ","); - } void Unparse(const OmpMapperSpecifier &x) { const auto &mapperName{std::get<std::string>(x.t)}; if (mapperName.find(llvm::omp::OmpDefaultMapperName) == std::string::npos) { @@ -2202,6 +2200,15 @@ public: unsigned ompVersion{langOpts_.OpenMPVersion}; Word(llvm::omp::getOpenMPDirectiveName(x.v, ompVersion)); } + void Unparse(const OmpStylizedDeclaration &x) { + // empty + } + void Unparse(const OmpStylizedExpression &x) { // + Put(x.source.ToString()); + } + void Unparse(const OmpStylizedInstance &x) { + // empty + } void Unparse(const OmpIteratorSpecifier &x) { Walk(std::get<TypeDeclarationStmt>(x.t)); Put(" = "); @@ -2511,29 +2518,11 @@ public: void Unparse(const OpenMPCriticalConstruct &x) { Unparse(static_cast<const OmpBlockConstruct &>(x)); } - void Unparse(const OmpInitializerProc &x) { - Walk(std::get<ProcedureDesignator>(x.t)); - Put("("); - Walk(std::get<std::list<ActualArgSpec>>(x.t)); - Put(")"); - } - void Unparse(const OmpInitializerClause &x) { - // Don't let the visitor go to the normal AssignmentStmt Unparse function, - // it adds an extra newline that we don't want. - if (const auto *assignment{std::get_if<AssignmentStmt>(&x.u)}) { - Walk(assignment->t, " = "); - } else { - Walk(x.u); - } + void Unparse(const OmpInitializerExpression &x) { + Unparse(static_cast<const OmpStylizedExpression &>(x)); } void Unparse(const OmpCombinerExpression &x) { - // Don't let the visitor go to the normal AssignmentStmt Unparse function, - // it adds an extra newline that we don't want. 
- if (const auto *assignment{std::get_if<AssignmentStmt>(&x.u)}) { - Walk(assignment->t, " = "); - } else { - Walk(x.u); - } + Unparse(static_cast<const OmpStylizedExpression &>(x)); } void Unparse(const OpenMPDeclareReductionConstruct &x) { BeginOpenMP(); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 196755e..628068f 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -26,6 +26,8 @@ #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" #include "flang/Support/Flags.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Frontend/OpenMP/OMP.h.inc" #include "llvm/Support/Debug.h" #include <list> @@ -453,6 +455,21 @@ public: return true; } + bool Pre(const parser::OmpStylizedDeclaration &x) { + static llvm::StringMap<Symbol::Flag> map{ + {"omp_in", Symbol::Flag::OmpInVar}, + {"omp_orig", Symbol::Flag::OmpOrigVar}, + {"omp_out", Symbol::Flag::OmpOutVar}, + {"omp_priv", Symbol::Flag::OmpPrivVar}, + }; + if (auto &name{std::get<parser::ObjectName>(x.var.t)}; name.symbol) { + if (auto found{map.find(name.ToString())}; found != map.end()) { + ResolveOmp(name, found->second, + const_cast<Scope &>(DEREF(name.symbol).owner())); + } + } + return false; + } bool Pre(const parser::OmpMetadirectiveDirective &x) { PushContext(x.v.source, llvm::omp::Directive::OMPD_metadirective); return true; diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 93faba7..0e6d4c7 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1605,6 +1605,12 @@ public: Post(static_cast<const parser::OmpDirectiveSpecification &>(x)); } + void Post(const parser::OmpTypeName &); + bool Pre(const parser::OmpStylizedDeclaration &); + void Post(const parser::OmpStylizedDeclaration &); + bool Pre(const parser::OmpStylizedInstance &); + void Post(const parser::OmpStylizedInstance &); + bool Pre(const parser::OpenMPDeclareMapperConstruct &x) { AddOmpSourceRange(x.source); return true; @@ -1615,18 +1621,6 @@ public: return true; } - bool Pre(const parser::OmpInitializerProc &x) { - auto &procDes = std::get<parser::ProcedureDesignator>(x.t); - auto &name = std::get<parser::Name>(procDes.u); - auto *symbol{FindSymbol(NonDerivedTypeScope(), name)}; - if (!symbol) { - context().Say(name.source, - "Implicit subroutine declaration '%s' in DECLARE REDUCTION"_err_en_US, - name.source); - } - return true; - } - bool Pre(const parser::OmpDeclareVariantDirective &x) { AddOmpSourceRange(x.source); return true; @@ -1772,14 +1766,6 @@ public: messageHandler().set_currStmtSource(std::nullopt); } - bool Pre(const parser::OmpTypeName &x) { - BeginDeclTypeSpec(); - return true; - } - void Post(const parser::OmpTypeName &x) { // - EndDeclTypeSpec(); - } - bool Pre(const parser::OpenMPConstruct &x) { // Indicate that the current directive is not a declarative one. 
declaratives_.push_back(nullptr);
@@ -1835,6 +1821,30 @@ void OmpVisitor::Post(const parser::OmpBlockConstruct &x) {
   }
 }
 
+void OmpVisitor::Post(const parser::OmpTypeName &x) {
+  x.declTypeSpec = GetDeclTypeSpec();
+}
+
+bool OmpVisitor::Pre(const parser::OmpStylizedDeclaration &x) {
+  BeginDecl();
+  Walk(x.type.get());
+  Walk(x.var);
+  return true;
+}
+
+void OmpVisitor::Post(const parser::OmpStylizedDeclaration &x) { //
+  EndDecl();
+}
+
+bool OmpVisitor::Pre(const parser::OmpStylizedInstance &x) {
+  PushScope(Scope::Kind::OtherConstruct, nullptr);
+  return true;
+}
+
+void OmpVisitor::Post(const parser::OmpStylizedInstance &x) { //
+  PopScope();
+}
+
 bool OmpVisitor::Pre(const parser::OmpMapClause &x) {
   auto &mods{OmpGetModifiers(x)};
   if (auto *mapper{OmpGetUniqueModifier<parser::OmpMapper>(mods)}) {
@@ -1969,51 +1979,20 @@ void OmpVisitor::ProcessReductionSpecifier(
     }
   }
 
-  auto &typeList{std::get<parser::OmpTypeNameList>(spec.t)};
-
-  // Create a temporary variable declaration for the four variables
-  // used in the reduction specifier and initializer (omp_out, omp_in,
-  // omp_priv and omp_orig), with the type in the typeList.
-  //
-  // In theory it would be possible to create only variables that are
-  // actually used, but that requires walking the entire parse-tree of the
-  // expressions, and finding the relevant variables [there may well be other
-  // variables involved too].
-  //
-  // This allows doing semantic analysis where the type is a derived type
-  // e.g omp_out%x = omp_out%x + omp_in%x.
-  //
-  // These need to be temporary (in their own scope). If they are created
-  // as variables in the outer scope, if there's more than one type in the
-  // typelist, duplicate symbols will be reported.
-  const parser::CharBlock ompVarNames[]{
-      {"omp_in", 6}, {"omp_out", 7}, {"omp_priv", 8}, {"omp_orig", 8}};
-
-  for (auto &t : typeList.v) {
-    PushScope(Scope::Kind::OtherConstruct, nullptr);
-    BeginDeclTypeSpec();
-    // We need to walk t.u because Walk(t) does it's own BeginDeclTypeSpec.
-    Walk(t.u);
+  reductionDetails->AddDecl(declaratives_.back());
 
-    // Only process types we can find. There will be an error later on when
-    // a type isn't found.
-    if (const DeclTypeSpec *typeSpec{GetDeclTypeSpec()}) {
-      reductionDetails->AddType(*typeSpec);
+  // Do not walk OmpTypeNameList. The types on the list will be visited
+  // during processing of OmpCombinerExpression.
+  Walk(std::get<std::optional<parser::OmpCombinerExpression>>(spec.t));
+  Walk(clauses);
 
-      for (auto &nm : ompVarNames) {
-        ObjectEntityDetails details{};
-        details.set_type(*typeSpec);
-        MakeSymbol(nm, Attrs{}, std::move(details));
-      }
+  for (auto &type : std::get<parser::OmpTypeNameList>(spec.t).v) {
+    // The declTypeSpec can be null if there is some semantic error.
+    if (type.declTypeSpec) {
+      reductionDetails->AddType(*type.declTypeSpec);
     }
-    EndDeclTypeSpec();
-    Walk(std::get<std::optional<parser::OmpCombinerExpression>>(spec.t));
-    Walk(clauses);
-    PopScope();
   }
 
-  reductionDetails->AddDecl(declaratives_.back());
-
   if (!symbol) {
     symbol = &MakeSymbol(mangledName, Attrs{}, std::move(*reductionDetails));
   }
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 5182950..59af58d 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -1998,6 +1998,18 @@ implicit none
 
 ! 
TMA Operations + interface barrier_arrive + attributes(device) function barrier_arrive(barrier) result(token) + integer(8), shared :: barrier + integer(8) :: token + end function + attributes(device) function barrier_arrive_cnt(barrier, count) result(token) + integer(8), shared :: barrier + integer(4), value :: count + integer(8) :: token + end function + end interface + interface attributes(device) subroutine barrier_init(barrier, count) integer(8), shared :: barrier @@ -2005,15 +2017,18 @@ implicit none end subroutine end interface - interface barrier_arrive - attributes(device) function barrier_arrive(barrier) result(token) + interface + attributes(device) integer function barrier_try_wait(barrier, token) integer(8), shared :: barrier - integer(8) :: token + integer(8), value :: token end function - attributes(device) function barrier_arrive_cnt(barrier, count) result(token) + end interface + + interface + attributes(device) integer function barrier_try_wait_sleep(barrier, token, ns) integer(8), shared :: barrier - integer(4), value :: count - integer(8) :: token + integer(8), value :: token + integer(4), value :: ns end function end interface @@ -2032,7 +2047,13 @@ implicit none end subroutine end interface + ! -------------------- + ! Bulk load functions + ! -------------------- + ! Generic load, count is in bytes + ! ------------------------------- + interface attributes(device) subroutine tma_bulk_g2s(barrier, src, dst, nbytes) !dir$ ignore_tkr src, dst @@ -2043,6 +2064,74 @@ implicit none end subroutine end interface + ! Load specific types, count is in elements + ! ----------------------------------------- + + interface tma_bulk_load + attributes(device) subroutine tma_bulk_ldc4(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + complex(4), device :: src(*) + complex(4), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldc8(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + complex(8), device :: src(*) + complex(8), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldi4(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + integer(4), device :: src(*) + integer(4), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldi8(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + integer(8), device :: src(*) + integer(8), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldr2(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + real(2), device :: src(*) + real(2), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldr4(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + real(4), device :: src(*) + real(4), shared :: dst(*) + integer(4), value :: nelems + end subroutine + + attributes(device) subroutine tma_bulk_ldr8(barrier, src, dst, nelems) + !dir$ ignore_tkr (r) src, (r) dst + integer(8), shared :: barrier + real(8), device :: src(*) + real(8), shared :: dst(*) + integer(4), value :: nelems + end subroutine + end interface + + ! -------------------- + ! Bulk Store functions + ! -------------------- + + ! Generic store, count is in bytes + ! 
--------------------------------
+
   interface
     attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes)
       !dir$ ignore_tkr src, dst
@@ -2052,6 +2141,60 @@ implicit none
     end subroutine
   end interface
 
+  ! Store specific types, count is in elements
+  ! ------------------------------------------
+
+  interface tma_bulk_store
+    attributes(device) subroutine tma_bulk_store_c4(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      complex(4), shared :: src(*)
+      complex(4), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_c8(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      complex(8), shared :: src(*)
+      complex(8), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_i4(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(4), shared :: src(*)
+      integer(4), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_i8(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      integer(8), shared :: src(*)
+      integer(8), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_r2(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      real(2), shared :: src(*)
+      real(2), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_r4(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      real(4), shared :: src(*)
+      real(4), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+
+    attributes(device) subroutine tma_bulk_store_r8(src, dst, nelems)
+      !dir$ ignore_tkr (r) src, (r) dst
+      real(8), shared :: src(*)
+      real(8), device :: dst(*)
+      integer(4), value :: nelems
+    end subroutine
+  end interface
+
 contains
 
   attributes(device) subroutine syncthreads()
diff --git a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
index 48fee10..5562e00 100644
--- a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
+++ b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
@@ -108,3 +108,23 @@ module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.k
   }
 }
 
+// -----
+
+module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  gpu.module @testmod {
+    gpu.func @_QPtest(%arg0: complex<f32>) -> () kernel {
+      gpu.return
+    }
+  }
+  func.func @main(%arg0: complex<f32>) {
+    %0 = llvm.mlir.constant(0 : i64) : i64
+    %1 = llvm.mlir.constant(0 : i32) : i32
+    %2 = fir.alloca i64
+    %3 = cuf.stream_cast %2 : !fir.ref<i64>
+    %4 = gpu.launch_func async [%3] @testmod::@_QPtest blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %1 args(%arg0 : complex<f32>) {cuf.proc_attr = #cuf.cuda_proc<global>}
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @main
+// CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @testmod::@_QPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 dynamic_shared_memory_size %{{.*}} args(%{{.*}} : !fir.vector<2:f32>) {cuf.proc_attr = #cuf.cuda_proc<global>}
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 5c4c3c6..8f35521 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -479,6 +479,8 @@ end subroutine
 
 ! 
CHECK-LABEL: func.func @_QPtest_bulk_s2g ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(device) subroutine testAtomicCasLoop(aa, n) integer :: a @@ -492,3 +494,250 @@ end subroutine ! CHECK: %[[CASTED_CMP_XCHG_EV:.*]] = fir.convert %[[CMP_XCHG_EV]] : (i1) -> i32 ! CHECK: %{{.*}} = arith.constant 1 : i32 ! CHECK: %19 = arith.cmpi eq, %[[CASTED_CMP_XCHG_EV]], %{{.*}} : i32 + +attributes(global) subroutine test_barrier_try_wait() + integer :: istat + integer(8), shared :: barrier1 + integer(8) :: token + istat = barrier_try_wait(barrier1, token) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() +! CHECK: scf.while +! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 + +attributes(global) subroutine test_barrier_try_wait_sleep() + integer :: istat + integer(8), shared :: barrier1 + integer(8) :: token + integer(4) :: sleep_time + istat = barrier_try_wait_sleep(barrier1, token, sleep_time) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() +! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 + +attributes(global) subroutine test_tma_bulk_load_c4(a, n) + integer(8), shared :: barrier1 + integer, value :: n + complex(4), device :: r8(n) + complex(4), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c4 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xcomplex<f32>>>, !fir.ref<complex<f32>>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_c8(a, n) + integer(8), shared :: barrier1 + integer, value :: n + complex(8), device :: r8(n) + complex(8), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c8 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! 
CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xcomplex<f64>>>, !fir.ref<complex<f64>>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_i4(a, n) + integer(8), shared :: barrier1 + integer, value :: n + integer(4), device :: r8(n) + integer(4), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i4 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<i32>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_i8(a, n) + integer(8), shared :: barrier1 + integer, value :: n + integer(8), device :: r8(n) + integer(8), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i8 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xi64>>, !fir.ref<i64>, i32, !llvm.ptr) +! 
CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_r2(a, n) + integer(8), shared :: barrier1 + integer, value :: n + real(2), device :: r8(n) + real(2), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r2 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r2Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r2Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xf16>>, !fir.ref<f16>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_r4(a, n) + integer(8), shared :: barrier1 + integer, value :: n + real(4), device :: r8(n) + real(4), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r4 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xf32>>, !fir.ref<f32>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_load_r8(a, n) + integer(8), shared :: barrier1 + integer, value :: n + real(8), device :: r8(n) + real(8), shared :: tmp(1024) + integer(4) :: j, elem_count + call tma_bulk_load(barrier1, r8(j), tmp, elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r8 +! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32> +! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32 +! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32 +! CHECK: %[[BARRIER_PTR:.*]] = fir.convert %[[BARRIER]]#0 : (!fir.ref<i64>) -> !llvm.ptr +! CHECK: nvvm.inline_ptx "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" ro(%{{.*}}, %{{.*}}, %[[SIZE]], %[[BARRIER_PTR]] : !fir.ref<!fir.array<1024xf64>>, !fir.ref<f64>, i32, !llvm.ptr) +! CHECK: nvvm.inline_ptx "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" ro(%[[BARRIER_PTR]], %[[SIZE]] : !llvm.ptr, i32) + +attributes(global) subroutine test_tma_bulk_store_c4(c, n) + integer, value :: n + complex(4), device :: c(n) + complex(4), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_c8(c, n) + integer, value :: n + complex(8), device :: c(n) + complex(8), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_i4(c, n) + integer, value :: n + integer(4), device :: c(n) + integer(4), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_i8(c, n) + integer, value :: n + integer(8), device :: c(n) + integer(8), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + + +attributes(global) subroutine test_tma_bulk_store_r2(c, n) + integer, value :: n + real(2), device :: c(n) + real(2), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_r4(c, n) + integer, value :: n + real(4), device :: c(n) + real(4), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! 
CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_tma_bulk_store_r8(c, n) + integer, value :: n + real(8), device :: c(n) + real(8), shared :: tmpa(1024) + integer(4) :: j, elem_count + call tma_bulk_store(tmpa, c(j), elem_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8 +! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.cp.async.bulk.wait_group 0 diff --git a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 b/flang/test/Parser/OpenMP/declare-reduction-multi.f90 index a682958..8856661 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-multi.f90 @@ -26,7 +26,8 @@ program omp_examples type(tt) :: values(n), sum, prod, big, small !$omp declare reduction(+:tt:omp_out%r = omp_out%r + omp_in%r) initializer(omp_priv%r = 0) -!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out%r = omp_out%r+omp_in%r) INITIALIZER(omp_priv%r = 0_4) +!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out%r = omp_out%r + omp_in%r) INITIALIZER(om& +!CHECK-NEXT: !$OMP&p_priv%r = 0) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -34,11 +35,39 @@ program omp_examples !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%r=omp_out%r+omp_in%r' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=0._4' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%r=omp_out%r+omp_in%r' +!PARSE-TREE: | | | | Variable = 'omp_out%r' +!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Name = 'r' +!PARSE-TREE: | | | | Expr = 'omp_out%r+omp_in%r' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'omp_out%r' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'r' +!PARSE-TREE: | | | | | | Expr = 'omp_in%r' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | Name = 'r' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=0._4' +!PARSE-TREE: | | | Variable = 'omp_priv%r' +!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | | Name = 'r' +!PARSE-TREE: | | | Expr = '0_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None !$omp declare reduction(*:tt:omp_out%r = omp_out%r * omp_in%r) initializer(omp_priv%r = 1) -!CHECK-NEXT: !$OMP DECLARE REDUCTION(*:tt: omp_out%r = 
omp_out%r*omp_in%r) INITIALIZER(omp_priv%r = 1_4) +!CHECK-NEXT: !$OMP DECLARE REDUCTION(*:tt: omp_out%r = omp_out%r * omp_in%r) INITIALIZER(om& +!CHECK-NEXT: !$OMP&p_priv%r = 1) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -46,11 +75,39 @@ program omp_examples !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Multiply !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%r=omp_out%r*omp_in%r' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=1._4' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%r=omp_out%r*omp_in%r' +!PARSE-TREE: | | | | Variable = 'omp_out%r' +!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Name = 'r' +!PARSE-TREE: | | | | Expr = 'omp_out%r*omp_in%r' +!PARSE-TREE: | | | | | Multiply +!PARSE-TREE: | | | | | | Expr = 'omp_out%r' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'r' +!PARSE-TREE: | | | | | | Expr = 'omp_in%r' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | Name = 'r' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=1._4' +!PARSE-TREE: | | | Variable = 'omp_priv%r' +!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | | Name = 'r' +!PARSE-TREE: | | | Expr = '1_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | Flags = None !$omp declare reduction(max:tt:omp_out = mymax(omp_out, omp_in)) initializer(omp_priv%r = 0) -!CHECK-NEXT: !$OMP DECLARE REDUCTION(max:tt: omp_out = mymax(omp_out,omp_in)) INITIALIZER(omp_priv%r = 0_4) +!CHECK-NEXT: !$OMP DECLARE REDUCTION(max:tt: omp_out = mymax(omp_out, omp_in)) INITIALIZER(& +!CHECK-NEXT: !$OMP&omp_priv%r = 0) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -58,11 +115,36 @@ program omp_examples !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'max' !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=mymax(omp_out,omp_in)' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=0._4' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | 
OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=mymax(omp_out,omp_in)' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'mymax(omp_out,omp_in)' +!PARSE-TREE: | | | | | FunctionReference -> Call +!PARSE-TREE: | | | | | | ProcedureDesignator -> Name = 'mymax' +!PARSE-TREE: | | | | | | ActualArgSpec +!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_out' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | ActualArgSpec +!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_in' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> Name = 'omp_in' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=0._4' +!PARSE-TREE: | | | Variable = 'omp_priv%r' +!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | | Name = 'r' +!PARSE-TREE: | | | Expr = '0_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None !$omp declare reduction(min:tt:omp_out%r = min(omp_out%r, omp_in%r)) initializer(omp_priv%r = 1) -!CHECK-NEXT: !$OMP DECLARE REDUCTION(min:tt: omp_out%r = min(omp_out%r,omp_in%r)) INITIALIZER(omp_priv%r = 1_4) +!CHECK-NEXT: !$OMP DECLARE REDUCTION(min:tt: omp_out%r = min(omp_out%r, omp_in%r)) INITIALI& +!CHECK-NEXT: !$OMP&ZER(omp_priv%r = 1) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -70,8 +152,38 @@ program omp_examples !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'min' !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%r=min(omp_out%r,omp_in%r)' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=1._4' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%r=min(omp_out%r,omp_in%r)' +!PARSE-TREE: | | | | Variable = 'omp_out%r' +!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Name = 'r' +!PARSE-TREE: | | | | Expr = 'min(omp_out%r,omp_in%r)' +!PARSE-TREE: | | | | | FunctionReference -> Call +!PARSE-TREE: | | | | | | ProcedureDesignator -> Name = 'min' +!PARSE-TREE: | | | | | | ActualArgSpec +!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_out%r' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | Name = 'r' +!PARSE-TREE: | | | | | | ActualArgSpec +!PARSE-TREE: | | | | | | | ActualArg -> Expr = 'omp_in%r' +!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | Name = 'r' +!PARSE-TREE: | OmpClauseList -> OmpClause -> 
Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv%r=1._4' +!PARSE-TREE: | | | Variable = 'omp_priv%r' +!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | | Name = 'r' +!PARSE-TREE: | | | Expr = '1_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | Flags = None call random_number(values%r) diff --git a/flang/test/Parser/OpenMP/declare-reduction-operator.f90 b/flang/test/Parser/OpenMP/declare-reduction-operator.f90 index e4d07c8..0d337c1 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-operator.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-operator.f90 @@ -16,7 +16,8 @@ subroutine reduce_1 ( n, tts ) type(tt) :: tts(n) type(tt2) :: tts2(n) -!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out = tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)) INITIALIZER(omp_priv = tt(x=0_4,y=0_4)) +!CHECK: !$OMP DECLARE REDUCTION(+:tt: omp_out = tt(omp_out%x - omp_in%x , omp_out%y - & +!CHECK: !$OMP&omp_in%y)) INITIALIZER(omp_priv = tt(0,0)) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -24,13 +25,60 @@ subroutine reduce_1 ( n, tts ) !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv=tt(x=0_4,y=0_4)' - +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' +!PARSE-TREE: | | | | | StructureConstructor +!PARSE-TREE: | | | | | | DerivedTypeSpec +!PARSE-TREE: | | | | | | | Name = 'tt' +!PARSE-TREE: | | | | | | ComponentSpec +!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%x-omp_in%x' +!PARSE-TREE: | | | | | | | | Subtract +!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%x' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | ComponentSpec +!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%y-omp_in%y' +!PARSE-TREE: | | | | | | | | Subtract +!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%y' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | 
| Name = 'y' +!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%y' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | | Name = 'y' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv=tt(x=0_4,y=0_4)' +!PARSE-TREE: | | | Variable = 'omp_priv' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | Expr = 'tt(x=0_4,y=0_4)' +!PARSE-TREE: | | | | StructureConstructor +!PARSE-TREE: | | | | | DerivedTypeSpec +!PARSE-TREE: | | | | | | Name = 'tt' +!PARSE-TREE: | | | | | ComponentSpec +!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | | | | | ComponentSpec +!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None !$omp declare reduction(+ : tt : omp_out = tt(omp_out%x - omp_in%x , omp_out%y - omp_in%y)) initializer(omp_priv = tt(0,0)) -!CHECK: !$OMP DECLARE REDUCTION(+:tt2: omp_out = tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)) INITIALIZER(omp_priv = tt2(x=0._8,y=0._8) +!CHECK: !$OMP DECLARE REDUCTION(+:tt2: omp_out = tt2(omp_out%x - omp_in%x , omp_out%y & +!CHECK: !$OMP&- omp_in%y)) INITIALIZER(omp_priv = tt2(0,0)) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -38,9 +86,55 @@ subroutine reduce_1 ( n, tts ) !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 'tt2' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv=tt2(x=0._8,y=0._8)' - +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)' +!PARSE-TREE: | | | | | StructureConstructor +!PARSE-TREE: | | | | | | DerivedTypeSpec +!PARSE-TREE: | | | | | | | Name = 'tt2' +!PARSE-TREE: | | | | | | ComponentSpec +!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%x-omp_in%x' +!PARSE-TREE: | | | | | | | | Subtract +!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%x' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | 
| | | ComponentSpec +!PARSE-TREE: | | | | | | | ComponentDataSource -> Expr = 'omp_out%y-omp_in%y' +!PARSE-TREE: | | | | | | | | Subtract +!PARSE-TREE: | | | | | | | | | Expr = 'omp_out%y' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | | Name = 'y' +!PARSE-TREE: | | | | | | | | | Expr = 'omp_in%y' +!PARSE-TREE: | | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | | Name = 'y' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv=tt2(x=0._8,y=0._8)' +!PARSE-TREE: | | | Variable = 'omp_priv' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | Expr = 'tt2(x=0._8,y=0._8)' +!PARSE-TREE: | | | | StructureConstructor +!PARSE-TREE: | | | | | DerivedTypeSpec +!PARSE-TREE: | | | | | | Name = 'tt2' +!PARSE-TREE: | | | | | ComponentSpec +!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | | | | | ComponentSpec +!PARSE-TREE: | | | | | | ComponentDataSource -> Expr = '0_4' +!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None !$omp declare reduction(+ :tt2 : omp_out = tt2(omp_out%x - omp_in%x , omp_out%y - omp_in%y)) initializer(omp_priv = tt2(0,0)) type(tt) :: diffp = tt( 0, 0 ) diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 index 455fc17..f026f15 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-unparse-with-symbols.f90 @@ -8,6 +8,6 @@ end !CHECK: !DEF: /f00 (Subroutine) Subprogram !CHECK: subroutine f00 -!CHECK: !$omp declare reduction(fred:integer,real: omp_out = omp_in+omp_out) +!CHECK: !$omp declare reduction(fred:integer, real: omp_out = omp_in + omp_out) !CHECK: end subroutine diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 index 73d7ccf..7897eb0 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 @@ -19,7 +19,8 @@ function func(x, n, init) end subroutine initme end interface !$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0)) -!CHECK: !$OMP DECLARE REDUCTION(red_add:INTEGER(KIND=4_4): omp_out = omp_out+omp_in) INITIALIZER(initme(omp_priv, 0_4)) +!CHECK: !$OMP DECLARE REDUCTION(red_add:INTEGER(KIND=4_4): omp_out=omp_out+omp_in) INITIA& +!CHECKL !$OMP&LIZER(initme(omp_priv,0)) !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare reduction @@ -27,9 +28,31 @@ function func(x, n, init) !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'red_add' !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> KindSelector -> Scalar -> Integer -> Constant -> Expr = 
'4_4' !PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '4' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=omp_out+omp_in' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerProc -!PARSE-TREE: | | ProcedureDesignator -> Name = 'initme' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=omp_out+omp_in' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'omp_out+omp_in' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'omp_out' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Expr = 'omp_in' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_in' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> CallStmt = 'CALL initme(omp_priv,0_4)' +!PARSE-TREE: | | | Call +!PARSE-TREE: | | | | ProcedureDesignator -> Name = 'initme' +!PARSE-TREE: | | | | ActualArgSpec +!PARSE-TREE: | | | | | ActualArg -> Expr = 'omp_priv' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | | ActualArgSpec +!PARSE-TREE: | | | | | ActualArg -> Expr = '0_4' +!PARSE-TREE: | | | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None res=init !$omp simd reduction(red_add:res) @@ -59,7 +82,8 @@ end function func !CHECK-LABEL: program main program main integer :: my_var -!CHECK: !$OMP DECLARE REDUCTION(my_add_red:INTEGER: omp_out = omp_out+omp_in) INITIALIZER(omp_priv = 0_4) +!CHECK: !$OMP DECLARE REDUCTION(my_add_red:INTEGER: omp_out = omp_out + omp_in) INITIA& +!CHECK: !$OMP&LIZER(omp_priv=0) !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) my_var = 0 @@ -74,5 +98,24 @@ end program main !PARSE-TREE: | OmpArgumentList -> OmpArgument -> OmpReductionSpecifier !PARSE-TREE: | | OmpReductionIdentifier -> ProcedureDesignator -> Name = 'my_add_red' !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out=omp_out+omp_in' -!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv=0_4' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out=omp_out+omp_in' +!PARSE-TREE: | | | | Variable = 'omp_out' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | Expr = 'omp_out+omp_in' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'omp_out' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Expr = 'omp_in' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'omp_in' +!PARSE-TREE: | OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> OmpInitializerExpression -> OmpStylizedInstance +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | OmpStylizedDeclaration +!PARSE-TREE: | | Instance -> AssignmentStmt = 'omp_priv=0_4' 
+!PARSE-TREE: | | | Variable = 'omp_priv' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'omp_priv' +!PARSE-TREE: | | | Expr = '0_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '0' +!PARSE-TREE: | Flags = None diff --git a/flang/test/Parser/OpenMP/metadirective-dirspec.f90 b/flang/test/Parser/OpenMP/metadirective-dirspec.f90 index c373001..b64ceb1 100644 --- a/flang/test/Parser/OpenMP/metadirective-dirspec.f90 +++ b/flang/test/Parser/OpenMP/metadirective-dirspec.f90 @@ -105,8 +105,8 @@ end !UNPARSE: TYPE :: tt2 !UNPARSE: REAL :: x !UNPARSE: END TYPE -!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE REDUCTION(+:tt1,tt2: omp_out%x = omp_in%x+omp_out%x)& -!UNPARSE: !$OMP&) +!UNPARSE: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: DECLARE REDUCTION(+:tt1, tt2: omp& +!UNPARSE: !$OMP&_out%x = omp_in%x + omp_out%x)) !UNPARSE: END SUBROUTINE !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpMetadirectiveDirective @@ -127,21 +127,44 @@ end !PARSE-TREE: | | | | | Name = 'tt1' !PARSE-TREE: | | | | OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | | | Name = 'tt2' -!PARSE-TREE: | | | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%x=omp_in%x+omp_out%x' -!PARSE-TREE: | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | DataRef -> Name = 'omp_out' -!PARSE-TREE: | | | | | | | Name = 'x' -!PARSE-TREE: | | | | | Expr = 'omp_in%x+omp_out%x' -!PARSE-TREE: | | | | | | Add -!PARSE-TREE: | | | | | | | Expr = 'omp_in%x' -!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_in' -!PARSE-TREE: | | | | | | | | | Name = 'x' -!PARSE-TREE: | | | | | | | Expr = 'omp_out%x' -!PARSE-TREE: | | | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | | | DataRef -> Name = 'omp_out' -!PARSE-TREE: | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | | | OmpStylizedDeclaration +!PARSE-TREE: | | | | | OmpStylizedDeclaration +!PARSE-TREE: | | | | | Instance -> AssignmentStmt = 'omp_out%x=omp_in%x+omp_out%x' +!PARSE-TREE: | | | | | | Variable = 'omp_out%x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | Expr = 'omp_in%x+omp_out%x' +!PARSE-TREE: | | | | | | | Add +!PARSE-TREE: | | | | | | | | Expr = 'omp_in%x' +!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | OmpStylizedInstance +!PARSE-TREE: | | | | | OmpStylizedDeclaration +!PARSE-TREE: | | | | | OmpStylizedDeclaration +!PARSE-TREE: | | | | | Instance -> AssignmentStmt = 'omp_out%x=omp_in%x+omp_out%x' +!PARSE-TREE: | | | | | | Variable = 'omp_out%x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | Expr = 'omp_in%x+omp_out%x' +!PARSE-TREE: | | | | | | | Add +!PARSE-TREE: | | | | | | | | Expr = 'omp_in%x' 
+!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | | | Name = 'x' !PARSE-TREE: | | | OmpClauseList -> +!PARSE-TREE: | | | Flags = None subroutine f04 !$omp metadirective when(user={condition(.true.)}: & diff --git a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 index 39e8f05..50a38c6 100644 --- a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 +++ b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 @@ -79,7 +79,7 @@ end !UNPARSE: TYPE :: t !UNPARSE: INTEGER :: x !UNPARSE: END TYPE -!UNPARSE: !$OMP DECLARE_REDUCTION(+:t: omp_out%x = omp_out%x+omp_in%x) +!UNPARSE: !$OMP DECLARE_REDUCTION(+:t: omp_out%x = omp_out%x + omp_in%x) !UNPARSE: END SUBROUTINE !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct -> OmpDirectiveSpecification @@ -88,21 +88,24 @@ end !PARSE-TREE: | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE: | | OmpTypeNameList -> OmpTypeName -> TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 't' -!PARSE-TREE: | | OmpCombinerExpression -> AssignmentStmt = 'omp_out%x=omp_out%x+omp_in%x' -!PARSE-TREE: | | | Variable = 'omp_out%x' -!PARSE-TREE: | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | DataRef -> Name = 'omp_out' -!PARSE-TREE: | | | | | Name = 'x' -!PARSE-TREE: | | | Expr = 'omp_out%x+omp_in%x' -!PARSE-TREE: | | | | Add -!PARSE-TREE: | | | | | Expr = 'omp_out%x' -!PARSE-TREE: | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | DataRef -> Name = 'omp_out' -!PARSE-TREE: | | | | | | | Name = 'x' -!PARSE-TREE: | | | | | Expr = 'omp_in%x' -!PARSE-TREE: | | | | | | Designator -> DataRef -> StructureComponent -!PARSE-TREE: | | | | | | | DataRef -> Name = 'omp_in' -!PARSE-TREE: | | | | | | | Name = 'x' +!PARSE-TREE: | | OmpCombinerExpression -> OmpStylizedInstance +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | OmpStylizedDeclaration +!PARSE-TREE: | | | Instance -> AssignmentStmt = 'omp_out%x=omp_out%x+omp_in%x' +!PARSE-TREE: | | | | Variable = 'omp_out%x' +!PARSE-TREE: | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | Name = 'x' +!PARSE-TREE: | | | | Expr = 'omp_out%x+omp_in%x' +!PARSE-TREE: | | | | | Add +!PARSE-TREE: | | | | | | Expr = 'omp_out%x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_out' +!PARSE-TREE: | | | | | | | | Name = 'x' +!PARSE-TREE: | | | | | | Expr = 'omp_in%x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> StructureComponent +!PARSE-TREE: | | | | | | | | DataRef -> Name = 'omp_in' +!PARSE-TREE: | | | | | | | | Name = 'x' !PARSE-TREE: | OmpClauseList -> !PARSE-TREE: | Flags = None diff --git a/flang/test/Semantics/OpenMP/declare-reduction-error.f90 b/flang/test/Semantics/OpenMP/declare-reduction-error.f90 deleted file mode 100644 index 21f5cc1..0000000 --- a/flang/test/Semantics/OpenMP/declare-reduction-error.f90 +++ /dev/null @@ -1,11 +0,0 @@ -! 
RUN: not %flang_fc1 -emit-obj -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s - -subroutine initme(x,n) - integer x,n - x=n -end subroutine initme - -subroutine subr - !$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0)) - !CHECK: error: Implicit subroutine declaration 'initme' in DECLARE REDUCTION -end subroutine subr diff --git a/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 b/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 index 000d323..89e0771 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 @@ -57,9 +57,10 @@ contains !CHECK: adder: UserReductionDetails TYPE(two) !CHECK OtherConstruct scope !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two) -!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two) -!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two) -!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two) +!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two) +!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two) !$omp simd reduction(adder:res) @@ -101,14 +102,16 @@ contains !CHECK: adder: UserReductionDetails TYPE(two) TYPE(three) !CHECK OtherConstruct scope !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two) -!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two) -!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two) -!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two) +!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two) +!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two) !CHECK OtherConstruct scope !CHECK: omp_in size=24 offset=0: ObjectEntity type: TYPE(three) -!CHECK: omp_orig size=24 offset=24: ObjectEntity type: TYPE(three) -!CHECK: omp_out size=24 offset=48: ObjectEntity type: TYPE(three) -!CHECK: omp_priv size=24 offset=72: ObjectEntity type: TYPE(three) +!CHECK: omp_out size=24 offset=24: ObjectEntity type: TYPE(three) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=24 offset=0: ObjectEntity type: TYPE(three) +!CHECK: omp_priv size=24 offset=24: ObjectEntity type: TYPE(three) !$omp simd reduction(adder:res3) do i=1,n @@ -135,9 +138,10 @@ contains !CHECK: op.+: UserReductionDetails TYPE(two) !CHECK OtherConstruct scope !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two) -!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two) -!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two) -!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two) +!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=8 offset=0: ObjectEntity type: TYPE(two) +!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two) !$omp simd reduction(+:res) @@ -163,14 +167,16 @@ contains !CHECK: op.+: UserReductionDetails TYPE(two) TYPE(three) !CHECK OtherConstruct scope !CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two) -!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two) -!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two) -!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two) +!CHECK: omp_out size=8 offset=8: ObjectEntity type: TYPE(two) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=8 offset=0: ObjectEntity 
type: TYPE(two) +!CHECK: omp_priv size=8 offset=8: ObjectEntity type: TYPE(two) !CHECK: OtherConstruct scope !CHECK: omp_in size=24 offset=0: ObjectEntity type: TYPE(three) -!CHECK: omp_orig size=24 offset=24: ObjectEntity type: TYPE(three) -!CHECK: omp_out size=24 offset=48: ObjectEntity type: TYPE(three) -!CHECK: omp_priv size=24 offset=72: ObjectEntity type: TYPE(three) +!CHECK: omp_out size=24 offset=24: ObjectEntity type: TYPE(three) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=24 offset=0: ObjectEntity type: TYPE(three) +!CHECK: omp_priv size=24 offset=24: ObjectEntity type: TYPE(three) !$omp simd reduction(+:res3) do i=1,n @@ -183,6 +189,7 @@ contains enddo res%t2 = res2 res%t3 = res3 + funcBtwothree = res end function funcBtwothree !! This is checking a special case, where a reduction is declared inside a @@ -191,11 +198,12 @@ contains pure logical function reduction() !CHECK: reduction size=4 offset=0: ObjectEntity funcResult type: LOGICAL(4) !CHECK: rr: UserReductionDetails INTEGER(4) -!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes !CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: INTEGER(4) +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: INTEGER(4) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: INTEGER(4) !$omp declare reduction (rr : integer : omp_out = omp_out + omp_in) initializer (omp_priv = 0) reduction = .false. 
end function reduction diff --git a/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 b/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 index 7ab7cad..87fcecd 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 @@ -18,9 +18,10 @@ contains !CHECK: op.AND: UserReductionDetails TYPE(logicalwrapper) !CHECK OtherConstruct scope !CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(logicalwrapper) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(logicalwrapper) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(logicalwrapper) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper) +!CHECK OtherConstruct scope +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: TYPE(logicalwrapper) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper) !$omp simd reduction(.AND.:res) do i=1,n diff --git a/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 b/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 index 0882de8..763179c 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 @@ -6,13 +6,13 @@ !type::t1 !integer(4)::val !endtype -!!$OMP DECLARE REDUCTION(*:t1:omp_out=omp_out*omp_in)INITIALIZER(omp_priv=& -!!$OMP&t1(1)) +!!$OMP DECLARE REDUCTION(*:t1: omp_out=omp_out*omp_in) INITIALIZER(omp_priv=t1(& +!!$OMP&1)) !!$OMP METADIRECTIVE OTHERWISE(DECLARE REDUCTION(+:INTEGER)) -!!$OMP DECLARE REDUCTION(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in)INITIALI& -!!$OMP&ZER(omp_priv=t1(0)) -!!$OMP DECLARE REDUCTION(.mul.:t1:omp_out=omp_out.mul.omp_in)INITIALIZER(om& -!!$OMP&p_priv=t1(1)) +!!$OMP DECLARE REDUCTION(.fluffy.:t1: omp_out=omp_out.fluffy.omp_in) INITIALIZE& +!!$OMP&R(omp_priv=t1(0)) +!!$OMP DECLARE REDUCTION(.mul.:t1: omp_out=omp_out.mul.omp_in) INITIALIZER(omp_& +!!$OMP&priv=t1(1)) !interface operator(.mul.) 
!procedure::mul !end interface diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 index dc12332..5fc4205 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 @@ -11,11 +11,9 @@ module m1 !$omp declare reduction(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in) !CHECK: op.fluffy., PUBLIC: UserReductionDetails TYPE(t1) !CHECK: t1, PUBLIC: DerivedType components: val -!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes !CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(t1) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(t1) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(t1) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(t1) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: TYPE(t1) contains function my_mul(x, y) type (t1), intent (in) :: x, y diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 index 84dbe1a..e0006bf 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 @@ -64,9 +64,10 @@ program test_vector !CHECK: OtherConstruct scope: !CHECK: omp_in size=12 offset=0: ObjectEntity type: TYPE(vector) -!CHECK: omp_orig size=12 offset=12: ObjectEntity type: TYPE(vector) -!CHECK: omp_out size=12 offset=24: ObjectEntity type: TYPE(vector) -!CHECK: omp_priv size=12 offset=36: ObjectEntity type: TYPE(vector) +!CHECK: omp_out size=12 offset=12: ObjectEntity type: TYPE(vector) +!CHECK: OtherConstruct scope: +!CHECK: omp_orig size=12 offset=0: ObjectEntity type: TYPE(vector) +!CHECK: omp_priv size=12 offset=12: ObjectEntity type: TYPE(vector) v2 = Vector(0.0, 0.0, 0.0) v1 = Vector(1.0, 2.0, 3.0) diff --git a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 index 9cd638d..115fe51 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 @@ -33,11 +33,12 @@ program test_omp_reduction !$omp declare reduction (.modmul. : t1 : omp_out = omp_out .modmul. 
omp_in) initializer(omp_priv = t1(1.0)) !CHECK: op.modmul.: UserReductionDetails TYPE(t1) !CHECK: t1: Use from t1 in module1 -!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes !CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(t1) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(t1) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(t1) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(t1) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: TYPE(t1) +!CHECK: OtherConstruct scope: size=8 alignment=4 sourceRange=0 bytes +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: TYPE(t1) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: TYPE(t1) result = t1(1.0) !$omp parallel do reduction(.modmul.:result) do i = 1, 10 diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90 index 1f39c57..c8dee5e 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction.f90 @@ -19,10 +19,12 @@ function func(x, n, init) !$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0)) !CHECK: red_add: UserReductionDetails !CHECK: Subprogram scope: initme +!CHECK: OtherConstruct scope: !CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: INTEGER(4) +!CHECK: OtherConstruct scope: +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: INTEGER(4) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: INTEGER(4) !$omp simd reduction(red_add:res) do i=1,n res=res+x(i) @@ -36,9 +38,11 @@ program main !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) !CHECK: my_add_red: UserReductionDetails +!CHECK: OtherConstruct scope: !CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4) -!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4) -!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4) -!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4) +!CHECK: omp_out size=4 offset=4: ObjectEntity type: INTEGER(4) +!CHECK: OtherConstruct scope: +!CHECK: omp_orig size=4 offset=0: ObjectEntity type: INTEGER(4) +!CHECK: omp_priv size=4 offset=4: ObjectEntity type: INTEGER(4) end program main diff --git a/libc/utils/hdrgen/hdrgen/header.py b/libc/utils/hdrgen/hdrgen/header.py index 2118db6..715d4b7 100644 --- a/libc/utils/hdrgen/hdrgen/header.py +++ b/libc/utils/hdrgen/hdrgen/header.py @@ -241,7 +241,5 @@ class HeaderFile: return { "name": self.name, "standards": self.standards, - "includes": [ - str(file) for file in sorted({COMMON_HEADER} | self.includes()) - ], + "includes": sorted(str(file) for file in {COMMON_HEADER} | self.includes()), } diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 25d33a9..980390c 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -76,8 +76,9 @@ Improvements and New Features - The ``std::{fill, fill_n}`` and ``std::ranges::{fill, fill_n}`` algorithms have been optimized for segmented iterators, resulting in a performance improvement of at least 10x for ``std::deque<int>`` iterators and ``std::join_view<std::vector<std::vector<int>>>`` 
iterators. -- The ``std::generate`` and ``std::generate_n`` algorithms have been optimized for segmented iterators, resulting in a - performance improvement for ``std::deque<short>`` and ``std::join_view<vector<vector<short>>>`` iterators. +- The ``std::{generate, generate_n}`` and ``std::ranges::generate_n`` algorithms have been optimized for segmented + iterators, resulting in a performance improvement for ``std::deque<short>`` and + ``std::join_view<vector<vector<short>>>`` iterators. Deprecations and Removals ------------------------- diff --git a/libcxx/include/__algorithm/generate_n.h b/libcxx/include/__algorithm/generate_n.h index e9da133..23899e4 100644 --- a/libcxx/include/__algorithm/generate_n.h +++ b/libcxx/include/__algorithm/generate_n.h @@ -13,22 +13,34 @@ #include <__config> #include <__functional/identity.h> #include <__utility/forward.h> +#include <__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template <class _OutputIterator, class _Size, class _Generator> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator -generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) { +__generate_n(_OutputIterator __first, _Size __orig_n, _Generator& __gen) { using __iter_ref = decltype(*__first); __identity __proj; auto __f = [&](__iter_ref __element) { std::forward<__iter_ref>(__element) = __gen(); }; - return std::__for_each_n(__first, __orig_n, __f, __proj); + return std::__for_each_n(std::move(__first), __orig_n, __f, __proj); +} + +template <class _OutputIterator, class _Size, class _Generator> +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) { + return std::__generate_n(std::move(__first), __orig_n, __gen); } _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_GENERATE_N_H diff --git a/libcxx/include/__algorithm/ranges_generate_n.h b/libcxx/include/__algorithm/ranges_generate_n.h index a318994..0cc9ce7 100644 --- a/libcxx/include/__algorithm/ranges_generate_n.h +++ b/libcxx/include/__algorithm/ranges_generate_n.h @@ -9,6 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H #define _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H +#include <__algorithm/generate_n.h> #include <__concepts/constructible.h> #include <__concepts/invocable.h> #include <__config> @@ -38,12 +39,7 @@ struct __generate_n { requires invocable<_Func&> && indirectly_writable<_OutIter, invoke_result_t<_Func&>> _LIBCPP_HIDE_FROM_ABI constexpr _OutIter operator()(_OutIter __first, iter_difference_t<_OutIter> __n, _Func __gen) const { - for (; __n > 0; --__n) { - *__first = __gen(); - ++__first; - } - - return __first; + return std::__generate_n(std::move(__first), __n, __gen); } }; diff --git a/libunwind/src/DwarfParser.hpp b/libunwind/src/DwarfParser.hpp index dbd7d65..2b04ae2 100644 --- a/libunwind/src/DwarfParser.hpp +++ b/libunwind/src/DwarfParser.hpp @@ -842,12 +842,10 @@ bool CFI_Parser<A>::parseFDEInstructions(A &addressSpace, results->savedRegisters[UNW_AARCH64_RA_SIGN_STATE].value ^ 0x3; results->setRegisterValue(UNW_AARCH64_RA_SIGN_STATE, value, initialState); - // When calculating the value of the PC, it is assumed that the CFI - // instruction is placed before the signing instruction, however it is - // placed after. 
Because of this, we need to take into account the CFI - // instruction is one instruction call later than expected, and reduce - // the PC value by 4 bytes to compensate. - results->ptrAuthDiversifier = fdeInfo.pcStart + codeOffset - 0x4; + // When using Feat_PAuthLR, the PC value needs to be captured so that + // during unwinding, the correct PC value is used for re-authentication. + // It is assumed that the CFI is placed before the signing instruction. + results->ptrAuthDiversifier = fdeInfo.pcStart + codeOffset; _LIBUNWIND_TRACE_DWARF( "DW_CFA_AARCH64_negate_ra_state_with_pc(pc=0x%" PRIx64 ")\n", static_cast<uint64_t>(results->ptrAuthDiversifier)); diff --git a/lld/test/wasm/lto/relocation-model.ll b/lld/test/wasm/lto/relocation-model.ll index 8fe198d..a042615 100644 --- a/lld/test/wasm/lto/relocation-model.ll +++ b/lld/test/wasm/lto/relocation-model.ll @@ -8,6 +8,11 @@ ; RUN: wasm-ld %t.o -o %t_static.wasm -save-temps -r -mllvm -relocation-model=static ; RUN: llvm-readobj -r %t_static.wasm.lto.o | FileCheck %s --check-prefix=STATIC +;; Linking with --unresolved-symbols=import-dynamic should also generate PIC +;; code for external references. +; RUN: wasm-ld %t.o -o %t_import.wasm -save-temps --experimental-pic --unresolved-symbols=import-dynamic +; RUN: llvm-readobj -r %t_import.wasm.lto.o | FileCheck %s --check-prefix=PIC + ; PIC: R_WASM_GLOBAL_INDEX_LEB foo ; STATIC: R_WASM_MEMORY_ADDR_LEB foo diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp index ae85f46..668cdf2 100644 --- a/lld/wasm/LTO.cpp +++ b/lld/wasm/LTO.cpp @@ -63,6 +63,12 @@ static lto::Config createConfig() { c.RelocModel = std::nullopt; else if (ctx.isPic) c.RelocModel = Reloc::PIC_; + else if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic) + // With ImportDynamic we also need to use the PIC relocation model so that + // external symbols are references via the GOT. + // TODO(sbc): This should probably be Reloc::DynamicNoPIC, but the backend + // doesn't currently support that. + c.RelocModel = Reloc::PIC_; else c.RelocModel = Reloc::Static; diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index 0db8c92..2eb1677 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -95,37 +95,31 @@ commands below. Windows ******* -* Visual Studio 2019. -* The latest Windows SDK. -* The Active Template Library (ATL). -* `GnuWin32 <http://gnuwin32.sourceforge.net/>`_ for CoreUtils and Make. -* `Python 3 <https://www.python.org/downloads/windows/>`_. Make sure to (1) get - the x64 variant if that's what you're targeting and (2) install the debug - library if you want to build a debug lldb. The standalone installer is the - easiest way to get the debug library. -* `Python Tools for Visual Studio - <https://github.com/Microsoft/PTVS/>`_. If you plan to debug test failures - or even write new tests at all, PTVS is an indispensable debugging - extension to VS that enables full editing and debugging support for Python - (including mixed native/managed debugging). -* `SWIG for Windows <http://www.swig.org/download.html>`_ - -The steps outlined here describes how to set up your system and install the -required dependencies such that they can be found when needed during the build -process. They only need to be performed once. - -#. Install Visual Studio with the "Desktop Development with C++" workload and - the "Python Development" workload. -#. Install GnuWin32, making sure ``<GnuWin32 install dir>\bin`` is added to - your PATH environment variable. 
Verify that utilities like ``dirname`` and - ``make`` are available from your terminal. -#. Install SWIG for Windows, making sure ``<SWIG install dir>`` is added to - your PATH environment variable. Verify that ``swig`` is available from your - terminal. -#. Install Python 3 from the standalone installer and include the debug libraries - in the install, making sure the Python install path is added to your PATH - environment variable. -#. Register the Debug Interface Access DLLs with the Registry from a privileged +The steps outlined here describe how to set up your system and install the +required dependencies for building and testing LLDB on Windows. They only need +to be performed once. + +Build Requirements +^^^^^^^^^^^^^^^^^^ + +Please follow the steps below if you only want to **build** lldb. + +1. Install `Visual Studio <https://visualstudio.microsoft.com>` with the + "Desktop Development with C++" workload. Make sure that the latest Windows + SDK and the Active Template Library (ATL) are installed. +2. Install `Git Bash <https://git-scm.com/install/windows>`_ and add + ``<Git install dir>\usr\bin`` to your ``PATH``. Verify that utilities like + ``dirname`` are available from your terminal. +3. Install `make <https://sourceforge.net/projects/ezwinports/files/>`_ and + verify that it's in your ``PATH``. +4. Install `Python 3 <https://www.python.org/downloads/windows/>`_ from the + GUI installer. If you will be building LLDB in Debug mode, **include the + debug libraries** during the install. Make sure ``python`` is added to your + ``PATH``. +5. Install `SWIG for Windows <http://www.swig.org/download.html>`_. Make sure + ``swig`` is added to your ``PATH`` and that ``swig -swiglib`` points to the + correct directory. +6. Register the Debug Interface Access DLLs with the Registry from a privileged terminal. :: @@ -139,6 +133,16 @@ Prompt for VS <https://docs.microsoft.com/en-us/visualstudio/ide/reference/comma corresponding to the version you wish to use or run ``vcvarsall.bat`` or ``VsDevCmd.bat``. +Test Requirements +^^^^^^^^^^^^^^^^^ + +Please follow the steps above and below if you want to **test** `lldb`. + +* Install `Python Tools for Visual Studio <https://github.com/Microsoft/PTVS/>`_, + an indispensable debugging extension to Visual Studio which enables full + editing and debugging support for Python (including mixed native/managed + debugging). + macOS ***** diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index e72ffd1..09939e2 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -386,7 +386,9 @@ ifeq (,$(filter 1, $(USE_LIBSTDCPP) $(USE_LIBCPP) $(USE_SYSTEM_STDLIB))) ifneq "$(LIBCPP_INCLUDE_TARGET_DIR)" "" CXXFLAGS += -cxx-isystem $(LIBCPP_INCLUDE_TARGET_DIR) endif - LDFLAGS += -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++ + + # If `-nostdlib++` is not passed, clang will link to the system's stdlib. + LDFLAGS += -nostdlib++ -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++ else USE_SYSTEM_STDLIB := 1 endif @@ -407,7 +409,8 @@ ifeq (1,$(USE_LIBCPP)) ifneq "$(LIBCPP_INCLUDE_TARGET_DIR)" "" CXXFLAGS += -cxx-isystem $(LIBCPP_INCLUDE_TARGET_DIR) endif - LDFLAGS += -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++ + # If `-nostdlib++` is not passed, clang will link to the system's stdlib. 
+ LDFLAGS += -nostdlib++ -L$(LIBCPP_LIBRARY_DIR) -Wl,-rpath,$(LIBCPP_LIBRARY_DIR) -lc++ else ifeq "$(OS)" "Android" # Nothing to do, this is already handled in diff --git a/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py b/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py index 13437d0..a73322c 100644 --- a/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py +++ b/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py @@ -24,7 +24,7 @@ class TestDriverWithClosedSTDIO(TestBase): # Windows doesn't have the fcntl module, so we can't run this # test there. - @skipIf(oslist=["windows"]) + @skipIf(hostoslist=["windows"]) def test_run_lldb_and_wait(self): """This test forks, closes the stdio channels and exec's lldb. Then it waits for it to exit and asserts it did that successfully""" diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 5ad6288..71055dd16 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -161,7 +161,7 @@ private: public: /// Default constructor creates empty storage (invalid state) - VocabStorage() : Sections(), TotalSize(0), Dimension(0) {} + VocabStorage() = default; /// Create a VocabStorage with pre-organized section data VocabStorage(std::vector<std::vector<Embedding>> &&SectionData); diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 76b6c8e..e8dbc96 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -594,12 +594,13 @@ public: // Check if suitable for a bit test if (N <= DL.getIndexSizeInBits(0u)) { - SmallPtrSet<const BasicBlock *, 4> Dests; - for (auto I : SI.cases()) - Dests.insert(I.getCaseSuccessor()); + DenseMap<const BasicBlock *, unsigned int> DestMap; + for (auto I : SI.cases()) { + const BasicBlock *BB = I.getCaseSuccessor(); + ++DestMap[BB]; + } - if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, - DL)) + if (TLI->isSuitableForBitTests(DestMap, MinCaseVal, MaxCaseVal, DL)) return 1; } diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index b0601eb..36cb90b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -640,7 +640,8 @@ public: /// This variant does not erase \p MI after calling the build function. void applyBuildFnNoErase(MachineInstr &MI, BuildFnTy &MatchInfo) const; - bool matchOrShiftToFunnelShift(MachineInstr &MI, BuildFnTy &MatchInfo) const; + bool matchOrShiftToFunnelShift(MachineInstr &MI, bool AllowScalarConstants, + BuildFnTy &MatchInfo) const; bool matchFunnelShiftToRotate(MachineInstr &MI) const; void applyFunnelShiftToRotate(MachineInstr &MI) const; bool matchRotateOutOfRange(MachineInstr &MI) const; diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h index a9e53ba..f980d3d 100644 --- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h +++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h @@ -84,6 +84,10 @@ LLVM_ABI Libcall getSINCOS(EVT RetVT); /// UNKNOWN_LIBCALL if there is none. LLVM_ABI Libcall getSINCOSPI(EVT RetVT); +/// Return the SINCOS_STRET_ value for the given types, or UNKNOWN_LIBCALL if +/// there is none. +LLVM_ABI Libcall getSINCOS_STRET(EVT RetVT); + /// getMODF - Return the MODF_* value for the given types, or /// UNKNOWN_LIBCALL if there is none. 
LLVM_ABI Libcall getMODF(EVT RetVT); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index d6ed3a8..1920b98 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1433,9 +1433,9 @@ public: /// \p High as its lowest and highest case values, and expects \p NumCmps /// case value comparisons. Check if the number of destinations, comparison /// metric, and range are all suitable. - bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, - const APInt &Low, const APInt &High, - const DataLayout &DL) const { + bool isSuitableForBitTests( + const DenseMap<const BasicBlock *, unsigned int> &DestCmps, + const APInt &Low, const APInt &High, const DataLayout &DL) const { // FIXME: I don't think NumCmps is the correct metric: a single case and a // range of cases both require only one branch to lower. Just looking at the // number of clusters and destinations should be enough to decide whether to @@ -1446,6 +1446,20 @@ public: if (!rangeFitsInWord(Low, High, DL)) return false; + unsigned NumDests = DestCmps.size(); + unsigned NumCmps = 0; + unsigned int MaxBitTestEntry = 0; + for (auto &DestCmp : DestCmps) { + NumCmps += DestCmp.second; + if (DestCmp.second > MaxBitTestEntry) + MaxBitTestEntry = DestCmp.second; + } + + // Comparisons might be cheaper for a small number of comparisons, which can + // be target specific. + if (MaxBitTestEntry < getMinimumBitTestCmps()) + return false; + // Decide whether it's profitable to lower this range with bit tests. Each // destination requires a bit test and branch, and there is an overall range // check branch. For a small number of clusters, separate comparisons might @@ -2055,6 +2069,9 @@ public: virtual bool isJumpTableRelative() const; + /// Return the minimum of the largest number of comparisons in a BitTest. + unsigned getMinimumBitTestCmps() const; + /// If a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. Register getStackPointerRegisterToSaveRestore() const { @@ -2577,6 +2594,9 @@ protected: /// Set to zero to generate unlimited jump tables. void setMaximumJumpTableSize(unsigned); + /// Set the minimum of the largest number of comparisons to generate a BitTest. + void setMinimumBitTestCmps(unsigned Val); + /// If set to a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. void setStackPointerRegisterToSaveRestore(Register R) { @@ -3719,6 +3739,9 @@ private: /// backend supports. unsigned MinCmpXchgSizeInBits; + /// The minimum of largest number of comparisons to use bit test for switch. + unsigned MinimumBitTestCmps; + /// This indicates if the target supports unaligned atomic operations. bool SupportsUnalignedAtomics; @@ -3738,7 +3761,7 @@ private: /// register class is the largest legal super-reg register class of the /// register class of the specified type. e.g. On x86, i8, i16, and i32's /// representative class would be GR32. - const TargetRegisterClass *RepRegClassForVT[MVT::VALUETYPE_SIZE] = {0}; + const TargetRegisterClass *RepRegClassForVT[MVT::VALUETYPE_SIZE] = {nullptr}; /// This indicates the "cost" of the "representative" register class for each /// ValueType.
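As a quick illustration of the new per-destination accounting in isSuitableForBitTests() (a hedged sketch, not part of the patch; dispatch, handleA and handleB are made-up names): each single-value case cluster contributes one comparison to its destination, so for the switch below DestCmps would hold 3 comparisons for the handleA block and 1 for the handleB block. MaxBitTestEntry is therefore 3, which clears the default -min-bit-test-cmps threshold of 2 (and PPC's default of 3), so the range stays eligible for bit-test lowering.

  int handleA(int);
  int handleB(int);

  int dispatch(int x) {
    switch (x) {
    case 0:              // 1 comparison -> handleA destination
    case 2:              // 1 comparison -> handleA destination
    case 4:              // 1 comparison -> handleA destination
      return handleA(x);
    case 7:              // 1 comparison -> handleB destination
      return handleB(x);
    default:
      return 0;
    }
  }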
The cost is used by the scheduler to approximate register diff --git a/llvm/include/llvm/IR/AbstractCallSite.h b/llvm/include/llvm/IR/AbstractCallSite.h index 9e24ae7..f431e1d 100644 --- a/llvm/include/llvm/IR/AbstractCallSite.h +++ b/llvm/include/llvm/IR/AbstractCallSite.h @@ -137,7 +137,7 @@ public: /// Return true if @p U is the use that defines the callee of this ACS. bool isCallee(const Use *U) const { - if (isDirectCall()) + if (!isCallbackCall()) return CB->isCallee(U); assert(!CI.ParameterEncoding.empty() && @@ -154,7 +154,7 @@ public: /// Return the number of parameters of the callee. unsigned getNumArgOperands() const { - if (isDirectCall()) + if (!isCallbackCall()) return CB->arg_size(); // Subtract 1 for the callee encoding. return CI.ParameterEncoding.size() - 1; @@ -169,7 +169,7 @@ public: /// Return the operand index of the underlying instruction associated with /// the function parameter number @p ArgNo or -1 if there is none. int getCallArgOperandNo(unsigned ArgNo) const { - if (isDirectCall()) + if (!isCallbackCall()) return ArgNo; // Add 1 for the callee encoding. return CI.ParameterEncoding[ArgNo + 1]; @@ -183,7 +183,7 @@ public: /// Return the operand of the underlying instruction associated with the /// function parameter number @p ArgNo or nullptr if there is none. Value *getCallArgOperand(unsigned ArgNo) const { - if (isDirectCall()) + if (!isCallbackCall()) return CB->getArgOperand(ArgNo); // Add 1 for the callee encoding. return CI.ParameterEncoding[ArgNo + 1] >= 0 @@ -210,7 +210,7 @@ public: /// Return the pointer to function that is being called. Value *getCalledOperand() const { - if (isDirectCall()) + if (!isCallbackCall()) return CB->getCalledOperand(); return CB->getArgOperand(getCallArgOperandNoForCallee()); } diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index dacda0a..972a253 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -2191,7 +2191,7 @@ public: FMFSource); } Value *CreatePtrToAddr(Value *V, const Twine &Name = "") { - return CreateCast(Instruction::PtrToInt, V, + return CreateCast(Instruction::PtrToAddr, V, BB->getDataLayout().getAddressType(V->getType()), Name); } Value *CreatePtrToInt(Value *V, Type *DestTy, diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 3b7077c..d6b8563 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -153,6 +153,8 @@ def int_dx_wave_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrCon def int_dx_wave_getlaneindex : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent, IntrNoMem]>; def int_dx_wave_reduce_max : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_dx_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; +def int_dx_wave_reduce_min : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; +def int_dx_wave_reduce_umin : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_dx_wave_reduce_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_dx_wave_reduce_usum : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td 
b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 49a182be..bc51fb6 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -122,6 +122,8 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty] def int_spv_wave_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>; def int_spv_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_spv_wave_reduce_max : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; + def int_spv_wave_reduce_min : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; + def int_spv_wave_reduce_umin : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_spv_wave_reduce_sum : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; @@ -136,7 +138,7 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty] def int_spv_sclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_spv_nclamp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - // Create resource handle given the binding information. Returns a + // Create resource handle given the binding information. Returns a // type appropriate for the kind of resource given the set id, binding id, // array size of the binding, as well as an index and an indicator // whether that index may be non-uniform. diff --git a/llvm/include/llvm/IR/Mangler.h b/llvm/include/llvm/IR/Mangler.h index 232101a..4d387ba 100644 --- a/llvm/include/llvm/IR/Mangler.h +++ b/llvm/include/llvm/IR/Mangler.h @@ -80,8 +80,7 @@ getArm64ECDemangledFunctionName(StringRef Name); /// Check if an ARM64EC function name is mangled. bool inline isArm64ECMangledFunctionName(StringRef Name) { - return Name[0] == '#' || - (Name[0] == '?' && Name.find("@$$h") != StringRef::npos); + return Name[0] == '#' || (Name[0] == '?' 
&& Name.contains("@$$h")); } } // End llvm namespace diff --git a/llvm/include/llvm/Support/FormattedStream.h b/llvm/include/llvm/Support/FormattedStream.h index 011a6ae..402cd3e 100644 --- a/llvm/include/llvm/Support/FormattedStream.h +++ b/llvm/include/llvm/Support/FormattedStream.h @@ -180,7 +180,8 @@ public: return *this; } - raw_ostream &changeColor(enum Colors Color, bool Bold, bool BG) override { + raw_ostream &changeColor(enum Colors Color, bool Bold = false, + bool BG = false) override { if (colors_enabled()) { DisableScanScope S(this); raw_ostream::changeColor(Color, Bold, BG); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 47d5d68..119695e 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1013,10 +1013,18 @@ def extract_vec_elt_combines : GICombineGroup<[ def funnel_shift_from_or_shift : GICombineRule< (defs root:$root, build_fn_matchinfo:$info), (match (wip_match_opcode G_OR):$root, - [{ return Helper.matchOrShiftToFunnelShift(*${root}, ${info}); }]), + [{ return Helper.matchOrShiftToFunnelShift(*${root}, false, ${info}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${info}); }]) >; +def funnel_shift_from_or_shift_constants_are_legal : GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_OR):$root, + [{ return Helper.matchOrShiftToFunnelShift(*${root}, true, ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }]) +>; + + def funnel_shift_to_rotate : GICombineRule< (defs root:$root), (match (wip_match_opcode G_FSHL, G_FSHR):$root, diff --git a/llvm/lib/Analysis/HeatUtils.cpp b/llvm/lib/Analysis/HeatUtils.cpp index a1cc707..08e9428 100644 --- a/llvm/lib/Analysis/HeatUtils.cpp +++ b/llvm/lib/Analysis/HeatUtils.cpp @@ -64,10 +64,7 @@ std::string llvm::getHeatColor(uint64_t Freq, uint64_t MaxFreq) { } std::string llvm::getHeatColor(double Percent) { - if (Percent > 1.0) - Percent = 1.0; - if (Percent < 0.0) - Percent = 0.0; + Percent = std::clamp(Percent, 0.0, 1.0); unsigned ColorID = unsigned(round(Percent * (HeatSize - 1.0))); return HeatPalette[ColorID]; } diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 1f10478..9ace7d6 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4425,6 +4425,7 @@ void CombinerHelper::applyBuildFnNoErase( } bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI, + bool AllowScalarConstants, BuildFnTy &MatchInfo) const { assert(MI.getOpcode() == TargetOpcode::G_OR); @@ -4444,31 +4445,29 @@ bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI, // Given constants C0 and C1 such that C0 + C1 is bit-width: // (or (shl x, C0), (lshr y, C1)) -> (fshl x, y, C0) or (fshr x, y, C1) - int64_t CstShlAmt, CstLShrAmt; + int64_t CstShlAmt = 0, CstLShrAmt; if (mi_match(ShlAmt, MRI, m_ICstOrSplat(CstShlAmt)) && mi_match(LShrAmt, MRI, m_ICstOrSplat(CstLShrAmt)) && CstShlAmt + CstLShrAmt == BitWidth) { FshOpc = TargetOpcode::G_FSHR; Amt = LShrAmt; - } else if (mi_match(LShrAmt, MRI, m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) && ShlAmt == Amt) { // (or (shl x, amt), (lshr y, (sub bw, amt))) -> (fshl x, y, amt) FshOpc = TargetOpcode::G_FSHL; - } else if (mi_match(ShlAmt, MRI, m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) && LShrAmt == Amt) { // (or (shl x, (sub bw, amt)), (lshr y, amt)) -> (fshr x, y, amt) FshOpc = 
TargetOpcode::G_FSHR; - } else { return false; } LLT AmtTy = MRI.getType(Amt); - if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}})) + if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}}) && + (!AllowScalarConstants || CstShlAmt == 0 || !Ty.isScalar())) return false; MatchInfo = [=](MachineIRBuilder &B) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 5fb7e63..431a810 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2400,10 +2400,11 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, Results.push_back(Rem); } -/// Return true if sincos libcall is available. +/// Return true if sincos or __sincos_stret libcall is available. static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { - RTLIB::Libcall LC = RTLIB::getSINCOS(Node->getSimpleValueType(0).SimpleTy); - return TLI.getLibcallName(LC) != nullptr; + MVT::SimpleValueType VT = Node->getSimpleValueType(0).SimpleTy; + return TLI.getLibcallImpl(RTLIB::getSINCOS(VT)) != RTLIB::Unsupported || + TLI.getLibcallImpl(RTLIB::getSINCOS_STRET(VT)) != RTLIB::Unsupported; } /// Only issue sincos libcall if both sin and cos are needed. @@ -3752,9 +3753,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT VT = Node->getValueType(0); // Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin / // fcos which share the same operand and both are used. - if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) || - isSinCosLibcallAvailable(Node, TLI)) - && useSinCos(Node)) { + if ((TLI.isOperationLegal(ISD::FSINCOS, VT) || + isSinCosLibcallAvailable(Node, TLI)) && + useSinCos(Node)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0)); if (Node->getOpcode() == ISD::FCOS) diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp index 038c499..3fa8243 100644 --- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp @@ -198,7 +198,6 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters, assert(First <= Last); auto Prob = BranchProbability::getZero(); - unsigned NumCmps = 0; std::vector<MachineBasicBlock*> Table; DenseMap<MachineBasicBlock*, BranchProbability> JTProbs; @@ -206,12 +205,16 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters, for (unsigned I = First; I <= Last; ++I) JTProbs[Clusters[I].MBB] = BranchProbability::getZero(); + DenseMap<const BasicBlock *, unsigned int> DestMap; for (unsigned I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); Prob += Clusters[I].Prob; const APInt &Low = Clusters[I].Low->getValue(); const APInt &High = Clusters[I].High->getValue(); - NumCmps += (Low == High) ? 1 : 2; + unsigned int NumCmp = (Low == High) ? 1 : 2; + const BasicBlock *BB = Clusters[I].MBB->getBasicBlock(); + DestMap[BB] += NumCmp; + if (I != First) { // Fill the gap between this and the previous cluster. 
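A minimal C++ sketch of the source pattern behind the matchOrShiftToFunnelShift() change above (illustration only, assuming 32-bit operands; funnel is a made-up name): the constant shift amounts sum to the bit width, so the or/shl/lshr triple is exactly a funnel shift. The new AllowScalarConstants flag lets the combiner form it for scalar constant amounts even when G_FSHL/G_FSHR is not otherwise considered legal, which is how the AArch64 post-legalizer combiner uses the funnel_shift_from_or_shift_constants_are_legal rule.

  #include <cstdint>

  // (x << 11) | (y >> 21): 11 + 21 == 32, so this is fshl(x, y, 11),
  // equivalently fshr(x, y, 21).
  uint32_t funnel(uint32_t x, uint32_t y) {
    return (x << 11) | (y >> 21);
  }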
const APInt &PreviousHigh = Clusters[I - 1].High->getValue(); @@ -226,9 +229,7 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters, JTProbs[Clusters[I].MBB] += Clusters[I].Prob; } - unsigned NumDests = JTProbs.size(); - if (TLI->isSuitableForBitTests(NumDests, NumCmps, - Clusters[First].Low->getValue(), + if (TLI->isSuitableForBitTests(DestMap, Clusters[First].Low->getValue(), Clusters[Last].High->getValue(), *DL)) { // Clusters[First..Last] should be lowered as bit tests instead. return false; @@ -372,20 +373,19 @@ bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters, if (First == Last) return false; - BitVector Dests(FuncInfo.MF->getNumBlockIDs()); - unsigned NumCmps = 0; + DenseMap<const BasicBlock *, unsigned int> DestMap; for (int64_t I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); - Dests.set(Clusters[I].MBB->getNumber()); - NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2; + unsigned NumCmp = (Clusters[I].Low == Clusters[I].High) ? 1 : 2; + const BasicBlock *BB = Clusters[I].MBB->getBasicBlock(); + DestMap[BB] += NumCmp; } - unsigned NumDests = Dests.count(); APInt Low = Clusters[First].Low->getValue(); APInt High = Clusters[Last].High->getValue(); assert(Low.slt(High)); - if (!TLI->isSuitableForBitTests(NumDests, NumCmps, Low, High, *DL)) + if (!TLI->isSuitableForBitTests(DestMap, Low, High, *DL)) return false; APInt LowBound; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 59798b3..b3535eac 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -90,6 +91,11 @@ static cl::opt<unsigned> OptsizeJumpTableDensity( cl::desc("Minimum density for building a jump table in " "an optsize function")); +static cl::opt<unsigned> MinimumBitTestCmpsOverride( + "min-bit-test-cmps", cl::init(2), cl::Hidden, + cl::desc("Set minimum of largest number of comparisons " + "to use bit test for switch.")); + // FIXME: This option is only to test if the strict fp operation processed // correctly by preventing mutating strict fp operation to normal fp operation // during development. 
When the backend supports strict float operation, this @@ -428,6 +434,11 @@ RTLIB::Libcall RTLIB::getSINCOSPI(EVT RetVT) { SINCOSPI_F128, SINCOSPI_PPCF128); } +RTLIB::Libcall RTLIB::getSINCOS_STRET(EVT RetVT) { + return getFPLibCall(RetVT, SINCOS_STRET_F32, SINCOS_STRET_F64, + UNKNOWN_LIBCALL, UNKNOWN_LIBCALL, UNKNOWN_LIBCALL); +} + RTLIB::Libcall RTLIB::getMODF(EVT RetVT) { return getFPLibCall(RetVT, MODF_F32, MODF_F64, MODF_F80, MODF_F128, MODF_PPCF128); @@ -719,6 +730,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) MinCmpXchgSizeInBits = 0; SupportsUnalignedAtomics = false; + + MinimumBitTestCmps = MinimumBitTestCmpsOverride; } // Define the virtual destructor out-of-line to act as a key method to anchor @@ -2129,6 +2142,14 @@ bool TargetLoweringBase::isJumpTableRelative() const { return getTargetMachine().isPositionIndependent(); } +unsigned TargetLoweringBase::getMinimumBitTestCmps() const { + return MinimumBitTestCmps; +} + +void TargetLoweringBase::setMinimumBitTestCmps(unsigned Val) { + MinimumBitTestCmps = Val; +} + Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const { if (TM.Options.LoopAlignment) return Align(TM.Options.LoopAlignment); diff --git a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp index 93ff3b9..d87cb4d 100644 --- a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp +++ b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp @@ -552,7 +552,7 @@ llvm::Error GsymCreator::saveSegments(StringRef Path, createSegment(SegmentSize, FuncIdx); if (ExpectedGC) { GsymCreator *GC = ExpectedGC->get(); - if (GC == NULL) + if (!GC) break; // We had not more functions to encode. // Don't collect any messages at all OutputAggregator Out(nullptr); diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp index 1a61d31..e609a7d 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp @@ -55,7 +55,7 @@ struct PerfState { std::unique_ptr<raw_fd_ostream> Dumpstream; // perf mmap marker - void *MarkerAddr = NULL; + void *MarkerAddr = nullptr; }; // prevent concurrent dumps from messing up the output file diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index 70ac68a..fb6ff62 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -443,7 +443,7 @@ Argument *Context::getOrCreateArgument(llvm::Argument *LLVMArg) { } Constant *Context::getOrCreateConstant(llvm::Constant *LLVMC) { - return cast<Constant>(getOrCreateValueInternal(LLVMC, 0)); + return cast<Constant>(getOrCreateValueInternal(LLVMC, nullptr)); } BasicBlock *Context::createBasicBlock(llvm::BasicBlock *LLVMBB) { diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index b3ec65c..2783147 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -366,6 +366,7 @@ def AArch64PostLegalizerCombiner select_to_minmax, or_to_bsp, combine_concat_vector, commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, - combine_mul_cmlt, combine_use_vector_truncate, - extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> { + combine_mul_cmlt, combine_use_vector_truncate, + extmultomull, truncsat_combines, lshr_of_trunc_of_lshr, + funnel_shift_from_or_shift_constants_are_legal]> { } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 
b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d16b116..60aa61e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9028,11 +9028,12 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( CallingConv::ID CallerCC = CallerF.getCallingConv(); // SME Streaming functions are not eligible for TCO as they may require - // the streaming mode or ZA to be restored after returning from the call. + // the streaming mode or ZA/ZT0 to be restored after returning from the call. SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState() || + CallAttrs.requiresPreservingZT0() || CallAttrs.caller().hasStreamingBody()) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ce2b4a5..cd8b249 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,9 +562,13 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &); +extern char &AMDGPUUniformIntrinsicCombineLegacyPassID; +FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(); + struct AMDGPUUniformIntrinsicCombinePass : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 0eb00cb..529da8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); struct ImageDimIntrinsicInfo { unsigned Intr; unsigned BaseOpcode; + unsigned AtomicNoRetBaseOpcode; MIMGDim Dim; uint8_t NumOffsetArgs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 97c2c9c..9ce1224 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2006,19 +2006,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned IntrOpcode = Intr->BaseOpcode; + + // For image atomic: use no-return opcode if result is unused. 
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) { + Register ResultDef = MI.getOperand(0).getReg(); + if (MRI->use_nodbg_empty(ResultDef)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; + } const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; - Register VDataIn, VDataOut; + Register VDataIn = AMDGPU::NoRegister; + Register VDataOut = AMDGPU::NoRegister; LLT VDataTy; int NumVDataDwords = -1; bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || @@ -2049,7 +2057,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { - VDataOut = MI.getOperand(0).getReg(); + if (!BaseOpcode->NoReturn) + VDataOut = MI.getOperand(0).getReg(); VDataIn = MI.getOperand(2).getReg(); LLT Ty = MRI->getType(VDataIn); @@ -2099,8 +2108,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a6074ea..bf6f1a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS @@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this))) +FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6214f4d..75a94ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -619,6 +619,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); + initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -887,9 +888,6 
@@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); - - if (EnableUniformIntrinsicCombine) - PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( @@ -900,6 +898,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify) FPM.addPass(AMDGPUSimplifyLibCallsPass()); + + if (EnableUniformIntrinsicCombine) + FPM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerCGSCCOptimizerLateEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 50c78d8..65e6ed9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -16,12 +16,6 @@ /// uniformity. And every instruction that's downstream and cares about dynamic /// uniformity must be convergent (and isel will introduce v_readfirstlane for /// them if their operands can't be proven statically uniform). -/// -/// This pass is implemented as a ModulePass because intrinsic declarations -/// exist at the module scope, allowing us to skip processing entirely if no -/// declarations are present and to traverse their user lists directly when -/// they are. A FunctionPass would instead require scanning every instruction -/// in every function to find relevant intrinsics, which is far less efficient. //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, Tracker[NotOp] = true; // NOT preserves uniformity LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); ICmp->replaceAllUsesWith(NotOp); - ICmp->eraseFromParent(); Changed = true; } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { // Case: (icmp ne %ballot, 0) -> %ballot_arg LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " << *Src << '\n'); ICmp->replaceAllUsesWith(Src); - ICmp->eraseFromParent(); Changed = true; } } @@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return false; } -/// Iterates over intrinsic declarations in the module to optimize their uses. -static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { +/// Iterates over intrinsic calls in the Function to optimize. 
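A small device-code sketch of the ballot pattern this pass folds (assumes HIP-style device code, clang's __builtin_amdgcn_ballot_w64 builtin, and a wave64 target; wave_any is a made-up helper): when cond is uniform across the wave, the ballot result is either 0 or has every active lane set, so the compare is equivalent to cond itself and optimizeUniformIntrinsic() can replace the icmp's uses with the ballot argument.

  // Hypothetical wave-any helper; the pass rewrites (icmp ne %ballot, 0)
  // to the ballot's i1 argument when that argument is proven uniform.
  __device__ bool wave_any(bool cond) {
    unsigned long long ballot = __builtin_amdgcn_ballot_w64(cond);
    return ballot != 0;
  }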
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { bool IsChanged = false; ValueMap<const Value *, bool> Tracker; - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - for (Function &F : M) { - switch (F.getIntrinsicID()) { + for (Instruction &I : make_early_inc_range(instructions(F))) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II) + continue; + + switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: @@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { default: continue; } - - for (User *U : make_early_inc_range(F.users())) { - auto *II = cast<IntrinsicInst>(U); - Function *ParentF = II->getFunction(); - const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); - IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); - } + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; } PreservedAnalyses -AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runUniformIntrinsicCombine(M, AM)) +AMDGPUUniformIntrinsicCombinePass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &UI = AM.getResult<UniformityInfoAnalysis>(F); + if (!runUniformIntrinsicCombine(F, UI)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<UniformityInfoAnalysis>(); return PA; } + +namespace { +class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass { +public: + static char ID; + AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) { + initializeAMDGPUUniformIntrinsicCombineLegacyPass( + *PassRegistry::getPassRegistry()); + } + +private: + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + } +}; +} // namespace + +char AMDGPUUniformIntrinsicCombineLegacy::ID = 0; +char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID = + AMDGPUUniformIntrinsicCombineLegacy::ID; + +bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + const UniformityInfo &UI = + getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + return runUniformIntrinsicCombine(F, UI); +} + +INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) + +FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { + return new AMDGPUUniformIntrinsicCombineLegacy(); +} diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 5f6d742..d950131 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> { } class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> { + let Constraints = 
!if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; } class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX6GFX7", "")> { let AssemblerPredicate = isGFX6GFX7; } class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> { let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> { let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; } class MIMG_Atomic_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX10", 
"")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, class MIMG_Atomic_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, } class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC, - int num_addrs, string renamed, bit enableDisasm = 0> - : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs, + int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0> + : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX12", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, @@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, RegisterOperand data_rc, bit enableDasm = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - FPAtomic = isFP in { + FPAtomic = isFP, IsAtomicNoRet = noRtn in { let VAddrDwords = 1 in { let ssamp = 0 in { if op.HAS_SI then { - def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_VI then { - def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX10M then { - def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + 
def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed>; + def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>; } } let VAddrDwords = 2 in { let ssamp = 0 in { if op.HAS_SI then { - def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>; } if op.HAS_VI then { - def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>; + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>; } if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>; + def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>; } } let VAddrDwords = 3 in { let ssamp = 0 in { if op.HAS_SI then { - def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>; } if op.HAS_VI then { - def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>; + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>; } if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>; + def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>; } } let VAddrDwords = 4 in { let ssamp = 0 in { if op.HAS_SI then { - def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>; } if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>; + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, 
asm, data_rc, 4, enableDasm>; + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, enableDasm>; + def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>; } } } @@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } } -multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, - string renamed = ""> { // 64-bit atomics - let IsAtomicRet = 1 in { +multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { // 64-bit atomics + let IsAtomicRet = !not(noRtn) in { def "" : MIMGBaseOpcode { let Atomic = 1; let AtomicX2 = isCmpSwap; + let NoReturn = noRtn; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { @@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, // Other variants are reconstructed by disassembler using dmask and tfe. if !not(isCmpSwap) then { let VDataDwords = 1 in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>; + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>; } let VDataDwords = 2 in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>; + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>; let VDataDwords = 3 in - defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>; + defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>; if isCmpSwap then { let VDataDwords = 4 in - defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>; + defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>; let VDataDwords = 5 in - defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>; + defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>; } } - } // End IsAtomicRet = 1 + } +} + +multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + string renamed = ""> { + defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>; + defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>; } multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, @@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in { class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { Intrinsic Intr = I; MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); + MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode; AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; @@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); } +class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I> + : ImageDimIntrinsicInfo<I> { + MIMGBaseOpcode AtomicNoRetBaseOpcode = + 
!cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN")); +} + def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", - "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", + let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", + "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", + "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", "BiasTyArg", "GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; + string TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; let PrimaryKey = ["Intr"]; @@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex { let Key = ["BaseOpcode", "Dim"]; } -foreach intr = !listconcat(AMDGPUImageDimIntrinsics, - AMDGPUImageDimAtomicIntrinsics) in { +foreach intr = AMDGPUImageDimIntrinsics in { def : ImageDimIntrinsicInfo<intr>; } +foreach intr = AMDGPUImageDimAtomicIntrinsics in { + def : ImageDimAtomicIntrinsicInfo<intr>; +} + // L to LZ Optimization Mapping def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>; def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index be42291..b34ab2a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9134,16 +9134,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>(); + unsigned IntrOpcode = Intr->BaseOpcode; + // For image atomic: use no-return opcode if result is unused. 
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode && + !Op.getNode()->hasAnyUseOfValue(0)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); SmallVector<EVT, 3> ResultTypes(Op->values()); SmallVector<EVT, 3> OrigResultTypes(Op->values()); + if (BaseOpcode->NoReturn && BaseOpcode->Atomic) + ResultTypes.erase(&ResultTypes[0]); + bool IsD16 = false; bool IsG16 = false; bool IsA16 = false; @@ -9162,8 +9169,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VData = Op.getOperand(2); IsAtomicPacked16Bit = - (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || - Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN); bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { @@ -9173,7 +9182,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (Is64Bit) VData = DAG.getBitcast(MVT::v4i32, VData); - ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + if (!BaseOpcode->NoReturn) + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; NumVDataDwords = Is64Bit ? 4 : 2; } else { @@ -9399,8 +9410,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return Op; @@ -9512,13 +9524,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); } + if (BaseOpcode->NoReturn) { + if (BaseOpcode->Atomic) + return DAG.getMergeValues( + {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL); + + return SDValue(NewNode, 0); + } + if (BaseOpcode->AtomicX2) { SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); } - if (BaseOpcode->NoReturn) - return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, NumVDataDwords, IsAtomicPacked16Bit, DL); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 313ae3d..fdba454 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1298,12 +1298,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - // Use __sincos_stret if available. 
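As a rough illustration of the __sincos_stret plumbing touched here and in the LegalizeDAG change above (a sketch under assumptions, not from the patch; sin_and_cos is a made-up name): compiled with -fno-math-errno so the calls become FSIN/FCOS nodes, a pair like the one below can be merged into a single ISD::FSINCOS node. With this change, ARM's LowerFSINCOS looks the libcall up via RTLIB::getSINCOS_STRET() and bails out when the target (e.g. non-Darwin) does not provide __sincos_stret, instead of relying on hard-coded libcall names.

  #include <cmath>

  // Both sin and cos of the same operand are used, so the legalizer may
  // combine the pair into one sincos/__sincos_stret call where available.
  void sin_and_cos(double x, double &s, double &c) {
    s = std::sin(x);
    c = std::cos(x);
  }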
- if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -9835,13 +9831,18 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { } SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin()); - // For iOS, we want to call an alternative entry point: __sincos_stret, // return values are passed via sret. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + assert(Subtarget->isTargetDarwin()); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -9871,11 +9872,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { Args.emplace_back(Arg, ArgTy); - RTLIB::Libcall LC = - (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - CallingConv::ID CC = getLibcallCallingConv(LC); - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); + StringRef LibcallName = getLibcallImplName(SincosStret); + CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); + SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 44c4830..7ae500a 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1058,6 +1058,16 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Max>, IntrinArgI8<SignedOpKind_Unsigned> ]>, + IntrinSelect<int_dx_wave_reduce_min, + [ + IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>, + IntrinArgI8<SignedOpKind_Signed> + ]>, + IntrinSelect<int_dx_wave_reduce_umin, + [ + IntrinArgIndex<0>, IntrinArgI8<WaveOpKind_Min>, + IntrinArgI8<SignedOpKind_Unsigned> + ]>, ]; let arguments = [OverloadTy, Int8Ty, Int8Ty]; diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index e7e7f2c..ce6e812 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -94,6 +94,8 @@ static bool checkWaveOps(Intrinsic::ID IID) { case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_wave_reduce_max: case Intrinsic::dx_wave_reduce_umax: + case Intrinsic::dx_wave_reduce_min: + case Intrinsic::dx_wave_reduce_umin: return true; } } diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 68fd3e0..60dfd96 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -55,8 +55,10 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_splitdouble: case Intrinsic::dx_wave_readlane: case Intrinsic::dx_wave_reduce_max: + case Intrinsic::dx_wave_reduce_min: case Intrinsic::dx_wave_reduce_sum: case Intrinsic::dx_wave_reduce_umax: + case 
Intrinsic::dx_wave_reduce_umin: case Intrinsic::dx_wave_reduce_usum: case Intrinsic::dx_imad: case Intrinsic::dx_umad: diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 54c8972..0573f64 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1061,8 +1061,11 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl, SDValue W0 = isUndef(PredV) ? DAG.getUNDEF(MVT::i64) : DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV); - Words[IdxW].push_back(HiHalf(W0, DAG)); - Words[IdxW].push_back(LoHalf(W0, DAG)); + if (Bytes < BitBytes) { + Words[IdxW].push_back(HiHalf(W0, DAG)); + Words[IdxW].push_back(LoHalf(W0, DAG)); + } else + Words[IdxW].push_back(W0); while (Bytes < BitBytes) { IdxW ^= 1; @@ -1083,7 +1086,26 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl, Bytes *= 2; } + while (Bytes > BitBytes) { + IdxW ^= 1; + Words[IdxW].clear(); + + if (Bytes <= 4) { + for (const SDValue &W : Words[IdxW ^ 1]) { + SDValue T = contractPredicate(W, dl, DAG); + Words[IdxW].push_back(T); + } + } else { + for (const SDValue &W : Words[IdxW ^ 1]) { + Words[IdxW].push_back(W); + } + } + Bytes /= 2; + } + assert(Bytes == BitBytes); + if (BitBytes == 1 && PredTy == MVT::v2i1) + ByteTy = MVT::getVectorVT(MVT::i16, HwLen); SDValue Vec = ZeroFill ? getZero(dl, ByteTy, DAG) : DAG.getUNDEF(ByteTy); SDValue S4 = DAG.getConstant(HwLen-4, dl, MVT::i32); @@ -3157,6 +3179,9 @@ SDValue HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { auto *MemN = cast<MemSDNode>(Op.getNode()); + if (!MemN->getMemoryVT().isSimple()) + return Op; + MVT MemTy = MemN->getMemoryVT().getSimpleVT(); if (!isHvxPairTy(MemTy)) return Op; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 4029e14..729c077 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -493,7 +493,7 @@ NVPTXTTIImpl::getInstructionCost(const User *U, // predicate ("@"). return !AsmInst.empty() && (AsmInst[0] == '@' || isAlpha(AsmInst[0]) || - AsmInst.find(".pragma") != StringRef::npos); + AsmInst.contains(".pragma")); }); return InstCount * TargetTransformInfo::TCC_Basic; } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 17f04d0..20fc849 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -138,6 +138,11 @@ static cl::opt<unsigned> PPCMinimumJumpTableEntries( "ppc-min-jump-table-entries", cl::init(64), cl::Hidden, cl::desc("Set minimum number of entries to use a jump table on PPC")); +static cl::opt<unsigned> PPCMinimumBitTestCmps( + "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden, + cl::desc("Set minimum of largest number of comparisons to use bit test for " + "switch on PPC.")); + static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth( "ppc-gather-alias-max-depth", cl::init(18), cl::Hidden, cl::desc("max depth when checking alias info in GatherAllAliases()")); @@ -1436,6 +1441,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // Re-evaluate this value on future HWs that can do better with mtctr. setMinimumJumpTableEntries(PPCMinimumJumpTableEntries); + // The default minimum of largest number in a BitTest cluster is 3. 
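+  // Illustrative example (not part of this change): a switch in which cases
+  // 1, 3 and 5 all branch to the same block can be lowered to a single bit
+  // test of the form ((1 << x) & 0b101010) != 0 instead of three separate
+  // compares; the option above controls how many compares a cluster must
+  // replace before the bit-test form is used.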
+ setMinimumBitTestCmps(PPCMinimumBitTestCmps); + setMinFunctionAlignment(Align(4)); setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32); diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index b0bed71c..da3efdc 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -194,6 +194,22 @@ class XX3Form_XTAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = XT{5}; } +class XForm_RBS5<bits<6> opCode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opCode, OOL, IOL, asmstr, itin> { + + bits<5> RB; + bits<5> RS; + + let Pattern = pattern; + + let Inst{6...10} = RS; + let Inst{11...15} = 0; + let Inst{16...20} = RB; + let Inst{21...30} = xo; + let Inst{31} = 0; +} + class XX3Form_XTAB6_S<bits<5> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> : I<59, OOL, IOL, asmstr, NoItinerary> { @@ -317,12 +333,16 @@ let Predicates = [IsISAFuture] in { def TLBIEIO : XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC), "tlbieio $RB, $RS, $RIC", []>; + def MTLPL : XForm_RBS5<31, 275, (outs), (ins gprc:$RB, gprc:$RS), + "mtlpl $RB, $RS", IIC_SprMTSPR, []>; let Interpretation64Bit = 1, isCodeGenOnly = 1 in { def TLBIEP8 : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC, u1imm:$PRS, u1imm:$R), "tlbiep $RB, $RS, $RIC, $PRS, $R", []>; + def MTLPL8 : XForm_RBS5<31, 275, (outs), (ins g8rc:$RB, g8rc:$RS), + "mtlpl $RB, $RS", IIC_SprMTSPR, []>, isPPC64; } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 41a9c92..96e8afc 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -823,6 +823,7 @@ static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) { break; case RISCV::fixup_riscv_rvc_jump: case RISCV::fixup_riscv_rvc_branch: + case RISCV::fixup_riscv_rvc_imm: case RISCV::fixup_riscv_jal: return false; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 6d587e6..5934c91 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -688,6 +688,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, // the `jal` again in the assembler. 
  } else if (MIFrm == RISCVII::InstFormatCI) {
    FixupKind = RISCV::fixup_riscv_rvc_imm;
+    AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcili);
  } else if (MIFrm == RISCVII::InstFormatI) {
    FixupKind = RISCV::fixup_riscv_12_i;
  } else if (MIFrm == RISCVII::InstFormatQC_EB) {
diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
index 98b636e..9bd66a4 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
@@ -373,6 +373,26 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
         .addReg(ScratchReg)
         .addImm(-1);
     break;
+  case AtomicRMWInst::Max:
+    BuildMI(LoopMBB, DL, TII->get(RISCV::MAX), ScratchReg)
+        .addReg(DestReg)
+        .addReg(IncrReg);
+    break;
+  case AtomicRMWInst::Min:
+    BuildMI(LoopMBB, DL, TII->get(RISCV::MIN), ScratchReg)
+        .addReg(DestReg)
+        .addReg(IncrReg);
+    break;
+  case AtomicRMWInst::UMax:
+    BuildMI(LoopMBB, DL, TII->get(RISCV::MAXU), ScratchReg)
+        .addReg(DestReg)
+        .addReg(IncrReg);
+    break;
+  case AtomicRMWInst::UMin:
+    BuildMI(LoopMBB, DL, TII->get(RISCV::MINU), ScratchReg)
+        .addReg(DestReg)
+        .addReg(IncrReg);
+    break;
   }
   BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width, STI)), ScratchReg)
       .addReg(ScratchReg)
@@ -682,6 +702,9 @@ bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width,
     MachineBasicBlock::iterator &NextMBBI) {
+  // Using MIN(U)/MAX(U) is preferable if permitted
+  if (STI->hasPermissiveZalrsc() && STI->hasStdExtZbb() && !IsMasked)
+    return expandAtomicBinOp(MBB, MBBI, BinOp, IsMasked, Width, NextMBBI);
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 2754d78..b4556f6 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1906,6 +1906,25 @@ def FeatureForcedAtomics : SubtargetFeature<
 def HasAtomicLdSt
     : Predicate<"Subtarget->hasStdExtZalrsc() || Subtarget->hasForcedAtomics()">;
 
+// The RISC-V Unprivileged Architecture - ISA Volume 1 (Version: 20250508)
+// [https://docs.riscv.org/reference/isa/_attachments/riscv-unprivileged.pdf]
+// in section 13.3. Eventual Success of Store-Conditional Instructions, defines
+// _constrained_ LR/SC loops:
+// The dynamic code executed between the LR and SC instructions can only
+// contain instructions from the base ''I'' instruction set, excluding loads,
+// stores, backward jumps, taken backward branches, JALR, FENCE, and SYSTEM
+// instructions. Compressed forms of the aforementioned ''I'' instructions in
+// the Zca and Zcb extensions are also permitted.
+// LR/SC loops that do not adhere to the above are _unconstrained_ LR/SC loops,
+// and success is implementation specific. For implementations which know that
+// non-base instructions (such as the ''B'' extension) will not violate any
+// forward progress guarantees, using these instructions to reduce the LR/SC
+// sequence length is desirable.
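+//
+// Illustrative example (not from this patch): with this feature and Zbb, an
+// atomicrmw min can be expanded to a short LR/SC loop of the form
+//   loop: lr.w a2, (a0); min a3, a2, a1; sc.w a4, a3, (a0); bnez a4, loop
+// even though `min` is not a base ''I'' instruction; the register choices
+// here are arbitrary.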
+def FeaturePermissiveZalrsc
+    : SubtargetFeature<
+          "permissive-zalrsc", "HasPermissiveZalrsc", "true",
+          "Implementation permits non-base instructions between LR/SC pairs">;
+
 def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", "true",
                    "Use an instruction sequence for taking the address of a global "
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 6181abb..47022b3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -745,7 +745,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
           .addDef(ResVReg)
           .addUse(getSPIRVTypeID(BaseType))
           .addImm(static_cast<uint32_t>(Storage));
-  if (Init != 0)
+  if (Init)
     MIB.addUse(Init->getOperand(0).getReg());
   // ISel may introduce a new register on this step, so we need to add it to
   // DT and correct its type avoiding fails on the next stage.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 021353a..3fea21e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -222,6 +222,9 @@ private:
   bool selectWaveReduceMax(Register ResVReg, const SPIRVType *ResType,
                            MachineInstr &I, bool IsUnsigned) const;
 
+  bool selectWaveReduceMin(Register ResVReg, const SPIRVType *ResType,
+                           MachineInstr &I, bool IsUnsigned) const;
+
   bool selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType,
                            MachineInstr &I) const;
 
@@ -2456,6 +2459,35 @@ bool SPIRVInstructionSelector::selectWaveReduceMax(Register ResVReg,
       .constrainAllUses(TII, TRI, RBI);
 }
 
+bool SPIRVInstructionSelector::selectWaveReduceMin(Register ResVReg,
+                                                   const SPIRVType *ResType,
+                                                   MachineInstr &I,
+                                                   bool IsUnsigned) const {
+  assert(I.getNumOperands() == 3);
+  assert(I.getOperand(2).isReg());
+  MachineBasicBlock &BB = *I.getParent();
+  Register InputRegister = I.getOperand(2).getReg();
+  SPIRVType *InputType = GR.getSPIRVTypeForVReg(InputRegister);
+
+  if (!InputType)
+    report_fatal_error("Input Type could not be determined.");
+
+  SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII);
+  // Retrieve the operation to use based on input type
+  bool IsFloatTy = GR.isScalarOrVectorOfType(InputRegister, SPIRV::OpTypeFloat);
+  auto IntegerOpcodeType =
+      IsUnsigned ? SPIRV::OpGroupNonUniformUMin : SPIRV::OpGroupNonUniformSMin;
+  auto Opcode = IsFloatTy ?
SPIRV::OpGroupNonUniformFMin : IntegerOpcodeType; + return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(GR.getOrCreateConstInt(SPIRV::Scope::Subgroup, I, IntTy, TII, + !STI.isShader())) + .addImm(SPIRV::GroupOperation::Reduce) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectWaveReduceSum(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -3431,6 +3463,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ true); case Intrinsic::spv_wave_reduce_max: return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ false); + case Intrinsic::spv_wave_reduce_umin: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ true); + case Intrinsic::spv_wave_reduce_min: + return selectWaveReduceMin(ResVReg, ResType, I, /*IsUnsigned*/ false); case Intrinsic::spv_wave_reduce_sum: return selectWaveReduceSum(ResVReg, ResType, I); case Intrinsic::spv_wave_readlane: diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 3da720f..58109ac 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -8973,8 +8973,7 @@ SystemZTargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc, if (const auto *CB = dyn_cast<CallBase>(RHSVal)) { if (CB->isInlineAsm()) { const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand()); - return IA && - IA->getConstraintString().find("{@cc}") != std::string::npos; + return IA && IA->getConstraintString().contains("{@cc}"); } } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 410f20e..5785440 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,11 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -33067,26 +33064,30 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + bool isF64 = ArgVT == MVT::f64; + + RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; + const char *LibcallName = TLI.getLibcallName(LC); + if (!LibcallName) + return SDValue(); + assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). 
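+  // Conceptually (illustrative only), the f32 variant behaves like a C
+  // function `struct { float sin, cos; } __sincos_stret(float);` whose
+  // result is returned in XMM0 on x86-64.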
SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; Args.emplace_back(Arg, ArgTy); - bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); @@ -54634,6 +54635,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); SDLoc DL(N); // Attempt to pre-truncate inputs to arithmetic ops instead. @@ -54652,6 +54654,40 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget)) return V; + // Fold trunc(srl(load(p),amt)) -> load(p+amt/8) + // If we're shifting down byte aligned bit chunks from a larger load for + // truncation, see if we can convert the shift into a pointer offset instead. + // Limit this to normal (non-ext) scalar integer loads. + if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && + Src.hasOneUse() && Src.getOperand(0).hasOneUse() && + ISD::isNormalLoad(Src.getOperand(0).getNode())) { + auto *Ld = cast<LoadSDNode>(Src.getOperand(0)); + if (Ld->isSimple() && VT.isByteSized() && + isPowerOf2_64(VT.getSizeInBits())) { + SDValue ShAmt = Src.getOperand(1); + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + // Check the shift amount is byte aligned. + // Check the truncation doesn't use any shifted in (zero) top bits. + if (KnownAmt.countMinTrailingZeros() >= 3 && + KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - + VT.getSizeInBits())) { + EVT PtrVT = Ld->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); + SDValue PtrByteOfs = + DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset( + Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); + SDValue NewLoad = + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + Align(), Ld->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), + NewLoad.getValue(1)); + return NewLoad; + } + } + } + // The bitcast source is a direct mmx result. 
// Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index c8d1938..0849fc7 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1179,7 +1179,7 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, const unsigned *Features, unsigned *Type, unsigned *Subtype) { - const char *CPU = 0; + const char *CPU = nullptr; switch (Family) { case 4: diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index 042578d..6a11aec 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -380,7 +380,7 @@ bool ExpandVariadics::runOnModule(Module &M) { if (CB->isIndirectCall()) { FunctionType *FTy = CB->getFunctionType(); if (FTy->isVarArg()) - Changed |= expandCall(M, Builder, CB, FTy, 0); + Changed |= expandCall(M, Builder, CB, FTy, /*NF=*/nullptr); } } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 8d9933b..92fca90 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3496,7 +3496,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (isPowerOf2_64(AlignMask + 1)) { uint64_t Offset = 0; match(A, m_Add(m_Value(A), m_ConstantInt(Offset))); - if (match(A, m_PtrToInt(m_Value(A)))) { + if (match(A, m_PtrToIntOrAddr(m_Value(A)))) { /// Note: this doesn't preserve the offset information but merges /// offset and alignment. /// TODO: we can generate a GEP instead of merging the alignment with diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index f939e7a..614c6eb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2148,7 +2148,7 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) { return nullptr; } -Value *InstCombinerImpl::foldPtrToIntOfGEP(Type *IntTy, Value *Ptr) { +Value *InstCombinerImpl::foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr) { // Look through chain of one-use GEPs. Type *PtrTy = Ptr->getType(); SmallVector<GEPOperator *> GEPs; @@ -2210,7 +2210,7 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) { Mask->getType() == Ty) return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask); - if (Value *V = foldPtrToIntOfGEP(Ty, SrcOp)) + if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp)) return replaceInstUsesWith(CI, V); Value *Vec, *Scalar, *Index; @@ -2228,6 +2228,21 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) { } Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) { + Value *SrcOp = CI.getPointerOperand(); + Type *Ty = CI.getType(); + + // (ptrtoaddr (ptrmask P, M)) + // -> (and (ptrtoaddr P), M) + // This is generally beneficial as `and` is better supported than `ptrmask`. + Value *Ptr, *Mask; + if (match(SrcOp, m_OneUse(m_Intrinsic<Intrinsic::ptrmask>(m_Value(Ptr), + m_Value(Mask)))) && + Mask->getType() == Ty) + return BinaryOperator::CreateAnd(Builder.CreatePtrToAddr(Ptr), Mask); + + if (Value *V = foldPtrToIntOrAddrOfGEP(Ty, SrcOp)) + return replaceInstUsesWith(CI, V); + // FIXME: Implement variants of ptrtoint folds. 
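+  // Example of the ptrmask fold above (illustrative only, not from this
+  // patch):
+  //   %m = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 -16)
+  //   %a = ptrtoaddr ptr %m to i64
+  // is simplified to:
+  //   %pa = ptrtoaddr ptr %p to i64
+  //   %a = and i64 %pa, -16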
return commonCastTransforms(CI); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 9c75d9a..d85e4f7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -700,7 +700,7 @@ public: /// folded operation. void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN); - Value *foldPtrToIntOfGEP(Type *IntTy, Value *Ptr); + Value *foldPtrToIntOrAddrOfGEP(Type *IntTy, Value *Ptr); Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond, Instruction &I); Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS, diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index c537be5c..4fac5d3 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5228,32 +5228,52 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr"); } - // Create the new switch instruction now. - SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); - if (HasProfile) { - // We know the weight of the default case. We don't know the weight of the - // other cases, but rather than completely lose profiling info, we split - // the remaining probability equally over them. - SmallVector<uint32_t> NewWeights(Values.size() + 1); - NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if - // TrueWhenEqual. - for (auto &V : drop_begin(NewWeights)) - V = BranchWeights[0] / Values.size(); - setBranchWeights(*New, NewWeights, /*IsExpected=*/false); - } - - // Add all of the 'cases' to the switch instruction. - for (ConstantInt *Val : Values) - New->addCase(Val, EdgeBB); - - // We added edges from PI to the EdgeBB. As such, if there were any - // PHI nodes in EdgeBB, they need entries to be added corresponding to - // the number of edges added. - for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) { - PHINode *PN = cast<PHINode>(BBI); - Value *InVal = PN->getIncomingValueForBlock(BB); - for (unsigned i = 0, e = Values.size() - 1; i != e; ++i) - PN->addIncoming(InVal, BB); + // Check if we can represent the values as a contiguous range. If so, we use a + // range check + conditional branch instead of a switch. + if (Values.front()->getValue() - Values.back()->getValue() == + Values.size() - 1) { + ConstantRange RangeToCheck = ConstantRange::getNonEmpty( + Values.back()->getValue(), Values.front()->getValue() + 1); + APInt Offset, RHS; + ICmpInst::Predicate Pred; + RangeToCheck.getEquivalentICmp(Pred, RHS, Offset); + Value *X = CompVal; + if (!Offset.isZero()) + X = Builder.CreateAdd(X, ConstantInt::get(CompVal->getType(), Offset)); + Value *Cond = + Builder.CreateICmp(Pred, X, ConstantInt::get(CompVal->getType(), RHS)); + BranchInst *NewBI = Builder.CreateCondBr(Cond, EdgeBB, DefaultBB); + if (HasProfile) + setBranchWeights(*NewBI, BranchWeights, /*IsExpected=*/false); + // We don't need to update PHI nodes since we don't add any new edges. + } else { + // Create the new switch instruction now. + SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); + if (HasProfile) { + // We know the weight of the default case. We don't know the weight of the + // other cases, but rather than completely lose profiling info, we split + // the remaining probability equally over them. 
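+    // For example (illustrative): with BranchWeights = {80, 20} and four
+    // case values, the default keeps weight 20 and each case receives
+    // 80 / 4 = 20.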
+ SmallVector<uint32_t> NewWeights(Values.size() + 1); + NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped + // if TrueWhenEqual. + for (auto &V : drop_begin(NewWeights)) + V = BranchWeights[0] / Values.size(); + setBranchWeights(*New, NewWeights, /*IsExpected=*/false); + } + + // Add all of the 'cases' to the switch instruction. + for (ConstantInt *Val : Values) + New->addCase(Val, EdgeBB); + + // We added edges from PI to the EdgeBB. As such, if there were any + // PHI nodes in EdgeBB, they need entries to be added corresponding to + // the number of edges added. + for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) { + PHINode *PN = cast<PHINode>(BBI); + Value *InVal = PN->getIncomingValueForBlock(BB); + for (unsigned i = 0, e = Values.size() - 1; i != e; ++i) + PN->addIncoming(InVal, BB); + } } // Erase the old branch instruction. diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4fcaf6d..43166c0 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5608,6 +5608,7 @@ private: for (ScheduleBundle *Bundle : Bundles) { if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) break; + SmallPtrSet<Value *, 4> ParentsUniqueUsers; // Need to search for the lane since the tree entry can be // reordered. auto *It = find(Bundle->getTreeEntry()->Scalars, In); @@ -5636,6 +5637,22 @@ private: Bundle->getTreeEntry()->isCopyableElement(In)) && "Missed TreeEntry operands?"); + bool IsNonSchedulableWithParentPhiNode = + Bundle->getTreeEntry()->doesNotNeedToSchedule() && + Bundle->getTreeEntry()->UserTreeIndex && + Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() && + Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() == + Instruction::PHI; + // Count the number of unique phi nodes, which are the parent for + // parent entry, and exit, if all the unique phis are processed. + if (IsNonSchedulableWithParentPhiNode) { + const TreeEntry *ParentTE = + Bundle->getTreeEntry()->UserTreeIndex.UserTE; + Value *User = ParentTE->Scalars[Lane]; + if (!ParentsUniqueUsers.insert(User).second) + break; + } + for (unsigned OpIdx : seq<unsigned>(Bundle->getTreeEntry()->getNumOperands())) if (auto *I = dyn_cast<Instruction>( @@ -5644,8 +5661,8 @@ private: << *I << "\n"); DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked); } - // If parent node is schedulable, it will be handle correctly. - if (!Bundle->getTreeEntry()->doesNotNeedToSchedule()) + // If parent node is schedulable, it will be handled correctly. + if (!IsNonSchedulableWithParentPhiNode) break; It = std::find(std::next(It), Bundle->getTreeEntry()->Scalars.end(), In); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index acad795..d9ac26bb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3648,6 +3648,37 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Sub = VecOp->getDefiningRecipe(); VecOp = Tmp; } + + // If ValB is a constant and can be safely extended, truncate it to the same + // type as ExtA's operand, then extend it to the same type as ExtA. This + // creates two uniform extends that can more easily be matched by the rest of + // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all + // replaced with the new extend of the constant. 
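+  // For example (illustrative): reduce.add(mul(zext(A), 42)) becomes
+  // reduce.add(mul(zext(A), zext(trunc(42)))), provided the constant is
+  // provably representable in the narrower type.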
+ auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA, + VPWidenCastRecipe *&ExtB, + VPValue *&ValB, VPWidenRecipe *Mul) { + if (!ExtA || ExtB || !ValB->isLiveIn()) + return; + Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0)); + Instruction::CastOps ExtOpc = ExtA->getOpcode(); + const APInt *Const; + if (!match(ValB, m_APInt(Const)) || + !llvm::canConstantBeExtended( + Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc))) + return; + // The truncate ensures that the type of each extended operand is the + // same, and it's been proven that the constant can be extended from + // NarrowTy safely. Necessary since ExtA's extended operand would be + // e.g. an i8, while the const will likely be an i32. This will be + // elided by later optimisations. + VPBuilder Builder(Mul); + auto *Trunc = + Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy); + Type *WideTy = Ctx.Types.inferScalarType(ExtA); + ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy); + Mul->setOperand(1, ExtB); + }; + // Try to match reduce.add(mul(...)). if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { auto *RecipeA = @@ -3656,6 +3687,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe()); + // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const))) + ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul); + // Match reduce.add/sub(mul(ext, ext)). if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) && match(RecipeB, m_ZExtOrSExt(m_VPValue())) && @@ -3665,7 +3699,6 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, cast<VPWidenRecipe>(Sub), Red); return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red); } - // Match reduce.add(mul). // TODO: Add an expression type for this variant with a negated mul if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr)) return new VPExpressionRecipe(Mul, Red); @@ -3674,18 +3707,26 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // variants. if (Sub) return nullptr; - // Match reduce.add(ext(mul(ext(A), ext(B)))). - // All extend recipes must have same opcode or A == B - // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))). - if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()), - m_ZExtOrSExt(m_VPValue()))))) { + + // Match reduce.add(ext(mul(A, B))). 
+ if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) { auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe()); auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe()); - auto *Ext0 = - cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe()); - auto *Ext1 = - cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe()); - if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && + auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe()); + auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); + + // reduce.add(ext(mul(ext, const))) + // -> reduce.add(ext(mul(ext, ext(const)))) + ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul); + + // reduce.add(ext(mul(ext(A), ext(B)))) + // -> reduce.add(mul(wider_ext(A), wider_ext(B))) + // The inner extends must either have the same opcode as the outer extend or + // be the same, in which case the multiply can never result in a negative + // value and the outer extend can be folded away by doing wider + // extends for the operands of the mul. + if (Ext0 && Ext1 && + (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && Ext0->getOpcode() == Ext1->getOpcode() && IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) { auto *NewExt0 = new VPWidenCastRecipe( diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll index 41f7ab8..480fcbd 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll @@ -4992,28 +4992,21 @@ define void @test_shl_i512_const_32(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_32: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #32 -; GISEL-NEXT: lsr x13, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; GISEL-NEXT: orr x9, x10, x9, lsl #32 -; GISEL-NEXT: lsr x10, x11, #32 -; GISEL-NEXT: orr x11, x13, x11, lsl #32 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #32 -; GISEL-NEXT: orr x10, x10, x12, lsl #32 -; GISEL-NEXT: lsr x12, x14, #32 -; GISEL-NEXT: lsr x9, x15, #32 -; GISEL-NEXT: orr x8, x8, x14, lsl #32 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #32 -; GISEL-NEXT: lsr x12, x13, #32 -; GISEL-NEXT: orr x9, x9, x13, lsl #32 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #32 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #32 +; GISEL-NEXT: extr x10, x15, x14, #32 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #32 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5044,30 +5037,22 @@ define void @test_lshr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: 
ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 -; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #32 -; GISEL-NEXT: lsl x8, x16, #32 -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: lsl x13, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: orr x11, x11, x14, lsr #32 -; GISEL-NEXT: orr x9, x13, x12, lsr #32 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: extr x8, x15, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #32 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5098,32 +5083,24 @@ define void @test_ashr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x15, x9, #32 -; GISEL-NEXT: lsl x16, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #32 -; GISEL-NEXT: lsl x15, x13, #32 -; GISEL-NEXT: orr x9, x16, x9, lsr #32 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #32 -; GISEL-NEXT: orr x10, x15, x10, lsr #32 -; GISEL-NEXT: lsl x15, x12, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 -; GISEL-NEXT: lsl x11, x17, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #32 -; GISEL-NEXT: lsl x13, x16, #32 -; GISEL-NEXT: orr x10, x11, x12, lsr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #32 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #32 +; GISEL-NEXT: extr x9, x15, x14, #32 +; GISEL-NEXT: lsl x8, x8, #32 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #32 +; GISEL-NEXT: extr x11, x13, x12, #32 +; GISEL-NEXT: orr x8, x8, x13, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5252,23 +5229,17 @@ define void @test_shl_i512_const_96(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #32 -; GISEL-NEXT: lsr x16, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; 
GISEL-NEXT: orr x9, x14, x9, lsl #32 -; GISEL-NEXT: lsr x14, x10, #32 -; GISEL-NEXT: orr x10, x16, x10, lsl #32 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #32 -; GISEL-NEXT: orr x11, x14, x11, lsl #32 -; GISEL-NEXT: lsr x14, x12, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #32 -; GISEL-NEXT: orr x8, x8, x12, lsl #32 -; GISEL-NEXT: orr x10, x14, x13, lsl #32 -; GISEL-NEXT: orr x9, x9, x15, lsl #32 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #32 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5297,27 +5268,21 @@ define void @test_lshr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: orr x10, x12, x10, lsr #32 -; GISEL-NEXT: lsl x12, x11, #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 -; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x12, x9, lsr #32 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #32 -; GISEL-NEXT: orr x11, x13, x11, lsr #32 -; GISEL-NEXT: lsl x12, x16, #32 -; GISEL-NEXT: orr x8, x10, x14, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #32 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: lsr x8, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5347,29 +5312,23 @@ define void @test_ashr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x14, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #32 -; GISEL-NEXT: lsl x14, x13, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: orr x10, x14, x10, lsr #32 -; GISEL-NEXT: lsl x14, x16, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: 
extr x10, x11, x10, #32 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: extr x9, x14, x13, #32 ; GISEL-NEXT: lsl x11, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #32 -; GISEL-NEXT: orr x10, x11, x16, asr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #32 +; GISEL-NEXT: orr x8, x11, x12, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5404,28 +5363,21 @@ define void @test_shl_i512_const_1(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_1: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #63 -; GISEL-NEXT: lsr x13, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x10, x9, lsl #1 -; GISEL-NEXT: lsr x10, x11, #63 -; GISEL-NEXT: orr x11, x13, x11, lsl #1 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #63 -; GISEL-NEXT: orr x10, x10, x12, lsl #1 -; GISEL-NEXT: lsr x12, x14, #63 -; GISEL-NEXT: lsr x9, x15, #63 -; GISEL-NEXT: orr x8, x8, x14, lsl #1 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #1 -; GISEL-NEXT: lsr x12, x13, #63 -; GISEL-NEXT: orr x9, x9, x13, lsl #1 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #1 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #63 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #63 +; GISEL-NEXT: extr x10, x15, x14, #63 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #63 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5457,30 +5409,22 @@ define void @test_lshr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #1 -; GISEL-NEXT: lsl x8, x16, #63 -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: lsl x13, x15, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: orr x11, x11, x14, lsr #1 -; GISEL-NEXT: orr x9, x13, x12, lsr #1 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; 
GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: extr x8, x15, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #1 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5512,32 +5456,24 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x15, x9, #63 -; GISEL-NEXT: lsl x16, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #1 -; GISEL-NEXT: lsl x15, x13, #63 -; GISEL-NEXT: orr x9, x16, x9, lsr #1 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #63 -; GISEL-NEXT: orr x10, x15, x10, lsr #1 -; GISEL-NEXT: lsl x15, x12, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 -; GISEL-NEXT: lsl x11, x17, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #1 -; GISEL-NEXT: lsl x13, x16, #63 -; GISEL-NEXT: orr x10, x11, x12, lsr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #1 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #1 +; GISEL-NEXT: extr x9, x15, x14, #1 +; GISEL-NEXT: lsl x8, x8, #63 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #1 +; GISEL-NEXT: extr x11, x13, x12, #1 +; GISEL-NEXT: orr x8, x8, x13, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5571,28 +5507,21 @@ define void @test_shl_i512_const_15(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_15: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #49 -; GISEL-NEXT: lsr x13, x9, #49 -; GISEL-NEXT: lsl x8, x8, #15 -; GISEL-NEXT: orr x9, x10, x9, lsl #15 -; GISEL-NEXT: lsr x10, x11, #49 -; GISEL-NEXT: orr x11, x13, x11, lsl #15 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #49 -; GISEL-NEXT: orr x10, x10, x12, lsl #15 -; GISEL-NEXT: lsr x12, x14, #49 -; GISEL-NEXT: lsr x9, x15, #49 -; GISEL-NEXT: orr x8, x8, x14, lsl #15 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #15 -; GISEL-NEXT: lsr x12, x13, #49 -; GISEL-NEXT: orr x9, x9, x13, lsl #15 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #15 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #15 +; GISEL-NEXT: extr x8, x9, x8, #49 +; GISEL-NEXT: extr x9, x10, x9, #49 +; GISEL-NEXT: extr x10, x11, x10, #49 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #49 +; GISEL-NEXT: 
stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #49 +; GISEL-NEXT: extr x10, x15, x14, #49 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #49 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5624,30 +5553,22 @@ define void @test_lshr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x13, x9, #49 -; GISEL-NEXT: lsl x15, x10, #49 -; GISEL-NEXT: orr x11, x12, x11, lsr #15 -; GISEL-NEXT: orr x8, x13, x8, lsr #15 -; GISEL-NEXT: lsl x13, x14, #49 -; GISEL-NEXT: orr x9, x15, x9, lsr #15 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #15 -; GISEL-NEXT: lsl x8, x16, #49 -; GISEL-NEXT: lsl x11, x12, #49 -; GISEL-NEXT: lsl x13, x15, #49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #15 -; GISEL-NEXT: lsr x10, x16, #15 -; GISEL-NEXT: orr x11, x11, x14, lsr #15 -; GISEL-NEXT: orr x9, x13, x12, lsr #15 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #15 +; GISEL-NEXT: extr x9, x13, x12, #15 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #15 +; GISEL-NEXT: extr x8, x15, x14, #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #15 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5679,32 +5600,24 @@ define void @test_ashr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x15, x9, #49 -; GISEL-NEXT: lsl x16, x10, #49 -; GISEL-NEXT: orr x11, x12, x11, lsr #15 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #15 -; GISEL-NEXT: lsl x15, x13, #49 -; GISEL-NEXT: orr x9, x16, x9, lsr #15 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #49 -; GISEL-NEXT: orr x10, x15, x10, lsr #15 -; GISEL-NEXT: lsl x15, x12, #49 -; GISEL-NEXT: orr x8, x11, x13, lsr #15 -; GISEL-NEXT: lsl x11, x17, #49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #15 -; GISEL-NEXT: lsl x13, x16, #49 -; GISEL-NEXT: orr x10, x11, x12, lsr #15 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #15 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #15 +; GISEL-NEXT: extr x9, x15, x14, #15 +; GISEL-NEXT: lsl x8, x8, #49 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; 
GISEL-NEXT: extr x10, x12, x15, #15 +; GISEL-NEXT: extr x11, x13, x12, #15 +; GISEL-NEXT: orr x8, x8, x13, asr #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5738,28 +5651,21 @@ define void @test_shl_i512_const_63(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_63: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #1 -; GISEL-NEXT: lsr x13, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x10, x9, lsl #63 -; GISEL-NEXT: lsr x10, x11, #1 -; GISEL-NEXT: orr x11, x13, x11, lsl #63 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #1 -; GISEL-NEXT: orr x10, x10, x12, lsl #63 -; GISEL-NEXT: lsr x12, x14, #1 -; GISEL-NEXT: lsr x9, x15, #1 -; GISEL-NEXT: orr x8, x8, x14, lsl #63 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #63 -; GISEL-NEXT: lsr x12, x13, #1 -; GISEL-NEXT: orr x9, x9, x13, lsl #63 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #63 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #1 +; GISEL-NEXT: extr x10, x15, x14, #1 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #1 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5791,30 +5697,22 @@ define void @test_lshr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: lsl x15, x10, #1 -; GISEL-NEXT: orr x11, x12, x11, lsr #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x15, x9, lsr #63 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #63 -; GISEL-NEXT: lsl x8, x16, #1 -; GISEL-NEXT: lsl x11, x12, #1 -; GISEL-NEXT: lsl x13, x15, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: orr x11, x11, x14, lsr #63 -; GISEL-NEXT: orr x9, x13, x12, lsr #63 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #63 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ 
-5846,30 +5744,22 @@ define void @test_ashr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x10, [x1] -; GISEL-NEXT: ldp x11, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x15, x9, #1 -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x16, x11, #1 -; GISEL-NEXT: orr x8, x15, x8, lsr #63 -; GISEL-NEXT: lsl x15, x13, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x9, x16, x9, lsr #63 -; GISEL-NEXT: orr x11, x15, x11, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x8, x17, #1 -; GISEL-NEXT: lsl x16, x14, #1 -; GISEL-NEXT: lsl x10, x12, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: asr x9, x17, #63 -; GISEL-NEXT: orr x8, x8, x12, lsr #63 -; GISEL-NEXT: orr x13, x16, x13, lsr #63 -; GISEL-NEXT: orr x10, x10, x14, lsr #63 -; GISEL-NEXT: orr x9, x9, x9, lsl #1 -; GISEL-NEXT: stp x13, x10, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: extr x11, x14, x13, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: asr x10, x15, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x11, [x0, #32] +; GISEL-NEXT: orr x9, x10, x10, lsl #1 ; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: @@ -5906,23 +5796,17 @@ define void @test_shl_i512_const_65(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #63 -; GISEL-NEXT: lsr x16, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x14, x9, lsl #1 -; GISEL-NEXT: lsr x14, x10, #63 -; GISEL-NEXT: orr x10, x16, x10, lsl #1 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #63 -; GISEL-NEXT: orr x11, x14, x11, lsl #1 -; GISEL-NEXT: lsr x14, x12, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #63 -; GISEL-NEXT: orr x8, x8, x12, lsl #1 -; GISEL-NEXT: orr x10, x14, x13, lsl #1 -; GISEL-NEXT: orr x9, x9, x15, lsl #1 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #63 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5953,27 +5837,21 @@ define void @test_lshr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: orr x10, x12, x10, lsr #1 -; GISEL-NEXT: lsl x12, x11, #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x12, x9, lsr #1 -; GISEL-NEXT: stp x10, x8, [x0] -; 
GISEL-NEXT: lsl x10, x15, #63 -; GISEL-NEXT: orr x11, x13, x11, lsr #1 -; GISEL-NEXT: lsl x12, x16, #63 -; GISEL-NEXT: orr x8, x10, x14, lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #1 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: lsr x8, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6005,29 +5883,23 @@ define void @test_ashr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x14, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #1 -; GISEL-NEXT: lsl x14, x13, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: orr x10, x14, x10, lsr #1 -; GISEL-NEXT: lsl x14, x16, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: extr x9, x14, x13, #1 ; GISEL-NEXT: lsl x11, x15, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #1 -; GISEL-NEXT: orr x10, x11, x16, asr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #1 +; GISEL-NEXT: orr x8, x11, x12, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6062,23 +5934,17 @@ define void @test_shl_i512_const_100(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #28 -; GISEL-NEXT: lsr x16, x9, #28 -; GISEL-NEXT: lsl x8, x8, #36 -; GISEL-NEXT: orr x9, x14, x9, lsl #36 -; GISEL-NEXT: lsr x14, x10, #28 -; GISEL-NEXT: orr x10, x16, x10, lsl #36 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #28 -; GISEL-NEXT: orr x11, x14, x11, lsl #36 -; GISEL-NEXT: lsr x14, x12, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #28 -; GISEL-NEXT: orr x8, x8, x12, lsl #36 -; GISEL-NEXT: orr x10, x14, x13, lsl #36 -; GISEL-NEXT: orr x9, x9, x15, lsl #36 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #36 +; GISEL-NEXT: extr x8, x9, x8, #28 +; GISEL-NEXT: extr x9, x10, x9, #28 +; GISEL-NEXT: extr x10, x11, x10, 
#28 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #28 +; GISEL-NEXT: extr x9, x13, x12, #28 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #28 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6109,27 +5975,21 @@ define void @test_lshr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_100: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x13, x9, #28 -; GISEL-NEXT: orr x10, x12, x10, lsr #36 -; GISEL-NEXT: lsl x12, x11, #28 -; GISEL-NEXT: orr x8, x13, x8, lsr #36 -; GISEL-NEXT: lsl x13, x14, #28 -; GISEL-NEXT: orr x9, x12, x9, lsr #36 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #28 -; GISEL-NEXT: orr x11, x13, x11, lsr #36 -; GISEL-NEXT: lsl x12, x16, #28 -; GISEL-NEXT: orr x8, x10, x14, lsr #36 -; GISEL-NEXT: lsr x10, x16, #36 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #36 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #36 +; GISEL-NEXT: extr x9, x13, x12, #36 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #36 +; GISEL-NEXT: lsr x8, x14, #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6161,29 +6021,23 @@ define void @test_ashr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_100: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x14, x9, #28 -; GISEL-NEXT: lsl x15, x10, #28 -; GISEL-NEXT: orr x11, x12, x11, lsr #36 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #36 -; GISEL-NEXT: lsl x14, x13, #28 -; GISEL-NEXT: orr x9, x15, x9, lsr #36 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #28 -; GISEL-NEXT: orr x10, x14, x10, lsr #36 -; GISEL-NEXT: lsl x14, x16, #28 -; GISEL-NEXT: orr x8, x11, x13, lsr #36 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #36 +; GISEL-NEXT: extr x9, x14, x13, #36 ; GISEL-NEXT: lsl x11, x15, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #36 -; GISEL-NEXT: orr x10, x11, x16, asr #36 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #36 +; GISEL-NEXT: orr x8, x11, x12, asr #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 
64 @@ -6219,23 +6073,17 @@ define void @test_shl_i512_const_127(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #1 -; GISEL-NEXT: lsr x16, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x14, x9, lsl #63 -; GISEL-NEXT: lsr x14, x10, #1 -; GISEL-NEXT: orr x10, x16, x10, lsl #63 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #1 -; GISEL-NEXT: orr x11, x14, x11, lsl #63 -; GISEL-NEXT: lsr x14, x12, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #1 -; GISEL-NEXT: orr x8, x8, x12, lsl #63 -; GISEL-NEXT: orr x10, x14, x13, lsl #63 -; GISEL-NEXT: orr x9, x9, x15, lsl #63 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #1 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6266,27 +6114,21 @@ define void @test_lshr_i512_const_127(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: lsl x12, x16, #1 -; GISEL-NEXT: orr x8, x10, x14, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #63 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: lsr x8, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6317,28 +6159,22 @@ define void @test_ashr_i512_const_127(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: lsl x12, x15, #1 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x16, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: asr x8, x16, #63 
-; GISEL-NEXT: orr x12, x12, x14, lsr #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x10, x15, lsr #63 -; GISEL-NEXT: orr x10, x8, x8, lsl #1 -; GISEL-NEXT: stp x12, x9, [x0, #32] -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: asr x9, x14, #63 +; GISEL-NEXT: extr x11, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: orr x8, x9, x9, lsl #1 +; GISEL-NEXT: stp x11, x10, [x0, #32] +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 diff --git a/llvm/test/CodeGen/AArch64/adc.ll b/llvm/test/CodeGen/AArch64/adc.ll index 12e8bf2..03f3cf1 100644 --- a/llvm/test/CodeGen/AArch64/adc.ll +++ b/llvm/test/CodeGen/AArch64/adc.ll @@ -71,9 +71,8 @@ define i128 @test_shifted(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: test_shifted: ; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: lsr x8, x2, #19 +; CHECK-GI-NEXT: extr x8, x3, x2, #19 ; CHECK-GI-NEXT: adds x0, x0, x2, lsl #45 -; CHECK-GI-NEXT: orr x8, x8, x3, lsl #45 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %rhs = shl i128 %b, 45 @@ -108,8 +107,7 @@ define i128 @test_extended(i128 %a, i16 %b) { ; CHECK-GI-NEXT: sxth x8, w2 ; CHECK-GI-NEXT: adds x0, x0, w2, sxth #3 ; CHECK-GI-NEXT: asr x9, x8, #63 -; CHECK-GI-NEXT: lsr x8, x8, #61 -; CHECK-GI-NEXT: orr x8, x8, x9, lsl #3 +; CHECK-GI-NEXT: extr x8, x9, x8, #61 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %ext = sext i16 %b to i128 diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 765f6b7..7f07ef4 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -510,41 +510,40 @@ define i128 @fshl_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, #1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub x15, x10, #64 +; CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, 
x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c) @@ -571,41 +570,40 @@ define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshr_i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #63 -; CHECK-GI-NEXT: mov w9, #127 // =0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: bic x9, x9, x4 -; CHECK-GI-NEXT: lsl x11, x0, #1 -; CHECK-GI-NEXT: and x12, x4, #0x7f -; CHECK-GI-NEXT: orr x8, x8, x1, lsl #1 -; CHECK-GI-NEXT: sub x14, x10, x9 -; CHECK-GI-NEXT: sub x17, x9, #64 -; CHECK-GI-NEXT: lsl x15, x11, x9 -; CHECK-GI-NEXT: lsr x14, x11, x14 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x16, x8, x9 -; CHECK-GI-NEXT: sub x9, x10, x12 -; CHECK-GI-NEXT: lsl x10, x11, x17 -; CHECK-GI-NEXT: mvn x13, x4 -; CHECK-GI-NEXT: csel x11, x15, xzr, lo -; CHECK-GI-NEXT: sub x15, x12, #64 -; CHECK-GI-NEXT: orr x14, x14, x16 -; CHECK-GI-NEXT: lsr x16, x2, x12 -; CHECK-GI-NEXT: lsl x9, x3, x9 -; CHECK-GI-NEXT: csel x10, x14, x10, lo -; CHECK-GI-NEXT: tst x13, #0x7f -; CHECK-GI-NEXT: lsr x13, x3, x15 -; CHECK-GI-NEXT: csel x8, x8, x10, eq -; CHECK-GI-NEXT: orr x9, x16, x9 -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: lsr x10, x3, x12 -; CHECK-GI-NEXT: csel x9, x9, x13, lo +; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: lsl x9, x0, #1 +; CHECK-GI-NEXT: extr x10, x1, x0, #63 +; CHECK-GI-NEXT: bic x8, x8, x4 +; CHECK-GI-NEXT: mov w11, #64 // =0x40 +; CHECK-GI-NEXT: and x14, x4, #0x7f +; CHECK-GI-NEXT: sub x12, x11, x8 +; CHECK-GI-NEXT: lsl x13, x10, x8 +; CHECK-GI-NEXT: lsl x16, x9, x8 +; CHECK-GI-NEXT: lsr x12, x9, x12 +; CHECK-GI-NEXT: sub x17, x8, #64 +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsl x8, x9, x17 +; CHECK-GI-NEXT: sub x11, x11, x14 +; CHECK-GI-NEXT: mvn x15, x4 +; CHECK-GI-NEXT: orr x12, x12, x13 +; CHECK-GI-NEXT: csel x9, x16, xzr, lo +; CHECK-GI-NEXT: sub x13, x14, #64 +; CHECK-GI-NEXT: lsr x16, x2, x14 +; CHECK-GI-NEXT: lsl x11, x3, x11 +; CHECK-GI-NEXT: csel x8, x12, x8, lo +; CHECK-GI-NEXT: tst x15, #0x7f +; CHECK-GI-NEXT: lsr x12, x3, x13 +; CHECK-GI-NEXT: csel x8, x10, x8, eq +; CHECK-GI-NEXT: orr x10, x16, x11 +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: lsr x11, x3, x14 +; CHECK-GI-NEXT: csel x10, x10, x12, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: csel x9, x2, x9, eq -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: csel x10, x10, xzr, lo -; CHECK-GI-NEXT: orr x0, x11, x9 -; CHECK-GI-NEXT: orr x1, x8, x10 +; CHECK-GI-NEXT: csel x10, x2, x10, eq +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: csel x11, x11, xzr, lo +; CHECK-GI-NEXT: orr x0, x9, x10 +; CHECK-GI-NEXT: orr x1, x8, x11 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c) @@ -720,10 +718,9 @@ define i128 
@rotl_i128_c(i128 %a) { ; ; CHECK-GI-LABEL: rotl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x1, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 +; CHECK-GI-NEXT: extr x8, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x1, #61 +; CHECK-GI-NEXT: mov x1, x8 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3) @@ -731,20 +728,12 @@ entry: } define i128 @rotr_i128_c(i128 %a) { -; CHECK-SD-LABEL: rotr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x1, x0, #3 -; CHECK-SD-NEXT: extr x1, x0, x1, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: rotr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x0, #61 -; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x1, lsr #3 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: rotr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x1, x0, #3 +; CHECK-NEXT: extr x1, x0, x1, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3) ret i128 %d @@ -868,10 +857,8 @@ define i128 @fshl_i128_c(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: fshl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x3, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x3, #61 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3) @@ -879,21 +866,12 @@ entry: } define i128 @fshr_i128_c(i128 %a, i128 %b) { -; CHECK-SD-LABEL: fshr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x3, x2, #3 -; CHECK-SD-NEXT: extr x1, x0, x3, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x3, #61 -; CHECK-GI-NEXT: lsr x9, x3, #3 -; CHECK-GI-NEXT: orr x8, x8, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x3, x2, #3 +; CHECK-NEXT: extr x1, x0, x3, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3) ret i128 %d @@ -3013,75 +2991,73 @@ define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w19, -16 ; CHECK-GI-NEXT: ldr x11, [sp, #16] -; CHECK-GI-NEXT: mov w10, #64 // =0x40 +; CHECK-GI-NEXT: mov w9, #64 // =0x40 ; CHECK-GI-NEXT: ldr x12, [sp, #32] ; CHECK-GI-NEXT: mov w13, #127 // =0x7f -; CHECK-GI-NEXT: and x9, x11, #0x7f +; CHECK-GI-NEXT: and x8, x11, #0x7f ; CHECK-GI-NEXT: and x14, x12, #0x7f -; CHECK-GI-NEXT: mvn x15, x11 -; CHECK-GI-NEXT: sub x8, x10, x9 -; CHECK-GI-NEXT: sub x16, x9, #64 -; CHECK-GI-NEXT: lsl x19, x1, x9 -; CHECK-GI-NEXT: lsr x18, x0, x8 -; CHECK-GI-NEXT: lsl x17, x0, x9 -; CHECK-GI-NEXT: lsl x16, x0, x16 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: bic x0, x13, x11 -; CHECK-GI-NEXT: mvn x8, x12 -; CHECK-GI-NEXT: orr x18, x18, x19 -; CHECK-GI-NEXT: csel x9, x17, xzr, lo +; CHECK-GI-NEXT: mvn x18, x11 +; CHECK-GI-NEXT: sub x10, x9, x8 +; CHECK-GI-NEXT: sub x15, x8, #64 +; CHECK-GI-NEXT: lsl x17, x1, x8 +; CHECK-GI-NEXT: lsr x16, x0, x10 +; CHECK-GI-NEXT: lsl x15, x0, x15 +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsl x19, x0, x8 +; 
CHECK-GI-NEXT: lsl x0, x3, x14 +; CHECK-GI-NEXT: mvn x10, x12 +; CHECK-GI-NEXT: orr x16, x16, x17 ; CHECK-GI-NEXT: sub x17, x14, #64 -; CHECK-GI-NEXT: csel x16, x18, x16, lo +; CHECK-GI-NEXT: csel x15, x16, x15, lo +; CHECK-GI-NEXT: sub x16, x9, x14 +; CHECK-GI-NEXT: csel x8, x19, xzr, lo +; CHECK-GI-NEXT: lsr x16, x2, x16 ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x10, x14 -; CHECK-GI-NEXT: lsr x11, x2, x11 -; CHECK-GI-NEXT: lsl x18, x3, x14 -; CHECK-GI-NEXT: csel x16, x1, x16, eq -; CHECK-GI-NEXT: lsl x1, x2, x14 +; CHECK-GI-NEXT: lsl x19, x2, x14 ; CHECK-GI-NEXT: lsl x17, x2, x17 +; CHECK-GI-NEXT: csel x15, x1, x15, eq ; CHECK-GI-NEXT: cmp x14, #64 -; CHECK-GI-NEXT: lsl x14, x5, #63 -; CHECK-GI-NEXT: orr x11, x11, x18 -; CHECK-GI-NEXT: bic x13, x13, x12 -; CHECK-GI-NEXT: csel x18, x1, xzr, lo -; CHECK-GI-NEXT: csel x11, x11, x17, lo +; CHECK-GI-NEXT: orr x16, x16, x0 +; CHECK-GI-NEXT: bic x11, x13, x11 +; CHECK-GI-NEXT: csel x14, x19, xzr, lo +; CHECK-GI-NEXT: csel x16, x16, x17, lo ; CHECK-GI-NEXT: tst x12, #0x7f -; CHECK-GI-NEXT: lsr x12, x5, #1 -; CHECK-GI-NEXT: orr x14, x14, x4, lsr #1 -; CHECK-GI-NEXT: lsl x17, x7, #63 -; CHECK-GI-NEXT: sub x1, x10, x0 -; CHECK-GI-NEXT: csel x11, x3, x11, eq -; CHECK-GI-NEXT: sub x2, x0, #64 -; CHECK-GI-NEXT: lsr x3, x14, x0 -; CHECK-GI-NEXT: lsl x1, x12, x1 -; CHECK-GI-NEXT: lsr x4, x7, #1 -; CHECK-GI-NEXT: orr x17, x17, x6, lsr #1 -; CHECK-GI-NEXT: lsr x2, x12, x2 -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: orr x1, x3, x1 -; CHECK-GI-NEXT: sub x10, x10, x13 -; CHECK-GI-NEXT: lsr x12, x12, x0 -; CHECK-GI-NEXT: csel x1, x1, x2, lo -; CHECK-GI-NEXT: tst x15, #0x7f -; CHECK-GI-NEXT: sub x15, x13, #64 -; CHECK-GI-NEXT: lsr x2, x17, x13 -; CHECK-GI-NEXT: lsl x10, x4, x10 -; CHECK-GI-NEXT: csel x14, x14, x1, eq -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: lsr x15, x4, x15 -; CHECK-GI-NEXT: lsr x0, x4, x13 -; CHECK-GI-NEXT: csel x12, x12, xzr, lo -; CHECK-GI-NEXT: orr x10, x2, x10 -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x10, x15, lo -; CHECK-GI-NEXT: tst x8, #0x7f -; CHECK-GI-NEXT: orr x1, x16, x12 -; CHECK-GI-NEXT: csel x8, x17, x10, eq -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x0, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x14 -; CHECK-GI-NEXT: orr x2, x18, x8 -; CHECK-GI-NEXT: orr x3, x11, x10 +; CHECK-GI-NEXT: lsr x17, x5, #1 +; CHECK-GI-NEXT: extr x0, x5, x4, #1 +; CHECK-GI-NEXT: bic x12, x13, x12 +; CHECK-GI-NEXT: csel x13, x3, x16, eq +; CHECK-GI-NEXT: sub x16, x9, x11 +; CHECK-GI-NEXT: sub x1, x11, #64 +; CHECK-GI-NEXT: lsr x3, x7, #1 +; CHECK-GI-NEXT: lsr x2, x0, x11 +; CHECK-GI-NEXT: lsl x16, x17, x16 +; CHECK-GI-NEXT: extr x4, x7, x6, #1 +; CHECK-GI-NEXT: lsr x1, x17, x1 +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: sub x9, x9, x12 +; CHECK-GI-NEXT: orr x16, x2, x16 +; CHECK-GI-NEXT: lsr x17, x17, x11 +; CHECK-GI-NEXT: lsl x9, x3, x9 +; CHECK-GI-NEXT: csel x16, x16, x1, lo +; CHECK-GI-NEXT: tst x18, #0x7f +; CHECK-GI-NEXT: sub x18, x12, #64 +; CHECK-GI-NEXT: lsr x1, x4, x12 +; CHECK-GI-NEXT: csel x16, x0, x16, eq +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: lsr x11, x3, x18 +; CHECK-GI-NEXT: csel x17, x17, xzr, lo +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: orr x9, x1, x9 +; CHECK-GI-NEXT: lsr x18, x3, x12 +; CHECK-GI-NEXT: orr x0, x8, x16 +; CHECK-GI-NEXT: csel x9, x9, x11, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: orr x1, x15, x17 +; CHECK-GI-NEXT: csel x9, x4, x9, eq +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: csel x10, x18, xzr, lo +; CHECK-GI-NEXT: orr x2, 
x14, x9 +; CHECK-GI-NEXT: orr x3, x13, x10 ; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload ; CHECK-GI-NEXT: ret entry: @@ -3125,75 +3101,73 @@ define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-LABEL: fshr_v2i128: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr x9, [sp] -; CHECK-GI-NEXT: lsl x12, x1, #1 -; CHECK-GI-NEXT: mov w11, #127 // =0x7f -; CHECK-GI-NEXT: mov w14, #64 // =0x40 -; CHECK-GI-NEXT: lsl x15, x0, #1 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: mov w12, #64 // =0x40 +; CHECK-GI-NEXT: lsl x13, x0, #1 +; CHECK-GI-NEXT: extr x14, x1, x0, #63 ; CHECK-GI-NEXT: ldr x8, [sp, #16] -; CHECK-GI-NEXT: bic x13, x11, x9 -; CHECK-GI-NEXT: orr x12, x12, x0, lsr #63 -; CHECK-GI-NEXT: lsl x1, x3, #1 -; CHECK-GI-NEXT: sub x17, x14, x13 -; CHECK-GI-NEXT: sub x18, x13, #64 -; CHECK-GI-NEXT: lsl x3, x15, x13 -; CHECK-GI-NEXT: lsr x17, x15, x17 -; CHECK-GI-NEXT: lsl x0, x12, x13 -; CHECK-GI-NEXT: lsl x15, x15, x18 -; CHECK-GI-NEXT: bic x11, x11, x8 +; CHECK-GI-NEXT: bic x11, x10, x9 +; CHECK-GI-NEXT: mvn x16, x9 +; CHECK-GI-NEXT: and x15, x9, #0x7f +; CHECK-GI-NEXT: sub x17, x12, x11 +; CHECK-GI-NEXT: sub x18, x11, #64 +; CHECK-GI-NEXT: lsl x0, x14, x11 +; CHECK-GI-NEXT: lsr x17, x13, x17 +; CHECK-GI-NEXT: lsl x1, x13, x11 +; CHECK-GI-NEXT: lsl x13, x13, x18 +; CHECK-GI-NEXT: bic x10, x10, x8 ; CHECK-GI-NEXT: lsl x18, x2, #1 -; CHECK-GI-NEXT: cmp x13, #64 +; CHECK-GI-NEXT: cmp x11, #64 ; CHECK-GI-NEXT: orr x17, x17, x0 -; CHECK-GI-NEXT: orr x13, x1, x2, lsr #63 -; CHECK-GI-NEXT: mvn x16, x9 -; CHECK-GI-NEXT: csel x15, x17, x15, lo -; CHECK-GI-NEXT: sub x17, x14, x11 -; CHECK-GI-NEXT: csel x0, x3, xzr, lo +; CHECK-GI-NEXT: extr x11, x3, x2, #63 +; CHECK-GI-NEXT: csel x0, x1, xzr, lo +; CHECK-GI-NEXT: csel x13, x17, x13, lo +; CHECK-GI-NEXT: sub x17, x12, x10 ; CHECK-GI-NEXT: tst x16, #0x7f -; CHECK-GI-NEXT: sub x16, x11, #64 +; CHECK-GI-NEXT: sub x16, x10, #64 ; CHECK-GI-NEXT: lsr x17, x18, x17 -; CHECK-GI-NEXT: lsl x2, x13, x11 -; CHECK-GI-NEXT: lsl x1, x18, x11 -; CHECK-GI-NEXT: csel x12, x12, x15, eq -; CHECK-GI-NEXT: lsl x15, x18, x16 -; CHECK-GI-NEXT: and x10, x9, #0x7f -; CHECK-GI-NEXT: cmp x11, #64 -; CHECK-GI-NEXT: mvn x11, x8 +; CHECK-GI-NEXT: lsl x2, x11, x10 +; CHECK-GI-NEXT: lsl x1, x18, x10 +; CHECK-GI-NEXT: csel x13, x14, x13, eq +; CHECK-GI-NEXT: lsl x14, x18, x16 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: mvn x10, x8 ; CHECK-GI-NEXT: orr x16, x17, x2 ; CHECK-GI-NEXT: csel x17, x1, xzr, lo -; CHECK-GI-NEXT: csel x15, x16, x15, lo -; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x14, x10 -; CHECK-GI-NEXT: sub x16, x10, #64 -; CHECK-GI-NEXT: lsr x18, x4, x10 -; CHECK-GI-NEXT: lsl x11, x5, x11 -; CHECK-GI-NEXT: csel x13, x13, x15, eq -; CHECK-GI-NEXT: lsr x15, x5, x16 +; CHECK-GI-NEXT: csel x14, x16, x14, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: sub x10, x12, x15 +; CHECK-GI-NEXT: sub x16, x15, #64 +; CHECK-GI-NEXT: lsr x18, x4, x15 +; CHECK-GI-NEXT: lsl x10, x5, x10 +; CHECK-GI-NEXT: csel x11, x11, x14, eq +; CHECK-GI-NEXT: lsr x14, x5, x16 ; CHECK-GI-NEXT: and x1, x8, #0x7f -; CHECK-GI-NEXT: orr x11, x18, x11 -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x16, x5, x10 -; CHECK-GI-NEXT: csel x11, x11, x15, lo +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x16, x5, x15 +; CHECK-GI-NEXT: orr x10, x18, x10 +; CHECK-GI-NEXT: csel x10, x10, x14, lo ; CHECK-GI-NEXT: tst x9, #0x7f -; CHECK-GI-NEXT: sub x9, x14, x1 -; CHECK-GI-NEXT: sub x14, x1, #64 -; CHECK-GI-NEXT: lsr x15, x6, x1 
+; CHECK-GI-NEXT: sub x9, x12, x1 +; CHECK-GI-NEXT: sub x12, x1, #64 +; CHECK-GI-NEXT: lsr x14, x6, x1 ; CHECK-GI-NEXT: lsl x9, x7, x9 -; CHECK-GI-NEXT: csel x11, x4, x11, eq -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x10, x7, x14 -; CHECK-GI-NEXT: csel x14, x16, xzr, lo -; CHECK-GI-NEXT: orr x9, x15, x9 +; CHECK-GI-NEXT: csel x10, x4, x10, eq +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x12, x7, x12 +; CHECK-GI-NEXT: csel x15, x16, xzr, lo +; CHECK-GI-NEXT: orr x9, x14, x9 ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: lsr x15, x7, x1 -; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: lsr x14, x7, x1 +; CHECK-GI-NEXT: csel x9, x9, x12, lo ; CHECK-GI-NEXT: tst x8, #0x7f ; CHECK-GI-NEXT: csel x8, x6, x9, eq ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: orr x0, x0, x11 -; CHECK-GI-NEXT: csel x9, x15, xzr, lo -; CHECK-GI-NEXT: orr x1, x12, x14 +; CHECK-GI-NEXT: orr x0, x0, x10 +; CHECK-GI-NEXT: csel x9, x14, xzr, lo +; CHECK-GI-NEXT: orr x1, x13, x15 ; CHECK-GI-NEXT: orr x2, x17, x8 -; CHECK-GI-NEXT: orr x3, x13, x9 +; CHECK-GI-NEXT: orr x3, x11, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) @@ -3863,15 +3837,12 @@ define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotl_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x3, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x1, #61 +; CHECK-GI-NEXT: extr x9, x3, x2, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x2, x2, x3, #61 ; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x3, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>) @@ -3891,14 +3862,12 @@ define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotr_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x3, #61 -; CHECK-GI-NEXT: lsl x10, x0, #61 -; CHECK-GI-NEXT: lsl x11, x2, #61 -; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x2, x9, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x1, lsr #3 -; CHECK-GI-NEXT: orr x3, x11, x3, lsr #3 +; CHECK-GI-NEXT: extr x8, x1, x0, #3 +; CHECK-GI-NEXT: extr x9, x3, x2, #3 +; CHECK-GI-NEXT: extr x1, x0, x1, #3 +; CHECK-GI-NEXT: extr x3, x2, x3, #3 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x2, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>) @@ -4464,14 +4433,10 @@ define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) { ; ; CHECK-GI-LABEL: fshl_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x7, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x5, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x3, x3, x2, #61 +; CHECK-GI-NEXT: extr x2, x2, x7, #61 ; CHECK-GI-NEXT: mov x0, x8 ; CHECK-GI-NEXT: ret entry: @@ -4480,29 +4445,15 @@ entry: } define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) { -; CHECK-SD-LABEL: fshr_v2i128_c: -; 
CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x5, x4, #3 -; CHECK-SD-NEXT: extr x9, x7, x6, #3 -; CHECK-SD-NEXT: extr x1, x0, x5, #3 -; CHECK-SD-NEXT: extr x3, x2, x7, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: mov x2, x9 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_v2i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x7, #61 -; CHECK-GI-NEXT: lsr x10, x5, #3 -; CHECK-GI-NEXT: lsr x11, x7, #3 -; CHECK-GI-NEXT: orr x8, x8, x4, lsr #3 -; CHECK-GI-NEXT: orr x9, x9, x6, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x0, lsl #61 -; CHECK-GI-NEXT: orr x3, x11, x2, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: mov x2, x9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_v2i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x5, x4, #3 +; CHECK-NEXT: extr x9, x7, x6, #3 +; CHECK-NEXT: extr x1, x0, x5, #3 +; CHECK-NEXT: extr x3, x2, x7, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, x9 +; CHECK-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>) ret <2 x i128> %d diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll index f9fd2ad..90fb102 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -85,41 +85,40 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, #1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub x15, x10, #64 +; CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: 
cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %f diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index 1cb92e4..87b1108 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -559,20 +559,18 @@ define i128 @ui128_7(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: subs x10, x0, x9 ; CHECK-GI-NEXT: sbc x11, x1, x8 -; CHECK-GI-NEXT: lsl x12, x11, #63 +; CHECK-GI-NEXT: extr x10, x11, x10, #1 ; CHECK-GI-NEXT: lsr x11, x11, #1 -; CHECK-GI-NEXT: orr x10, x12, x10, lsr #1 ; CHECK-GI-NEXT: adds x9, x10, x9 +; CHECK-GI-NEXT: mov w10, #7 // =0x7 ; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x10, x8, #62 +; CHECK-GI-NEXT: extr x9, x8, x9, #2 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #2 -; CHECK-GI-NEXT: mov w10, #7 // =0x7 -; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: umulh x10, x9, x10 ; CHECK-GI-NEXT: lsl x11, x9, #3 -; CHECK-GI-NEXT: sub x8, x12, x8 +; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: sub x9, x11, x9 +; CHECK-GI-NEXT: sub x8, x12, x8 ; CHECK-GI-NEXT: subs x0, x0, x9 ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: sbc x1, x1, x8 @@ -640,10 +638,9 @@ define i128 @ui128_100(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x10, x11, x12 ; CHECK-GI-NEXT: add x8, x8, x14 ; CHECK-GI-NEXT: add x8, x8, x10 -; CHECK-GI-NEXT: lsl x10, x8, #60 -; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #4 ; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: extr x9, x8, x9, #4 +; CHECK-GI-NEXT: lsr x8, x8, #4 ; CHECK-GI-NEXT: umulh x11, x9, x10 ; CHECK-GI-NEXT: mul x9, x9, x10 ; CHECK-GI-NEXT: madd x8, x8, x10, x11 @@ -3317,36 +3314,32 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: sbc x14, x1, x12 ; CHECK-GI-NEXT: add x8, x8, x13 ; CHECK-GI-NEXT: subs x13, x2, x10 -; CHECK-GI-NEXT: lsl x15, x14, #63 -; CHECK-GI-NEXT: sbc x16, x3, x8 +; CHECK-GI-NEXT: extr x9, x14, x9, #1 +; CHECK-GI-NEXT: sbc x15, x3, x8 ; CHECK-GI-NEXT: lsr x14, x14, #1 -; CHECK-GI-NEXT: orr x9, x15, x9, lsr #1 -; CHECK-GI-NEXT: lsl x15, x16, #63 -; CHECK-GI-NEXT: orr x13, x15, x13, lsr #1 +; CHECK-GI-NEXT: extr x13, x15, x13, #1 ; CHECK-GI-NEXT: adds x9, x9, x11 -; CHECK-GI-NEXT: lsr x11, x16, #1 +; CHECK-GI-NEXT: lsr x11, x15, #1 ; CHECK-GI-NEXT: adc x12, x14, x12 ; CHECK-GI-NEXT: adds x10, x13, x10 -; CHECK-GI-NEXT: lsl x13, x12, #62 -; CHECK-GI-NEXT: lsr x12, x12, #2 -; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x11, x8, #62 -; CHECK-GI-NEXT: orr x9, x13, x9, lsr #2 +; CHECK-GI-NEXT: extr x9, x12, x9, #2 ; CHECK-GI-NEXT: mov w13, #7 // =0x7 +; CHECK-GI-NEXT: adc x8, x11, x8 +; CHECK-GI-NEXT: lsr x11, x12, #2 +; CHECK-GI-NEXT: extr x10, x8, x10, #2 +; CHECK-GI-NEXT: umulh x12, x9, x13 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: lsl x14, x12, #3 -; CHECK-GI-NEXT: orr x10, x11, x10, lsr #2 -; CHECK-GI-NEXT: umulh x11, x9, x13 +; CHECK-GI-NEXT: lsl x14, x11, #3 ; CHECK-GI-NEXT: lsl x15, x9, #3 -; CHECK-GI-NEXT: sub x12, x14, x12 -; CHECK-GI-NEXT: lsl x16, x8, #3 ; CHECK-GI-NEXT: umulh x13, x10, x13 +; CHECK-GI-NEXT: lsl x16, x8, #3 +; CHECK-GI-NEXT: sub x11, x14, x11 ; CHECK-GI-NEXT: lsl x14, x10, #3 ; CHECK-GI-NEXT: sub x9, x15, x9 ; CHECK-GI-NEXT: sub x8, x16, x8 ; CHECK-GI-NEXT: subs x0, x0, x9 +; CHECK-GI-NEXT: add x11, x11, x12 ; 
CHECK-GI-NEXT: sub x10, x14, x10 -; CHECK-GI-NEXT: add x11, x12, x11 ; CHECK-GI-NEXT: sbc x1, x1, x11 ; CHECK-GI-NEXT: subs x2, x2, x10 ; CHECK-GI-NEXT: add x8, x8, x13 @@ -3394,9 +3387,10 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov x10, #23593 // =0x5c29 ; CHECK-GI-NEXT: mov x8, #62914 // =0xf5c2 -; CHECK-GI-NEXT: sub x18, x0, x0 +; CHECK-GI-NEXT: and x5, xzr, #0x1 ; CHECK-GI-NEXT: movk x10, #49807, lsl #16 ; CHECK-GI-NEXT: movk x8, #23592, lsl #16 +; CHECK-GI-NEXT: umulh x18, x0, xzr ; CHECK-GI-NEXT: movk x10, #10485, lsl #32 ; CHECK-GI-NEXT: movk x8, #49807, lsl #32 ; CHECK-GI-NEXT: movk x10, #36700, lsl #48 @@ -3409,84 +3403,81 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: umulh x15, x1, x10 ; CHECK-GI-NEXT: cset w12, hs ; CHECK-GI-NEXT: cmn x11, x13 -; CHECK-GI-NEXT: and x11, x12, #0x1 -; CHECK-GI-NEXT: umulh x16, x0, x8 -; CHECK-GI-NEXT: cset w12, hs +; CHECK-GI-NEXT: sub x13, x0, x0 ; CHECK-GI-NEXT: and x12, x12, #0x1 -; CHECK-GI-NEXT: add x14, x14, x18 -; CHECK-GI-NEXT: add x11, x11, x12 -; CHECK-GI-NEXT: and x12, xzr, #0x1 +; CHECK-GI-NEXT: umulh x16, x0, x8 +; CHECK-GI-NEXT: cset w11, hs +; CHECK-GI-NEXT: add x13, x14, x13 +; CHECK-GI-NEXT: and x11, x11, #0x1 +; CHECK-GI-NEXT: and x14, xzr, #0x1 ; CHECK-GI-NEXT: umulh x9, xzr, x10 -; CHECK-GI-NEXT: adds x14, x14, x15 -; CHECK-GI-NEXT: and x15, xzr, #0x1 +; CHECK-GI-NEXT: add x11, x12, x11 +; CHECK-GI-NEXT: add x12, x5, x14 +; CHECK-GI-NEXT: adds x13, x13, x15 ; CHECK-GI-NEXT: umulh x17, x1, x8 -; CHECK-GI-NEXT: cset w4, hs -; CHECK-GI-NEXT: add x15, x12, x15 -; CHECK-GI-NEXT: adds x12, x14, x16 -; CHECK-GI-NEXT: and x4, x4, #0x1 -; CHECK-GI-NEXT: mul x18, x3, x10 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: adds x12, x12, x11 -; CHECK-GI-NEXT: add x11, x15, x4 ; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: cset w15, hs -; CHECK-GI-NEXT: mul x5, x2, x8 -; CHECK-GI-NEXT: add x11, x11, x14 -; CHECK-GI-NEXT: and x14, x15, #0x1 -; CHECK-GI-NEXT: add x17, x9, x17 -; CHECK-GI-NEXT: add x14, x11, x14 -; CHECK-GI-NEXT: mov w11, #100 // =0x64 -; CHECK-GI-NEXT: umulh x13, x0, xzr -; CHECK-GI-NEXT: umulh x16, x2, x10 -; CHECK-GI-NEXT: adds x18, x18, x5 -; CHECK-GI-NEXT: mul x15, x3, x8 -; CHECK-GI-NEXT: add x13, x17, x13 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x10, x3, x10 -; CHECK-GI-NEXT: add x13, x13, x14 -; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: cmn x18, x16 -; CHECK-GI-NEXT: sub x18, x2, x2 -; CHECK-GI-NEXT: umulh x16, x2, x8 +; CHECK-GI-NEXT: adds x13, x13, x16 +; CHECK-GI-NEXT: mul x4, x3, x10 +; CHECK-GI-NEXT: add x12, x12, x14 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: add x15, x15, x18 +; CHECK-GI-NEXT: adds x11, x13, x11 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: mul x15, x2, x8 +; CHECK-GI-NEXT: cset w14, hs +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: add x14, x9, x17 +; CHECK-GI-NEXT: sub x17, x2, x2 +; CHECK-GI-NEXT: umulh x16, x2, x10 +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: add x13, x14, x18 +; CHECK-GI-NEXT: add x12, x13, x12 ; CHECK-GI-NEXT: and x18, xzr, #0x1 -; CHECK-GI-NEXT: add x14, x17, x14 +; CHECK-GI-NEXT: mul x5, x3, x8 +; CHECK-GI-NEXT: extr x11, x12, x11, #4 +; CHECK-GI-NEXT: adds x13, x4, x15 +; CHECK-GI-NEXT: umulh x14, x3, x10 +; CHECK-GI-NEXT: cset w15, hs +; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: cmn x13, x16 +; CHECK-GI-NEXT: and 
x15, x15, #0x1 +; CHECK-GI-NEXT: umulh x13, x2, x8 +; CHECK-GI-NEXT: cset w16, hs +; CHECK-GI-NEXT: add x17, x5, x17 +; CHECK-GI-NEXT: and x16, x16, #0x1 ; CHECK-GI-NEXT: umulh x8, x3, x8 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: adds x14, x17, x14 ; CHECK-GI-NEXT: and x17, xzr, #0x1 -; CHECK-GI-NEXT: adds x10, x15, x10 -; CHECK-GI-NEXT: add x15, x17, x18 +; CHECK-GI-NEXT: add x16, x18, x17 ; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x18, x2, xzr +; CHECK-GI-NEXT: adds x13, x14, x13 +; CHECK-GI-NEXT: umulh x14, x2, xzr ; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: adds x10, x10, x16 -; CHECK-GI-NEXT: lsl x16, x13, #60 -; CHECK-GI-NEXT: add x15, x15, x17 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: adds x10, x10, x14 -; CHECK-GI-NEXT: and x14, x17, #0x1 +; CHECK-GI-NEXT: cset w18, hs +; CHECK-GI-NEXT: adds x13, x13, x15 +; CHECK-GI-NEXT: add x15, x16, x17 +; CHECK-GI-NEXT: and x16, x18, #0x1 ; CHECK-GI-NEXT: cset w17, hs ; CHECK-GI-NEXT: add x8, x9, x8 -; CHECK-GI-NEXT: add x14, x15, x14 -; CHECK-GI-NEXT: and x15, x17, #0x1 -; CHECK-GI-NEXT: orr x12, x16, x12, lsr #4 -; CHECK-GI-NEXT: add x9, x14, x15 -; CHECK-GI-NEXT: add x8, x8, x18 -; CHECK-GI-NEXT: add x8, x8, x9 -; CHECK-GI-NEXT: lsr x9, x13, #4 -; CHECK-GI-NEXT: umulh x14, x12, x11 -; CHECK-GI-NEXT: lsl x13, x8, #60 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: and x16, x17, #0x1 +; CHECK-GI-NEXT: lsr x9, x12, #4 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: umulh x17, x11, x10 +; CHECK-GI-NEXT: add x8, x8, x14 +; CHECK-GI-NEXT: add x8, x8, x15 +; CHECK-GI-NEXT: mul x11, x11, x10 +; CHECK-GI-NEXT: extr x12, x8, x13, #4 ; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: mul x12, x12, x11 -; CHECK-GI-NEXT: orr x10, x13, x10, lsr #4 -; CHECK-GI-NEXT: madd x9, x9, x11, x14 -; CHECK-GI-NEXT: umulh x13, x10, x11 -; CHECK-GI-NEXT: subs x0, x0, x12 -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: madd x9, x9, x10, x17 +; CHECK-GI-NEXT: umulh x13, x12, x10 +; CHECK-GI-NEXT: subs x0, x0, x11 +; CHECK-GI-NEXT: mul x12, x12, x10 ; CHECK-GI-NEXT: sbc x1, x1, x9 -; CHECK-GI-NEXT: madd x8, x8, x11, x13 -; CHECK-GI-NEXT: subs x2, x2, x10 +; CHECK-GI-NEXT: madd x8, x8, x10, x13 +; CHECK-GI-NEXT: subs x2, x2, x12 ; CHECK-GI-NEXT: sbc x3, x3, x8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 2583a93..5b81f5d 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -426,3 +426,21 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin call void %callee() ret void } + +define void @disable_tailcallopt(ptr %callee) "aarch64_inout_zt0" nounwind { +; CHECK-COMMON-LABEL: disable_tailcallopt: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #80 +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x19, sp +; CHECK-COMMON-NEXT: str zt0, [x19] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x19] +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #80 +; CHECK-COMMON-NEXT: ret + tail call void %callee() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll index 221e2fd..09e1fca 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll @@ -1200,7 +1200,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1213,7 +1213,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1226,7 +1226,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX900-NEXT: s_mov_b32 s5, s7 ; GFX900-NEXT: s_mov_b32 s6, s8 ; GFX900-NEXT: s_mov_b32 s7, s9 -; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1239,7 +1239,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s8 ; GFX90A-NEXT: s_mov_b32 s7, s9 -; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm ; ; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1252,7 +1252,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9 -; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm ; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1265,7 +1265,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_endpgm main_body: %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3194,7 +3194,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3207,7 +3207,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] 
dmask:0xf unorm ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3220,7 +3220,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX900-NEXT: s_mov_b32 s5, s7 ; GFX900-NEXT: s_mov_b32 s6, s8 ; GFX900-NEXT: s_mov_b32 s7, s9 -; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3233,7 +3233,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s8 ; GFX90A-NEXT: s_mov_b32 s7, s9 -; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX90A-NEXT: s_endpgm ; ; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3246,7 +3246,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9 -; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3259,7 +3259,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_endpgm main_body: %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir index 292fa4b..4f160b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir @@ -25,6 +25,7 @@ body: | ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si]].sub0 ; GFX6-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i32_1d ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} @@ -35,6 +36,7 @@ body: | ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_vi]].sub0 ; GFX8-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i32_1d ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -45,6 +47,7 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_]].sub0 ; GFX10-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i32_1d ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -55,6 +58,7 @@ body: | ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_]].sub0 ; GFX11-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX11-NEXT: 
SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i32_1d ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX12-NEXT: {{ $}} @@ -89,39 +93,43 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX6-NEXT: S_ENDPGM 0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX8-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -150,6 +158,7 @@ body: | ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si]].sub0_sub1 ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX8-LABEL: name: atomic_cmpswap_i64_1d ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX8-NEXT: {{ $}} @@ -160,6 +169,7 @@ body: | ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi]].sub0_sub1 ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX10-LABEL: name: atomic_cmpswap_i64_1d ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX10-NEXT: {{ $}} @@ -170,6 +180,7 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_]].sub0_sub1 ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX11-LABEL: name: atomic_cmpswap_i64_1d ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX11-NEXT: {{ $}} @@ -180,6 +191,7 @@ body: | ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_]].sub0_sub1 ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX12-LABEL: name: atomic_cmpswap_i64_1d ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX12-NEXT: {{ $}} @@ -214,39 +226,43 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX6-NEXT: S_ENDPGM 0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile 
dereferenceable load store (s64), addrspace 8) + ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX8-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:vgpr(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll index 6c4f504..33ce278 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll @@ -23,7 +23,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 
@llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -75,7 +77,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrs ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -126,6 +130,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1) ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -175,6 +181,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspac ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -225,7 +233,9 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1) ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0 @@ -292,7 +302,9 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3 ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]] @@ -359,7 +371,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace ; PASS-CHECK: [[WHILE]]: ; 
PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -410,6 +424,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i32 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll index aa11574..a3e42e5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll @@ -595,6 +595,8 @@ define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[C]]) +; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0 ; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 ; PASS-CHECK-NEXT: ret void ; @@ -623,6 +625,8 @@ define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[C]]) +; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 ; PASS-CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll index 49607e3..83f0229 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll @@ -92,8 +92,7 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 -; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -106,8 +105,7 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] 
dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -123,9 +121,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 % ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a1 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm %cmp = call i32 asm "; def $0", "=a"() %swap = call i32 asm "; def $0", "=a"() @@ -139,9 +135,7 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm %data = call i64 asm "; def $0", "=a"() %unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -154,14 +148,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ; def a[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm ; GFX90A-NEXT: s_endpgm %cmp = call i64 asm "; def $0", "=a"() %swap = call i64 asm "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll new file mode 100644 index 0000000..6c58a1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll @@ -0,0 +1,581 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS-GISE %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISE %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s + +define amdgpu_ps void @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d: +; GFX12: ; %bb.0: +; 
GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d_i64: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d_i64: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_float(<8 x i32> inreg %rsrc, float %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d_float: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d_float: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d_float: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d_float: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.image.atomic.swap.1d.f32.i32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_add_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_sub_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_sub_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_sub_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: 
image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_smin_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_smin_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_smin_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_smin_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_umin_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_umin_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_umin_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_smax_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_smax_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_smax_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_smax_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_umax_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_umax_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; 
GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_umax_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_and_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_and_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_and_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_or_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_or_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_or_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_xor_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_xor_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_xor_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_inc_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_inc_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_inc v0, v1, 
s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_inc_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_dec_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_dec_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_dec_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_cmpswap_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_cmpswap_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpswap_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d_64: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_cmpswap_1d_64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_cmpswap_1d_64: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpswap_1d_64: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) { +; GFX10PLUS-GISE-LABEL: atomic_add_2d: +; GFX10PLUS-GISE: ; 
%bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) { +; GFX10PLUS-GISE-LABEL: atomic_add_3d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_3d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_3d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_3d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) { +; GFX10PLUS-GISE-LABEL: atomic_add_cube: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_cube: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_cube: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_cube: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) { +; GFX10PLUS-GISE-LABEL: atomic_add_1darray: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1darray: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1darray: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1darray: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], 
s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) { +; GFX10PLUS-GISE-LABEL: atomic_add_2darray: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2darray: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2darray: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2darray: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) { +; GFX10PLUS-GISE-LABEL: atomic_add_2dmsaa: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2dmsaa: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2dmsaa: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2dmsaa: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; GFX10PLUS-GISE-LABEL: atomic_add_2darraymsaa: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2darraymsaa: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2darraymsaa: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2darraymsaa: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_add_1d_slc: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: 
image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1d_slc: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1d_slc: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1d_slc: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll index 3d1d6c8..0ba62e4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll @@ -41,15 +41,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_f16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -79,15 +77,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_f16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -126,15 +122,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -173,15 +167,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -192,15 +184,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll index 7a876f6..3544017 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll @@ -76,6 +76,20 @@ entry: ret i32 %ret } +define noundef i32 @wave_reduce_min(i32 noundef %x) { +entry: + ; CHECK: Function wave_reduce_min : [[WAVE_FLAG]] + %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %x) + ret i32 %ret +} + +define noundef i32 @wave_reduce_umin(i32 noundef %x) { +entry: + ; CHECK: Function wave_reduce_umin : [[WAVE_FLAG]] + %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %x) + ret i32 %ret +} + define void @wave_active_countbits(i1 %expr) { entry: ; CHECK: Function wave_active_countbits : [[WAVE_FLAG]] diff --git a/llvm/test/CodeGen/DirectX/WaveActiveMin.ll b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll new file mode 100644 index 0000000..24fde48 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll 
@@ -0,0 +1,143 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s + +; Test that for scalar values, WaveActiveMin maps down to the DirectX op + +define noundef half @wave_active_min_half(half noundef %expr) { +entry: +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr, i8 2, i8 0){{$}} + %ret = call half @llvm.dx.wave.reduce.min.f16(half %expr) + ret half %ret +} + +define noundef float @wave_active_min_float(float noundef %expr) { +entry: +; CHECK: call float @dx.op.waveActiveOp.f32(i32 119, float %expr, i8 2, i8 0){{$}} + %ret = call float @llvm.dx.wave.reduce.min.f32(float %expr) + ret float %ret +} + +define noundef double @wave_active_min_double(double noundef %expr) { +entry: +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr, i8 2, i8 0){{$}} + %ret = call double @llvm.dx.wave.reduce.min.f64(double %expr) + ret double %ret +} + +define noundef i16 @wave_active_min_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 0){{$}} + %ret = call i16 @llvm.dx.wave.reduce.min.i16(i16 %expr) + ret i16 %ret +} + +define noundef i32 @wave_active_min_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 0){{$}} + %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %expr) + ret i32 %ret +} + +define noundef i64 @wave_active_min_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 0){{$}} + %ret = call i64 @llvm.dx.wave.reduce.min.i64(i64 %expr) + ret i64 %ret +} + +define noundef i16 @wave_active_umin_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 1){{$}} + %ret = call i16 @llvm.dx.wave.reduce.umin.i16(i16 %expr) + ret i16 %ret +} + +define noundef i32 @wave_active_umin_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 1){{$}} + %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %expr) + ret i32 %ret +} + +define noundef i64 @wave_active_umin_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 1){{$}} + %ret = call i64 @llvm.dx.wave.reduce.umin.i64(i64 %expr) + ret i64 %ret +} + +declare half @llvm.dx.wave.reduce.min.f16(half) +declare float @llvm.dx.wave.reduce.min.f32(float) +declare double @llvm.dx.wave.reduce.min.f64(double) + +declare i16 @llvm.dx.wave.reduce.min.i16(i16) +declare i32 @llvm.dx.wave.reduce.min.i32(i32) +declare i64 @llvm.dx.wave.reduce.min.i64(i64) + +declare i16 @llvm.dx.wave.reduce.umin.i16(i16) +declare i32 @llvm.dx.wave.reduce.umin.i32(i32) +declare i64 @llvm.dx.wave.reduce.umin.i64(i64) + +; Test that for vector values, WaveActiveMin scalarizes and maps down to the +; DirectX op + +define noundef <2 x half> @wave_active_min_v2half(<2 x half> noundef %expr) { +entry: +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i0, i8 2, i8 0){{$}} +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i1, i8 2, i8 0){{$}} + %ret = call <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half> %expr) + ret <2 x half> %ret +} + +define noundef <3 x i32> @wave_active_min_v3i32(<3 x i32> noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 0){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 0){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 0){{$}} + %ret = call <3 x 
i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32> %expr) + ret <3 x i32> %ret +} + +define noundef <4 x double> @wave_active_min_v4f64(<4 x double> noundef %expr) { +entry: +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i0, i8 2, i8 0){{$}} +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i1, i8 2, i8 0){{$}} +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i2, i8 2, i8 0){{$}} +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i3, i8 2, i8 0){{$}} + %ret = call <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double> %expr) + ret <4 x double> %ret +} + +declare <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half>) +declare <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32>) +declare <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double>) + +define noundef <2 x i16> @wave_active_umin_v2i16(<2 x i16> noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i0, i8 2, i8 1){{$}} +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i1, i8 2, i8 1){{$}} + %ret = call <2 x i16> @llvm.dx.wave.reduce.umin.v2f16(<2 x i16> %expr) + ret <2 x i16> %ret +} + +define noundef <3 x i32> @wave_active_umin_v3i32(<3 x i32> noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 1){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 1){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 1){{$}} + %ret = call <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32> %expr) + ret <3 x i32> %ret +} + +define noundef <4 x i64> @wave_active_umin_v4f64(<4 x i64> noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i0, i8 2, i8 1){{$}} +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i1, i8 2, i8 1){{$}} +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i2, i8 2, i8 1){{$}} +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i3, i8 2, i8 1){{$}} + %ret = call <4 x i64> @llvm.dx.wave.reduce.umin.v4f64(<4 x i64> %expr) + ret <4 x i64> %ret +} + +declare <2 x i16> @llvm.dx.wave.reduce.umin.v2f16(<2 x i16>) +declare <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32>) +declare <4 x i64> @llvm.dx.wave.reduce.umin.v4f64(<4 x i64>) diff --git a/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll b/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll new file mode 100644 index 0000000..fcf1246 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/inst_masked_store_bug1.ll @@ -0,0 +1,94 @@ +;; REQUIRES: asserts +;; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b %s -o - | FileCheck %s +;; Sanity check for lowering masked scatter without assertion errors. 
+ +define void @outer_product(ptr %aptr, ptr %bptr, ptr %cptr, i32 %T, i32 %W) { +entry: + %W.ripple.bcast.splatinsert = insertelement <8 x i32> poison, i32 %W, i64 0 + %W.ripple.bcast.splat = shufflevector <8 x i32> %W.ripple.bcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + %div1194 = lshr i32 %T, 3 + %cmp84.not = icmp ult i32 %T, 8 + br i1 %cmp84.not, label %for.end49, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %div10195 = lshr i32 %W, 3 + %cmp1782.not = icmp ult i32 %W, 8 + %arrayidx27.ripple.LS.dim.slope = mul <8 x i32> %W.ripple.bcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %arrayidx27.ripple.LS.dim.slope.ripple.bcast = shufflevector <8 x i32> %arrayidx27.ripple.LS.dim.slope, <8 x i32> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + %arrayidx27.ripple.LS.slope = add <64 x i32> %arrayidx27.ripple.LS.dim.slope.ripple.bcast, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %invariant.gep196 = getelementptr i8, ptr %cptr, <64 x i32> %arrayidx27.ripple.LS.slope + br label %for.body + +for.body: ; preds = %for.end, %for.body.preheader + %ripple.par.iv.085 = phi i32 [ %add48, %for.end ], [ 0, %for.body.preheader ] + %mul2 = shl i32 %ripple.par.iv.085, 3 + br i1 %cmp1782.not, label %for.end, label %for.body18.lr.ph + +for.body18.lr.ph: ; preds = %for.body + %arrayidx = getelementptr inbounds nuw i8, ptr %aptr, i32 %mul2 + %mul25 = mul i32 %mul2, %W + %gep197 = getelementptr i8, <64 x ptr> %invariant.gep196, i32 %mul25 + br label %for.body18 + +for.body18: ; preds = %for.body18, %for.body18.lr.ph + %ripple.par.iv15.083 = phi i32 [ 0, %for.body18.lr.ph ], [ %add28, %for.body18 ] + %mul19 = shl i32 %ripple.par.iv15.083, 3 + %.ripple.LS.instance184 = load <8 x i8>, ptr %arrayidx, align 1 + %.ripple.LS.instance184.ripple.bcast = shufflevector <8 x i8> %.ripple.LS.instance184, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + %arrayidx21 = getelementptr inbounds nuw i8, ptr %bptr, i32 %mul19 + %.ripple.LS.instance = load <8 x i8>, ptr %arrayidx21, align 1 + %.ripple.LS.instance.ripple.bcast = shufflevector <8 x i8> %.ripple.LS.instance, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, 
i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mul23.ripple.LS.instance = mul <64 x i8> %.ripple.LS.instance.ripple.bcast, %.ripple.LS.instance184.ripple.bcast + %gep = getelementptr i8, <64 x ptr> %gep197, i32 %mul19 + tail call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %mul23.ripple.LS.instance, <64 x ptr> %gep, i32 1, <64 x i1> splat (i1 true)) + %add28 = add nuw i32 %ripple.par.iv15.083, 1 + %cmp17 = icmp ult i32 %add28, %div10195 + br i1 %cmp17, label %for.body18, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body18 + %0 = shl i32 %add28, 3 + br label %for.end + +for.end: ; preds = %for.end.loopexit, %for.body + %ripple.par.iv15.0.lcssa = phi i32 [ 0, %for.body ], [ %0, %for.end.loopexit ] + %add30.ripple.bcast.splatinsert = insertelement <8 x i32> poison, i32 %ripple.par.iv15.0.lcssa, i64 0 + %add30.ripple.bcast.splat = shufflevector <8 x i32> %add30.ripple.bcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + %add30.ripple.LS.instance = or disjoint <8 x i32> %add30.ripple.bcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %cmp32.ripple.LS.instance = icmp ne i32 %ripple.par.iv15.0.lcssa, %W + %cmp32.ripple.LS.instance.ripple.bcast.splatinsert = insertelement <8 x i1> poison, i1 %cmp32.ripple.LS.instance, i64 0 + %cmp32.ripple.LS.instance.ripple.bcast.splat = shufflevector <8 x i1> %cmp32.ripple.LS.instance.ripple.bcast.splatinsert, <8 x i1> poison, <8 x i32> zeroinitializer + %cmp33.ripple.vectorized = icmp ult <8 x i32> %add30.ripple.LS.instance, %W.ripple.bcast.splat + %or.cond.ripple.LS.instance = select <8 x i1> %cmp32.ripple.LS.instance.ripple.bcast.splat, <8 x i1> %cmp33.ripple.vectorized, <8 x i1> zeroinitializer + %or.cond.ripple.LS.instance.ripple.bcast = shufflevector <8 x i1> %or.cond.ripple.LS.instance, <8 x i1> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle = shufflevector <8 x i1> %or.cond.ripple.LS.instance, <8 x i1> <i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 false>, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 15> + %or.cond.ripple.LS.instance.ripple.reducelog2.operator = or <8 x i1> %or.cond.ripple.LS.instance, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle + %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle189 = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator, <8 x i1> <i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison, i1 false, i1 false>, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 14, i32 15> + %or.cond.ripple.LS.instance.ripple.reducelog2.operator190 = or <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle189 + %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle191 = shufflevector <8 x i1> 
%or.cond.ripple.LS.instance.ripple.reducelog2.operator190, <8 x i1> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> + %or.cond.ripple.LS.instance.ripple.reducelog2.operator192 = or <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator190, %or.cond.ripple.LS.instance.ripple.reducelog2.shuffle191 + %ripple.red.extract.ripple.bcast.splat = shufflevector <8 x i1> %or.cond.ripple.LS.instance.ripple.reducelog2.operator192, <8 x i1> poison, <8 x i32> zeroinitializer + %arrayidx34.ripple.branch.clone = getelementptr inbounds nuw i8, ptr %aptr, i32 %mul2 + %.ripple.LS.instance188.ripple.branch.clone.ripple.masked.load = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %arrayidx34.ripple.branch.clone, i32 1, <8 x i1> %ripple.red.extract.ripple.bcast.splat, <8 x i8> poison) + %.ripple.LS.instance188.ripple.bcast.ripple.branch.clone = shufflevector <8 x i8> %.ripple.LS.instance188.ripple.branch.clone.ripple.masked.load, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + %arrayidx36.ripple.branch.clone = getelementptr inbounds nuw i8, ptr %bptr, i32 %ripple.par.iv15.0.lcssa + %.ripple.LS.instance187.ripple.branch.clone.ripple.masked.load = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %arrayidx36.ripple.branch.clone, i32 1, <8 x i1> %or.cond.ripple.LS.instance, <8 x i8> poison) + %.ripple.LS.instance187.ripple.bcast.ripple.branch.clone = shufflevector <8 x i8> %.ripple.LS.instance187.ripple.branch.clone.ripple.masked.load, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mul38.ripple.LS.instance.ripple.branch.clone = mul <64 x i8> %.ripple.LS.instance187.ripple.bcast.ripple.branch.clone, %.ripple.LS.instance188.ripple.bcast.ripple.branch.clone + %mul40.ripple.branch.clone = mul i32 %mul2, %W + %1 = getelementptr i8, ptr %cptr, i32 %mul40.ripple.branch.clone + %arrayidx42.ripple.branch.clone = getelementptr i8, ptr %1, i32 %ripple.par.iv15.0.lcssa + %arrayidx42.ripple.LS.instance.ripple.branch.clone = getelementptr i8, ptr %arrayidx42.ripple.branch.clone, <64 x i32> %arrayidx27.ripple.LS.slope + tail call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> %mul38.ripple.LS.instance.ripple.branch.clone, <64 x ptr> %arrayidx42.ripple.LS.instance.ripple.branch.clone, i32 1, <64 x i1> %or.cond.ripple.LS.instance.ripple.bcast) + %add48 = add nuw i32 %ripple.par.iv.085, 1 + %cmp = icmp ult i32 %add48, %div1194 + br i1 %cmp, label %for.body, label %for.end49 + +for.end49: ; preds = %for.end, %entry + ret void +} + +;; CHECK: outer_product +;; CHECK: {{r[0-9]+}} = lsr({{r[0-9]+}},#3) +;; CHECK: {{q[0-9]+}} = vand({{v[0-9]+}},{{r[0-9]+}}) +;; CHECK: {{v[0-9]+}} = vmux(q0,{{v[0-9]+}},{{v[0-9]+}}) +;; 
CHECK: vmem{{.*}} = {{v[0-9]+}} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll new file mode 100644 index 0000000..48ec98c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 +; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 +; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 +; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 +; CHECK-NEXT: fmin.s $fa4, $fa5, $fa4 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 +; CHECK-NEXT: fmin.s $fa2, $fa5, $fa2 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 +; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 +; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %x + %v1 = load <8 x float>, ptr %y + %r = call <8 x float> @llvm.minnum.v8f32(<8 x float> %v0, <8 x float> %v1) + store <8 x float> %r, ptr %res + ret void +} + +define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 +; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 +; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 +; CHECK-NEXT: fmin.d $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 +; CHECK-NEXT: fmin.d $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 +; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %x + %v1 = load <4 x double>, ptr %y + %r = call <4 x double> @llvm.minnum.v4f64(<4 x double> %v0, <4 x double> %v1) + store <4 x double> %r, ptr %res + ret void +} + +define void @maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 +; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 +; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 +; 
CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 +; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 +; CHECK-NEXT: fmax.s $fa4, $fa5, $fa4 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 +; CHECK-NEXT: fmax.s $fa2, $fa5, $fa2 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 +; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 +; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %x + %v1 = load <8 x float>, ptr %y + %r = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %v0, <8 x float> %v1) + store <8 x float> %r, ptr %res + ret void +} + +define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 +; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 +; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 +; CHECK-NEXT: fmax.d $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 +; CHECK-NEXT: fmax.d $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 +; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %x + %v1 = load <4 x double>, ptr %y + %r = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %v0, <4 x double> %v1) + store <4 x double> %r, ptr %res + ret void +} + +declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) +declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) +declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) +declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll new file mode 100644 index 0000000..27ecb75 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 +; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 +; 
CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 +; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %x + %v1 = load <4 x float>, ptr %y + %r = call <4 x float> @llvm.minnum.v4f32(<4 x float> %v0, <4 x float> %v1) + store <4 x float> %r, ptr %res + ret void +} + +define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 +; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %x + %v1 = load <2 x double>, ptr %y + %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> %v0, <2 x double> %v1) + store <2 x double> %r, ptr %res + ret void +} + +define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 +; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 +; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 +; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %x + %v1 = load <4 x float>, ptr %y + %r = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %v0, <4 x float> %v1) + store <4 x float> %r, ptr %res + ret void +} + +define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 +; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %x + %v1 = load <2 x double>, ptr %y + %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %v0, <2 x double> %v1) + store <2 x double> %r, ptr %res + ret void +} + +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/PowerPC/bittest.ll b/llvm/test/CodeGen/PowerPC/bittest.ll new file mode 100644 index 0000000..cba56e3 --- /dev/null +++ 
b/llvm/test/CodeGen/PowerPC/bittest.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs < %s -O3 -mcpu=ppc -mtriple powerpc-ibm-aix \ +; RUN: -ppc-asm-full-reg-names | FileCheck %s + +define i32 @foo(i32 noundef signext %x) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stwu r1, -64(r1) +; CHECK-NEXT: stw r0, 72(r1) +; CHECK-NEXT: cmpwi r3, 8 +; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill +; CHECK-NEXT: mr r31, r3 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: ble cr0, L..BB0_4 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: cmpwi r31, 11 +; CHECK-NEXT: bge cr0, L..BB0_7 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cmplwi r31, 9 +; CHECK-NEXT: beq cr0, L..BB0_9 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: cmplwi r31, 10 +; CHECK-NEXT: beq cr0, L..BB0_11 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_4: # %entry +; CHECK-NEXT: cmplwi r31, 4 +; CHECK-NEXT: beq cr0, L..BB0_12 +; CHECK-NEXT: # %bb.5: # %entry +; CHECK-NEXT: cmplwi r31, 7 +; CHECK-NEXT: beq cr0, L..BB0_11 +; CHECK-NEXT: # %bb.6: # %entry +; CHECK-NEXT: cmplwi r31, 8 +; CHECK-NEXT: beq cr0, L..BB0_10 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_7: # %entry +; CHECK-NEXT: beq cr0, L..BB0_10 +; CHECK-NEXT: # %bb.8: # %entry +; CHECK-NEXT: cmplwi r31, 12 +; CHECK-NEXT: bne cr0, L..BB0_13 +; CHECK-NEXT: L..BB0_9: # %sw.bb2 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo3[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_10: # %sw.bb1 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo2[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_11: # %sw.bb +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo1[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: b L..BB0_13 +; CHECK-NEXT: L..BB0_12: # %sw.bb3 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: bl .foo4[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: L..BB0_13: # %return +; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload +; CHECK-NEXT: addi r1, r1, 64 +; CHECK-NEXT: lwz r0, 8(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + switch i32 %x, label %return [ + i32 7, label %sw.bb + i32 10, label %sw.bb + i32 8, label %sw.bb1 + i32 11, label %sw.bb1 + i32 9, label %sw.bb2 + i32 12, label %sw.bb2 + i32 4, label %sw.bb3 + ] + +sw.bb: ; preds = %entry, %entry + tail call void @foo1(i32 noundef signext %x) + br label %return + +sw.bb1: ; preds = %entry, %entry + tail call void @foo2(i32 noundef signext %x) + br label %return + +sw.bb2: ; preds = %entry, %entry + tail call void @foo3(i32 noundef signext %x) + br label %return + +sw.bb3: ; preds = %entry + tail call void @foo4(i32 noundef signext 4) + br label %return + +return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry + %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ %x, %sw.bb ] + ret i32 %retval.0 +} + +define i32 @goo(i32 noundef signext %x) { +; CHECK-LABEL: goo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stwu r1, -64(r1) +; CHECK-NEXT: stw r0, 72(r1) +; CHECK-NEXT: cmplwi r3, 12 +; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill +; CHECK-NEXT: mr r31, r3 +; CHECK-NEXT: bgt cr0, L..BB1_7 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: li r3, 1 +; CHECK-NEXT: slw r3, r3, r31 +; CHECK-NEXT: andi. r4, r3, 5632 +; CHECK-NEXT: bne cr0, L..BB1_4 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: andi. 
r3, r3, 2304 +; CHECK-NEXT: beq cr0, L..BB1_5 +; CHECK-NEXT: # %bb.3: # %sw.bb1 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo2[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: b L..BB1_9 +; CHECK-NEXT: L..BB1_4: # %sw.bb2 +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: bl .foo3[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: b L..BB1_9 +; CHECK-NEXT: L..BB1_5: # %entry +; CHECK-NEXT: cmplwi r31, 7 +; CHECK-NEXT: bne cr0, L..BB1_7 +; CHECK-NEXT: # %bb.6: # %sw.bb +; CHECK-NEXT: li r3, 7 +; CHECK-NEXT: li r31, 7 +; CHECK-NEXT: bl .foo1[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: b L..BB1_9 +; CHECK-NEXT: L..BB1_7: # %entry +; CHECK-NEXT: cmplwi r31, 4 +; CHECK-NEXT: li r31, 0 +; CHECK-NEXT: bne cr0, L..BB1_9 +; CHECK-NEXT: # %bb.8: # %sw.bb3 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: li r31, 4 +; CHECK-NEXT: bl .foo4[PR] +; CHECK-NEXT: nop +; CHECK-NEXT: L..BB1_9: # %return +; CHECK-NEXT: mr r3, r31 +; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload +; CHECK-NEXT: addi r1, r1, 64 +; CHECK-NEXT: lwz r0, 8(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + switch i32 %x, label %return [ + i32 7, label %sw.bb + i32 8, label %sw.bb1 + i32 11, label %sw.bb1 + i32 9, label %sw.bb2 + i32 10, label %sw.bb2 + i32 12, label %sw.bb2 + i32 4, label %sw.bb3 + ] + +sw.bb: ; preds = %entry + tail call void @foo1(i32 noundef signext 7) + br label %return + +sw.bb1: ; preds = %entry, %entry + tail call void @foo2(i32 noundef signext %x) + br label %return + +sw.bb2: ; preds = %entry, %entry, %entry + tail call void @foo3(i32 noundef signext %x) + br label %return + +sw.bb3: ; preds = %entry + tail call void @foo4(i32 noundef signext 4) + br label %return + +return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry + %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ 7, %sw.bb ] + ret i32 %retval.0 +} + +declare void @foo1(i32 noundef signext) + +declare void @foo2(i32 noundef signext) + +declare void @foo3(i32 noundef signext) + +declare void @foo4(i32 noundef signext) diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll new file mode 100644 index 0000000..b43555c6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll @@ -0,0 +1,642 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC %s +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC-PERM %s +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IAB %s +; +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC %s +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC-PERM %s +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IAB %s + +define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bge a3, a1, .LBB0_3 +; 
RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB0_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: max a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bge a3, a2, .LBB0_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB0_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: max a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw max ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bge a1, a3, .LBB1_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB1_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: min a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: 
atomicrmw_min_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bge a2, a3, .LBB1_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB1_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: min a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw min ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bgeu a3, a1, .LBB2_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB2_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomaxu.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bgeu a3, a2, .LBB2_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB2_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, 
(a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomaxu.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umax ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bgeu a1, a3, .LBB3_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB3_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: minu a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bgeu a2, a3, .LBB3_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB3_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: minu a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umin ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_max_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; 
RV32IB-COMMON-NEXT: j .LBB4_2 +; RV32IB-COMMON-NEXT: .LBB4_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB4_7 +; RV32IB-COMMON-NEXT: .LBB4_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB4_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: slt a0, s0, a5 +; RV32IB-COMMON-NEXT: j .LBB4_5 +; RV32IB-COMMON-NEXT: .LBB4_4: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s2, a4 +; RV32IB-COMMON-NEXT: .LBB4_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB4_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB4_1 +; RV32IB-COMMON-NEXT: .LBB4_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bge a3, a1, .LBB4_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB4_3: # in Loop: Header=BB4_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB4_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: max a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB4_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomax.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw max ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_min_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 
0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB5_2 +; RV32IB-COMMON-NEXT: .LBB5_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB5_7 +; RV32IB-COMMON-NEXT: .LBB5_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB5_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: slt a0, a5, s0 +; RV32IB-COMMON-NEXT: j .LBB5_5 +; RV32IB-COMMON-NEXT: .LBB5_4: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a4, s2 +; RV32IB-COMMON-NEXT: .LBB5_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB5_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB5_1 +; RV32IB-COMMON-NEXT: .LBB5_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bge a1, a3, .LBB5_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB5_3: # in Loop: Header=BB5_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB5_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: min a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB5_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomin.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw min ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_umax_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; 
RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB6_2 +; RV32IB-COMMON-NEXT: .LBB6_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB6_7 +; RV32IB-COMMON-NEXT: .LBB6_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB6_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s0, a5 +; RV32IB-COMMON-NEXT: j .LBB6_5 +; RV32IB-COMMON-NEXT: .LBB6_4: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s2, a4 +; RV32IB-COMMON-NEXT: .LBB6_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB6_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB6_1 +; RV32IB-COMMON-NEXT: .LBB6_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bgeu a3, a1, .LBB6_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB6_3: # in Loop: Header=BB6_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB6_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB6_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomaxu.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umax ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_umin_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 
20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB7_2 +; RV32IB-COMMON-NEXT: .LBB7_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB7_7 +; RV32IB-COMMON-NEXT: .LBB7_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB7_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a5, s0 +; RV32IB-COMMON-NEXT: j .LBB7_5 +; RV32IB-COMMON-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a4, s2 +; RV32IB-COMMON-NEXT: .LBB7_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB7_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB7_1 +; RV32IB-COMMON-NEXT: .LBB7_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bgeu a1, a3, .LBB7_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB7_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB7_3: # in Loop: Header=BB7_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB7_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: minu a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB7_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amominu.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umin ptr %a, i64 %b seq_cst + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 5e5f2b7..37e11db 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -81,6 +81,7 @@ ; 
CHECK-NEXT: optimized-nf7-segment-load-store - vlseg7eN.v and vsseg7eN.v are implemented as a wide memory op and shuffle. ; CHECK-NEXT: optimized-nf8-segment-load-store - vlseg8eN.v and vsseg8eN.v are implemented as a wide memory op and shuffle. ; CHECK-NEXT: optimized-zero-stride-load - Optimized (perform fewer memory operations)zero-stride vector load. +; CHECK-NEXT: permissive-zalrsc - Implementation permits non-base instructions between LR/SC pairs. ; CHECK-NEXT: predictable-select-expensive - Prefer likely predicted branches over selects. ; CHECK-NEXT: prefer-vsetvli-over-read-vlenb - Prefer vsetvli over read vlenb CSR to calculate VLEN. ; CHECK-NEXT: prefer-w-inst - Prefer instructions with W suffix. diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll new file mode 100644 index 0000000..d121c1a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll @@ -0,0 +1,57 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; Test lowering to spir-v backend for various types and scalar/vector + +; CHECK: OpCapability GroupNonUniformArithmetic + +; CHECK-DAG: %[[#f16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#f32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#v4_half:]] = OpTypeVector %[[#f16]] 4 +; CHECK-DAG: %[[#scope:]] = OpConstant %[[#uint]] 3 + +; CHECK-LABEL: Begin function test_float +; CHECK: %[[#fexpr:]] = OpFunctionParameter %[[#f32]] +define float @test_float(float %fexpr) { +entry: +; CHECK: %[[#fret:]] = OpGroupNonUniformFMin %[[#f32]] %[[#scope]] Reduce %[[#fexpr]] + %0 = call float @llvm.spv.wave.reduce.min.f32(float %fexpr) + ret float %0 +} + +; CHECK-LABEL: Begin function test_int_signed +; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]] +define i32 @test_int_signed(i32 %iexpr) { +entry: +; CHECK: %[[#iret:]] = OpGroupNonUniformSMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]] + %0 = call i32 @llvm.spv.wave.reduce.min.i32(i32 %iexpr) + ret i32 %0 +} + +; CHECK-LABEL: Begin function test_int_unsigned +; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]] +define i32 @test_int_unsigned(i32 %iexpr) { +entry: +; CHECK: %[[#iret:]] = OpGroupNonUniformUMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]] + %0 = call i32 @llvm.spv.wave.reduce.umin.i32(i32 %iexpr) + ret i32 %0 +} + +; CHECK-LABEL: Begin function test_vhalf +; CHECK: %[[#vbexpr:]] = OpFunctionParameter %[[#v4_half]] +define <4 x half> @test_vhalf(<4 x half> %vbexpr) { +entry: +; CHECK: %[[#vhalfret:]] = OpGroupNonUniformFMin %[[#v4_half]] %[[#scope]] Reduce %[[#vbexpr]] + %0 = call <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half> %vbexpr) + ret <4 x half> %0 +} + +declare float @llvm.spv.wave.reduce.min.f32(float) +declare i32 @llvm.spv.wave.reduce.min.i32(i32) +declare <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half>) + +declare float @llvm.spv.wave.reduce.umin.f32(float) +declare i32 @llvm.spv.wave.reduce.umin.i32(i32) +declare <4 x half> @llvm.spv.wave.reduce.umin.v4half(<4 x half>) + diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll index ea4d32b..d087491 100644 --- a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll +++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll @@ -660,8 +660,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 { ; SSE2-LABEL: call_ret_v3bf16: ; SSE2: # %bb.0: ; 
SSE2-NEXT: pushq %rax -; SSE2-NEXT: movl 4(%rdi), %eax -; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: pinsrw $0, 4(%rdi), %xmm1 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: callq returns_v3bf16@PLT @@ -725,8 +724,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 { ; AVXNECONVERT-LABEL: call_ret_v3bf16: ; AVXNECONVERT: # %bb.0: ; AVXNECONVERT-NEXT: pushq %rax -; AVXNECONVERT-NEXT: movl 4(%rdi), %eax -; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVXNECONVERT-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0 ; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVXNECONVERT-NEXT: callq returns_v3bf16@PLT diff --git a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll index 4dae143..d9c21d3 100644 --- a/llvm/test/CodeGen/X86/trunc-srl-load.ll +++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64 ; Tests showing for the analysis of non-constant shift amounts to improve load address math @@ -12,42 +12,20 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub64_16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl 4(%eax), %esi -; X86-NEXT: movb %ch, %cl -; X86-NEXT: andb $16, %cl -; X86-NEXT: movl %esi, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: shrdl %cl, %esi, %edx -; X86-NEXT: testb $32, %ch -; X86-NEXT: jne .LBB0_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %eax -; X86-NEXT: .LBB0_2: -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $48, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: movzwl (%eax,%ecx), %eax ; X86-NEXT: retl ; -; SSE-LABEL: extractSub64_16: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: andb $48, %cl -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shrq %cl, %rax -; SSE-NEXT: # kill: def $ax killed $ax killed $rax -; SSE-NEXT: retq -; -; AVX-LABEL: extractSub64_16: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: andb $48, %sil -; AVX-NEXT: shrxq %rsi, (%rdi), %rax -; AVX-NEXT: # kill: def $ax killed $ax killed $rax -; AVX-NEXT: retq +; X64-LABEL: extractSub64_16: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $48, %esi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movzwl (%rdi,%rsi), %eax +; 
X64-NEXT: retq %idx_bounds = and i32 %idx, 63 %idx_align = and i32 %idx_bounds, -16 %sh = zext nneg i32 %idx_align to i64 @@ -60,67 +38,20 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind { define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub128_16: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp -; X86-NEXT: movzbl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: movl 8(%ecx), %edi -; X86-NEXT: movl 12(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andb $16, %cl -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: movzbl %al, %edx -; X86-NEXT: movl (%esp,%edx), %eax -; X86-NEXT: movl 4(%esp,%edx), %edx -; X86-NEXT: shrdl %cl, %edx, %eax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: leal -8(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $112, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: movzwl (%eax,%ecx), %eax ; X86-NEXT: retl ; -; SSE-LABEL: extractSub128_16: -; SSE: # %bb.0: -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andb $48, %cl -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: shrq %cl, %rdi -; SSE-NEXT: shrdq %cl, %rdx, %rax -; SSE-NEXT: testb $64, %sil -; SSE-NEXT: cmovneq %rdi, %rax -; SSE-NEXT: # kill: def $ax killed $ax killed $rax -; SSE-NEXT: retq -; -; AVX-LABEL: extractSub128_16: -; AVX: # %bb.0: -; AVX-NEXT: movq (%rdi), %rdx -; AVX-NEXT: movq 8(%rdi), %rax -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: andb $48, %cl -; AVX-NEXT: shrdq %cl, %rax, %rdx -; AVX-NEXT: shrxq %rcx, %rax, %rax -; AVX-NEXT: testb $64, %sil -; AVX-NEXT: cmoveq %rdx, %rax -; AVX-NEXT: # kill: def $ax killed $ax killed $rax -; AVX-NEXT: retq +; X64-LABEL: extractSub128_16: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $112, %esi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movzwl (%rdi,%rsi), %eax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 127 %idx_align = and i32 %idx_bounds, -16 %sh = zext nneg i32 %idx_align to i128 @@ -133,62 +64,20 @@ define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind { define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub128_32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp -; X86-NEXT: movzbl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: movl 8(%ecx), %edi -; X86-NEXT: movl 12(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andb $96, %al -; X86-NEXT: shrb $3, 
%al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl (%esp,%eax), %eax -; X86-NEXT: leal -8(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: movl (%eax,%ecx), %eax ; X86-NEXT: retl ; -; SSE-LABEL: extractSub128_32: -; SSE: # %bb.0: -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andb $32, %cl -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: shrq %cl, %rdi -; SSE-NEXT: shrdq %cl, %rdx, %rax -; SSE-NEXT: testb $64, %sil -; SSE-NEXT: cmovneq %rdi, %rax -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: retq -; -; AVX-LABEL: extractSub128_32: -; AVX: # %bb.0: -; AVX-NEXT: movq (%rdi), %rdx -; AVX-NEXT: movq 8(%rdi), %rax -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: andb $32, %cl -; AVX-NEXT: shrdq %cl, %rax, %rdx -; AVX-NEXT: shrxq %rcx, %rax, %rax -; AVX-NEXT: testb $64, %sil -; AVX-NEXT: cmoveq %rdx, %rax -; AVX-NEXT: # kill: def $eax killed $eax killed $rax -; AVX-NEXT: retq +; X64-LABEL: extractSub128_32: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $96, %esi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movl (%rdi,%rsi), %eax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 127 %idx_align = and i32 %idx_bounds, -32 %sh = zext nneg i32 %idx_align to i128 @@ -201,46 +90,20 @@ define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind { define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub128_64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp -; X86-NEXT: movzbl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: movl 8(%ecx), %edi -; X86-NEXT: movl 12(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andb $64, %al -; X86-NEXT: shrb $3, %al -; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: movl (%esp,%ecx), %eax -; X86-NEXT: movl 4(%esp,%ecx), %edx -; X86-NEXT: leal -8(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $64, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %eax +; X86-NEXT: movl 4(%ecx,%edx), %edx ; X86-NEXT: retl ; ; X64-LABEL: extractSub128_64: ; X64: # %bb.0: -; X64-NEXT: testb $64, %sil -; X64-NEXT: je .LBB3_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: movq 8(%rdi), %rax -; X64-NEXT: retq -; X64-NEXT: .LBB3_1: -; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $64, %esi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movq (%rdi,%rsi), %rax ; X64-NEXT: retq %idx_bounds = and i32 %idx, 127 %idx_align = and i32 %idx_bounds, -64 @@ -254,185 +117,20 @@ define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind { define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub512_8: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl 
%esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $192, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: movl 52(%eax), %edx -; X86-NEXT: movl 56(%eax), %ecx -; X86-NEXT: movl 60(%eax), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl $24, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl 48(%esp,%edx), %eax -; X86-NEXT: movl 52(%esp,%edx), %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrdl %cl, %edx, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; 
X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $63, %ecx +; X86-NEXT: movzbl (%eax,%ecx), %eax ; X86-NEXT: retl ; -; SSE-LABEL: extractSub512_8: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: # kill: def $esi killed $esi def $rsi -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movups 16(%rdi), %xmm1 -; SSE-NEXT: movups 32(%rdi), %xmm2 -; SSE-NEXT: movups 48(%rdi), %xmm3 -; SSE-NEXT: xorps %xmm4, %xmm4 -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $56, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: movq -128(%rsp,%rsi), %rdx -; SSE-NEXT: shrq %cl, %rdx -; SSE-NEXT: movl -120(%rsp,%rsi), %eax -; SSE-NEXT: addl %eax, %eax -; SSE-NEXT: notl %ecx -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: orl %edx, %eax -; SSE-NEXT: # kill: def $al killed $al killed $rax -; SSE-NEXT: popq %rcx -; SSE-NEXT: retq -; -; AVX2-LABEL: extractSub512_8: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: # kill: def $esi killed $esi def $rsi -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $56, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: notl %ecx -; AVX2-NEXT: movl -120(%rsp,%rsi), %edx -; AVX2-NEXT: addl %edx, %edx -; AVX2-NEXT: shlxq %rcx, %rdx, %rcx -; AVX2-NEXT: orl %ecx, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $rax -; AVX2-NEXT: popq %rcx -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: extractSub512_8: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: # kill: def $esi killed $esi def $rsi -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $56, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax -; AVX512-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX512-NEXT: notl %ecx -; AVX512-NEXT: movl -120(%rsp,%rsi), %edx -; AVX512-NEXT: addl %edx, %edx -; AVX512-NEXT: shlxq %rcx, %rdx, %rcx -; AVX512-NEXT: orl %ecx, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $rax -; AVX512-NEXT: popq %rcx -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: extractSub512_8: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: andl $63, %esi +; X64-NEXT: 
movzbl (%rdi,%rsi), %eax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 511 %idx_align = and i32 %idx_bounds, -8 %ld = load i512, ptr %word, align 8 @@ -445,152 +143,21 @@ define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind { define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub512_64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $192, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: movl 52(%eax), %edx -; X86-NEXT: movl 56(%eax), %ecx -; X86-NEXT: movl 60(%eax), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: andl $56, %ecx -; X86-NEXT: movl 48(%esp,%ecx), %eax -; X86-NEXT: movl 52(%esp,%ecx), %edx -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $56, %edx +; X86-NEXT: movl (%ecx,%edx), %eax +; X86-NEXT: movl 4(%ecx,%edx), %edx ; X86-NEXT: retl ; -; SSE-LABEL: extractSub512_64: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: # kill: def $esi killed $esi def $rsi -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movups 16(%rdi), %xmm1 -; SSE-NEXT: movups 32(%rdi), %xmm2 -; SSE-NEXT: movups 48(%rdi), %xmm3 -; SSE-NEXT: xorps %xmm4, %xmm4 -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: movq -128(%rsp,%rsi), %rax -; SSE-NEXT: popq %rcx -; SSE-NEXT: retq -; -; AVX2-LABEL: extractSub512_64: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: # kill: def $esi killed $esi def $rsi -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: movq -128(%rsp,%rsi), %rax -; AVX2-NEXT: popq %rcx -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: extractSub512_64: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: # kill: def $esi killed $esi def $rsi -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: movq -128(%rsp,%rsi), %rax -; AVX512-NEXT: popq %rcx -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: extractSub512_64: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: andl $56, %esi +; X64-NEXT: movq (%rdi,%rsi), %rax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 511 %idx_align = and i32 %idx_bounds, -64 %sh = zext nneg i32 %idx_align to i512 @@ -603,143 +170,35 @@ define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind { define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub512_128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $192, %esp -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: movl 52(%eax), %edx -; X86-NEXT: movl 56(%eax), %ecx -; X86-NEXT: movl 60(%eax), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl 16(%ebp), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrl $3, %edi -; X86-NEXT: andl $48, %edi -; X86-NEXT: movl 48(%esp,%edi), %ecx -; X86-NEXT: movl 52(%esp,%edi), %edx -; X86-NEXT: movl 56(%esp,%edi), %esi -; X86-NEXT: movl 60(%esp,%edi), %edi -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $48, %edx +; X86-NEXT: movl 
(%ecx,%edx), %esi +; X86-NEXT: movl 4(%ecx,%edx), %edi +; X86-NEXT: movl 8(%ecx,%edx), %ebx +; X86-NEXT: movl 12(%ecx,%edx), %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; -; SSE-LABEL: extractSub512_128: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: # kill: def $esi killed $esi def $rsi -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movups 16(%rdi), %xmm1 -; SSE-NEXT: movups 32(%rdi), %xmm2 -; SSE-NEXT: movups 48(%rdi), %xmm3 -; SSE-NEXT: xorps %xmm4, %xmm4 -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $48, %esi -; SSE-NEXT: movq -128(%rsp,%rsi), %rax -; SSE-NEXT: movq -120(%rsp,%rsi), %rdx -; SSE-NEXT: popq %rcx -; SSE-NEXT: retq -; -; AVX-LABEL: extractSub512_128: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rax -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: vmovups 32(%rdi), %ymm1 -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: shrl $3, %esi -; AVX-NEXT: andl $48, %esi -; AVX-NEXT: movq -128(%rsp,%rsi), %rax -; AVX-NEXT: movq -120(%rsp,%rsi), %rdx -; AVX-NEXT: popq %rcx -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; X64-LABEL: extractSub512_128: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: shrl $3, %esi +; X64-NEXT: andl $48, %esi +; X64-NEXT: movq (%rdi,%rsi), %rax +; X64-NEXT: movq 8(%rdi,%rsi), %rdx +; X64-NEXT: retq %idx_bounds = and i32 %idx, 511 %idx_align = and i32 %idx_bounds, -128 %sh = zext nneg i32 %idx_align to i512 @@ -752,916 +211,21 @@ define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind { define i64 @extractSub4096_64(ptr %word, i32 %idx) nounwind { ; X86-LABEL: extractSub4096_64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $1536, %esp # imm = 0x600 -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %ecx -; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 84(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 92(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 108(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 112(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 116(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 120(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 124(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 128(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 132(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 140(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 148(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 152(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 156(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 160(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 164(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 168(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 172(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 176(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 180(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 184(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
-; X86-NEXT: movl 188(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 192(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 196(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 200(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 204(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 208(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 212(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 216(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 220(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 224(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 228(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 232(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 236(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 240(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 244(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 248(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 252(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 256(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 260(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 264(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 268(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 272(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 276(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 280(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 284(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 288(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 292(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 296(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 300(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 304(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 308(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 312(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 316(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 320(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 324(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 328(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 
332(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 336(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 340(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 344(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 348(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 352(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 356(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 360(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 364(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 368(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 372(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 376(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 380(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 384(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 388(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 392(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 396(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 400(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 404(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 408(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 412(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 416(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 420(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 424(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 428(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 432(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 436(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 440(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 444(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 448(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 452(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 456(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 460(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 464(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 468(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 472(%eax), %ecx -; X86-NEXT: 
movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 476(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 480(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 484(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 488(%eax), %ebx -; X86-NEXT: movl 492(%eax), %edi -; X86-NEXT: movl 496(%eax), %esi -; X86-NEXT: movl 500(%eax), %edx -; X86-NEXT: movl 504(%eax), %ecx -; X86-NEXT: movl 508(%eax), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $4032, %ecx # imm = 0xFC0 -; X86-NEXT: andl 12(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: 
movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 
$0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: movl 496(%esp,%ecx), %eax -; X86-NEXT: movl 500(%esp,%ecx), %edx -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $4032, %edx # imm = 0xFC0 +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %eax +; X86-NEXT: movl 4(%ecx,%edx), %edx ; X86-NEXT: retl ; -; SSE-LABEL: extractSub4096_64: -; SSE: # %bb.0: -; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: # kill: def $esi killed $esi def $rsi -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 64(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 80(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 96(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 112(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 128(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movups 144(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 160(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 176(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 192(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 208(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 224(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 240(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 256(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movups 272(%rdi), %xmm15 -; SSE-NEXT: movups 288(%rdi), %xmm14 -; SSE-NEXT: movups 304(%rdi), %xmm13 -; SSE-NEXT: movups 
320(%rdi), %xmm12 -; SSE-NEXT: movups 336(%rdi), %xmm11 -; SSE-NEXT: movups 352(%rdi), %xmm10 -; SSE-NEXT: movups 368(%rdi), %xmm9 -; SSE-NEXT: movups 384(%rdi), %xmm8 -; SSE-NEXT: movups 400(%rdi), %xmm7 -; SSE-NEXT: movups 416(%rdi), %xmm6 -; SSE-NEXT: movups 432(%rdi), %xmm5 -; SSE-NEXT: movups 448(%rdi), %xmm4 -; SSE-NEXT: movups 464(%rdi), %xmm3 -; SSE-NEXT: movups 480(%rdi), %xmm2 -; SSE-NEXT: movups 496(%rdi), %xmm1 -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: andl $4032, %esi # imm = 0xFC0 -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: movq 144(%rsp,%rsi), %rax -; SSE-NEXT: addq $1176, %rsp # imm = 0x498 -; SSE-NEXT: retq -; -; AVX2-LABEL: extractSub4096_64: -; AVX2: # %bb.0: -; AVX2-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-NEXT: vmovups 96(%rdi), %ymm3 -; AVX2-NEXT: vmovups 128(%rdi), %ymm4 -; AVX2-NEXT: vmovups 160(%rdi), %ymm5 -; AVX2-NEXT: vmovups 192(%rdi), %ymm6 -; AVX2-NEXT: vmovups 224(%rdi), %ymm7 -; AVX2-NEXT: vmovups 256(%rdi), %ymm8 -; AVX2-NEXT: vmovups 288(%rdi), %ymm9 -; AVX2-NEXT: vmovups 320(%rdi), %ymm10 -; AVX2-NEXT: vmovups 352(%rdi), %ymm11 -; AVX2-NEXT: vmovups 384(%rdi), %ymm12 -; AVX2-NEXT: vmovups 416(%rdi), %ymm13 -; AVX2-NEXT: vmovups 448(%rdi), %ymm14 -; AVX2-NEXT: vmovups 480(%rdi), %ymm15 -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm3, (%rsp) -; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: # kill: def $esi killed $esi def $rsi -; AVX2-NEXT: andl $4032, %esi # imm = 0xFC0 -; AVX2-NEXT: shrl $3, 
%esi -; AVX2-NEXT: movq -96(%rsp,%rsi), %rax -; AVX2-NEXT: addq $936, %rsp # imm = 0x3A8 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: extractSub4096_64: -; AVX512: # %bb.0: -; AVX512-NEXT: subq $904, %rsp # imm = 0x388 -; AVX512-NEXT: # kill: def $esi killed $esi def $rsi -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512-NEXT: vmovups 64(%rdi), %ymm2 -; AVX512-NEXT: vmovups 96(%rdi), %ymm3 -; AVX512-NEXT: vmovups 128(%rdi), %ymm4 -; AVX512-NEXT: vmovups 160(%rdi), %ymm5 -; AVX512-NEXT: vmovups 192(%rdi), %ymm6 -; AVX512-NEXT: vmovups 224(%rdi), %ymm7 -; AVX512-NEXT: vmovups 256(%rdi), %ymm8 -; AVX512-NEXT: vmovups 288(%rdi), %ymm9 -; AVX512-NEXT: vmovups 320(%rdi), %ymm10 -; AVX512-NEXT: vmovups 352(%rdi), %ymm11 -; AVX512-NEXT: vmovups 384(%rdi), %ymm12 -; AVX512-NEXT: vmovups 416(%rdi), %ymm13 -; AVX512-NEXT: andl $4032, %esi # imm = 0xFC0 -; AVX512-NEXT: vmovups 448(%rdi), %ymm14 -; AVX512-NEXT: vmovups 480(%rdi), %ymm15 -; AVX512-NEXT: vxorps %xmm16, %xmm16, %xmm16 -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm4, (%rsp) -; AVX512-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: movq -128(%rsp,%rsi), %rax -; AVX512-NEXT: addq $904, %rsp # imm = 0x388 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: extractSub4096_64: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: andl $4032, %esi # imm = 0xFC0 +; X64-NEXT: shrl $3, %esi +; X64-NEXT: movq (%rdi,%rsi), %rax +; X64-NEXT: retq %idx_bounds = and i32 %idx, 4095 %idx_align = and i32 %idx_bounds, -64 %sh = zext nneg i32 %idx_align to i4096 diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 81c4d5d..c3054a3 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -962,39 +962,22 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 } 
define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1 -; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movl %ecx, %eax -; X64-NO-BMI2-NEXT: shrb $6, %al -; X64-NO-BMI2-NEXT: movzbl %al, %eax -; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax -; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NEXT: movb %al, (%rdx) -; X64-NO-BMI2-NEXT: retq -; -; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: xorps %xmm1, %xmm1 -; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movl %esi, %eax -; X64-BMI2-NEXT: shrb $6, %al -; X64-BMI2-NEXT: movzbl %al, %eax -; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax -; X64-BMI2-NEXT: movb %al, (%rdx) -; X64-BMI2-NEXT: retq +; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64: # %bb.0: +; X64-NEXT: movups (%rdi), %xmm0 +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: leal (,%rsi,8), %eax +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: shrb $6, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leaq -72(%rsp,%rax,8), %rax +; X64-NEXT: andl $7, %esi +; X64-NEXT: movzbl (%rsi,%rax), %eax +; X64-NEXT: movb %al, (%rdx) +; X64-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: @@ -3417,7 +3400,6 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; ALL: {{.*}} -; X64: {{.*}} ; X64-NO-SHLD: {{.*}} ; X86: {{.*}} ; X86-HAVE-BMI2-HAVE-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 8d36eef..84c2cc6 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -1220,41 +1220,23 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; no @load_16byte_chunk_of_16byte_alloca define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca: -; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movl %ecx, %eax -; X64-NO-BMI2-NEXT: shrb $6, %al -; X64-NO-BMI2-NEXT: movzbl %al, %eax -; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax -; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NEXT: movb %al, (%rdx) -; X64-NO-BMI2-NEXT: retq -; -; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca: -; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: movups (%rdi), %xmm0 -; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 -; X64-BMI2-NEXT: shll $3, %esi -; X64-BMI2-NEXT: xorps %xmm2, %xmm2 -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT: movl %esi, %eax -; X64-BMI2-NEXT: shrb $6, %al -; X64-BMI2-NEXT: movzbl %al, %eax -; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax -; X64-BMI2-NEXT: movb %al, (%rdx) -; X64-BMI2-NEXT: retq +; X64-LABEL: load_1byte_chunk_of_32byte_alloca: +; X64: # %bb.0: +; X64-NEXT: movups (%rdi), %xmm0 +; X64-NEXT: movups 16(%rdi), %xmm1 +; X64-NEXT: leal (,%rsi,8), %eax +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: shrb $6, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leaq -72(%rsp,%rax,8), %rax +; X64-NEXT: andl $7, %esi +; X64-NEXT: movzbl (%rsi,%rax), %eax +; X64-NEXT: movb %al, (%rdx) +; X64-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca: ; X86-NO-BMI2-NO-SHLD: # %bb.0: @@ -2156,7 +2138,6 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; no @load_32byte_chunk_of_32byte_alloca ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; ALL: {{.*}} -; X64: {{.*}} ; X64-NO-SHLD: {{.*}} ; X86: {{.*}} ; X86-NO-SHLD: {{.*}} diff --git a/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s b/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s index 8bd9148..4542027 100644 --- a/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s +++ b/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s @@ -2,7 +2,7 @@ // CHECK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100" buffer_load_dwordx4 v[0:3], v0, s[0:3], 0, offen offset:4092 slc -// CHECK: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092 slc ; <MCInst #13135 BUFFER_LOAD_DWORDX4_OFFEN_gfx11 +// CHECK: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092 slc ; <MCInst #{{[0-9]+}} BUFFER_LOAD_DWORDX4_OFFEN_gfx11 // CHECK-NEXT: ; <MCOperand Reg:10104> // CHECK-NEXT: ; <MCOperand Reg:486> // CHECK-NEXT: ; <MCOperand Reg:7754> @@ -11,7 +11,7 @@ buffer_load_dwordx4 v[0:3], v0, s[0:3], 0, offen offset:4092 slc // CHECK-NEXT: ; <MCOperand Imm:2> // CHECK-NEXT: ; <MCOperand Imm:0>> buffer_store_dword v0, v1, s[0:3], 0 offen slc -// CHECK: buffer_store_b32 v0, v1, s[0:3], 0 offen slc ; <MCInst #14553 BUFFER_STORE_DWORD_OFFEN_gfx11 +// CHECK: buffer_store_b32 v0, v1, s[0:3], 0 offen slc ; <MCInst #{{[0-9]+}} BUFFER_STORE_DWORD_OFFEN_gfx11 // CHECK-NEXT: ; <MCOperand Reg:486> // CHECK-NEXT: ; <MCOperand Reg:487> // CHECK-NEXT: ; <MCOperand Reg:7754> @@ -22,7 +22,7 @@ buffer_store_dword v0, v1, s[0:3], 0 offen slc ; tbuffer ops use autogenerate asm parsers tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc -// CHECK: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc ; <MCInst #34095 TBUFFER_LOAD_FORMAT_XYZW_OFFEN_gfx11 +// CHECK: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc ; <MCInst #{{[0-9]+}} TBUFFER_LOAD_FORMAT_XYZW_OFFEN_gfx11 // CHECK-NEXT: ; <MCOperand Reg:10104> // CHECK-NEXT: ; <MCOperand Reg:486> // CHECK-NEXT: ; <MCOperand Reg:7754> @@ -32,7 +32,7 @@ tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen // CHECK-NEXT: ; <MCOperand Imm:2> // CHECK-NEXT: ; <MCOperand Imm:0>> tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc -// CHECK: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc ; <MCInst #34264 TBUFFER_STORE_FORMAT_D16_X_OFFEN_gfx11 +// CHECK: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc ; <MCInst #{{[0-9]+}} TBUFFER_STORE_FORMAT_D16_X_OFFEN_gfx11 // CHECK-NEXT: ; <MCOperand Reg:486> // CHECK-NEXT: ; <MCOperand Reg:487> // CHECK-NEXT: ; <MCOperand Reg:7754> diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index 054489c..f5cb4b7 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -286,6 +286,9 @@ #CHECK: xvmulhuh 4, 5, 7 0xf0,0x85,0x3b,0xd0 +#CHECK: mtlpl 3, 4 +0x7c,0x80,0x1a,0x26 + #CHECK: xxmulmul 8, 3, 4, 2 0xed,0x03,0x22,0x08 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index 17d1413..f0df8ce 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -280,6 +280,9 @@ #CHECK: xvmulhuh 4, 5, 7 0xd0,0x3b,0x85,0xf0 
+#CHECK: mtlpl 3, 4 +0x26,0x1a,0x80,0x7c + #CHECK: xxmulmul 8, 3, 4, 2 0x08,0x22,0x03,0xed diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index e5bc1f4..bc0683e 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -403,6 +403,10 @@ #CHECK-BE: xvmulhuh 4, 5, 7 # encoding: [0xf0,0x85,0x3b,0xd0] #CHECK-LE: xvmulhuh 4, 5, 7 # encoding: [0xd0,0x3b,0x85,0xf0] + mtlpl 3, 4 +#CHECK-BE: mtlpl 3, 4 # encoding: [0x7c,0x80,0x1a,0x26] +#CHECK-LE: mtlpl 3, 4 # encoding: [0x26,0x1a,0x80,0x7c] + xxmulmul 8, 3, 4, 2 #CHECK-BE: xxmulmul 8, 3, 4, 2 # encoding: [0xed,0x03,0x22,0x08] #CHECK-LE: xxmulmul 8, 3, 4, 2 # encoding: [0x08,0x22,0x03,0xed] diff --git a/llvm/test/MC/RISCV/xqcili-linker-relaxation.s b/llvm/test/MC/RISCV/xqcili-linker-relaxation.s new file mode 100644 index 0000000..ace6779 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcili-linker-relaxation.s @@ -0,0 +1,37 @@ +# RUN: llvm-mc --triple=riscv32 -mattr=+relax,+experimental-xqcili \ +# RUN: %s -filetype=obj -o - -riscv-add-build-attributes \ +# RUN: | llvm-objdump -dr -M no-aliases - \ +# RUN: | FileCheck %s + +## This tests that we correctly emit relocations for linker relaxation when +## emitting `QC.E.LI` and `QC.LI`. + + .section .text.ex1, "ax", @progbits +# CHECK-LABEL: <.text.ex1>: + blez a1, .L1 +# CHECK-NEXT: bge zero, a1, 0x0 <.text.ex1> +# CHECK-NEXT: R_RISCV_BRANCH .L1{{$}} + qc.e.li a0, sym +# CHECK-NEXT: qc.e.li a0, 0x0 +# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM{{$}} +# CHECK-NEXT: R_RISCV_CUSTOM194 sym{{$}} +# CHECK-NEXT: R_RISCV_RELAX *ABS*{{$}} +.L1: +# CHECK: <.L1>: + ret +# CHECK-NEXT: c.jr ra + + .section .text.ex2, "ax", @progbits +# CHECK-LABEL: <.text.ex2>: + blez a1, .L2 +# CHECK-NEXT: bge zero, a1, 0x0 <.text.ex2> +# CHECK-NEXT: R_RISCV_BRANCH .L2{{$}} + qc.li a0, %qc.abs20(sym) +# CHECK-NEXT: qc.li a0, 0x0 +# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM{{$}} +# CHECK-NEXT: R_RISCV_CUSTOM192 sym{{$}} +# CHECK-NEXT: R_RISCV_RELAX *ABS*{{$}} +.L2: +# CHECK: <.L2>: + ret +# CHECK-NEXT: c.jr ra diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 7b0b871..cc87d65 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -10,8 +10,8 @@ declare void @llvm.assume(i1) #1 ; Check that the assume has not been removed: -define i32 @foo1(ptr %a) #0 { -; DEFAULT-LABEL: @foo1( +define i32 @align_to_bundle(ptr %a) #0 { +; DEFAULT-LABEL: @align_to_bundle( ; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 ; DEFAULT-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64 ; DEFAULT-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 @@ -19,7 +19,7 @@ define i32 @foo1(ptr %a) #0 { ; DEFAULT-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; DEFAULT-NEXT: ret i32 [[T0]] ; -; BUNDLES-LABEL: @foo1( +; BUNDLES-LABEL: @align_to_bundle( ; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 ; BUNDLES-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 32) ] ; BUNDLES-NEXT: ret i32 [[T0]] @@ -32,6 +32,28 @@ define i32 @foo1(ptr %a) #0 { ret i32 %t0 } +define i32 @align_to_bundle_ptrtoaddr(ptr %a) #0 { +; DEFAULT-LABEL: @align_to_bundle_ptrtoaddr( +; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 +; DEFAULT-NEXT: [[PTRINT:%.*]] = ptrtoaddr ptr [[A]] to i64 +; DEFAULT-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; DEFAULT-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; DEFAULT-NEXT: tail call 
void @llvm.assume(i1 [[MASKCOND]]) +; DEFAULT-NEXT: ret i32 [[T0]] +; +; BUNDLES-LABEL: @align_to_bundle_ptrtoaddr( +; BUNDLES-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 +; BUNDLES-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 32) ] +; BUNDLES-NEXT: ret i32 [[T0]] +; + %t0 = load i32, ptr %a, align 4 + %ptrint = ptrtoaddr ptr %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) + ret i32 %t0 +} + define i32 @align_assume_trunc_cond(ptr %a) #0 { ; DEFAULT-LABEL: @align_assume_trunc_cond( ; DEFAULT-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4 diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll index a7434a2..adf3aa1 100644 --- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll +++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll @@ -237,3 +237,75 @@ define ptr addrspace(1) @gep_sub_ptrtoaddr_different_obj_addrsize(ptr addrspace( call void @use.i32(i32 %addr) ret ptr addrspace(1) %gep } + +define i64 @ptrtoaddr_of_ptrmask(ptr %p, i64 %mask) { +; CHECK-LABEL: define i64 @ptrtoaddr_of_ptrmask( +; CHECK-SAME: ptr [[P:%.*]], i64 [[MASK:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoaddr ptr [[P]] to i64 +; CHECK-NEXT: [[ADDR:%.*]] = and i64 [[MASK]], [[TMP1]] +; CHECK-NEXT: ret i64 [[ADDR]] +; + %masked = call ptr @llvm.ptrmask(ptr %p, i64 %mask) + %addr = ptrtoaddr ptr %masked to i64 + ret i64 %addr +} + +define i32 @ptrtoaddr_of_ptrmask_addrsize(ptr addrspace(1) %p, i32 %mask) { +; CHECK-LABEL: define i32 @ptrtoaddr_of_ptrmask_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[MASK:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoaddr ptr addrspace(1) [[P]] to i32 +; CHECK-NEXT: [[ADDR:%.*]] = and i32 [[MASK]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADDR]] +; + %masked = call ptr addrspace(1) @llvm.ptrmask(ptr addrspace(1) %p, i32 %mask) + %addr = ptrtoaddr ptr addrspace(1) %masked to i32 + ret i32 %addr +} + +define i64 @ptrtoaddr_of_gep_of_inttoptr(i64 %int, i64 %offset) { +; CHECK-LABEL: define i64 @ptrtoaddr_of_gep_of_inttoptr( +; CHECK-SAME: i64 [[INT:%.*]], i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[ADDR:%.*]] = add i64 [[INT]], [[OFFSET]] +; CHECK-NEXT: ret i64 [[ADDR]] +; + %ptr = inttoptr i64 %int to ptr + %gep = getelementptr i8, ptr %ptr, i64 %offset + %addr = ptrtoaddr ptr %gep to i64 + ret i64 %addr +} + +; FIXME: This could be supported by truncating %int before performing the +; arithmetic. 
+define i32 @ptrtoaddr_of_gep_of_inttoptr_addrsize(i64 %int, i32 %offset) { +; CHECK-LABEL: define i32 @ptrtoaddr_of_gep_of_inttoptr_addrsize( +; CHECK-SAME: i64 [[INT:%.*]], i32 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[PTR:%.*]] = inttoptr i64 [[INT]] to ptr addrspace(1) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i32 [[OFFSET]] +; CHECK-NEXT: [[ADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[GEP]] to i32 +; CHECK-NEXT: ret i32 [[ADDR]] +; + %ptr = inttoptr i64 %int to ptr addrspace(1) + %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset + %addr = ptrtoaddr ptr addrspace(1) %gep to i32 + ret i32 %addr +} + +define i64 @ptrtoaddr_of_gep_of_null(i64 %offset) { +; CHECK-LABEL: define i64 @ptrtoaddr_of_gep_of_null( +; CHECK-SAME: i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: ret i64 [[OFFSET]] +; + %gep = getelementptr i8, ptr null, i64 %offset + %addr = ptrtoaddr ptr %gep to i64 + ret i64 %addr +} + +define i32 @ptrtoaddr_of_gep_of_null_addrsize(i32 %offset) { +; CHECK-LABEL: define i32 @ptrtoaddr_of_gep_of_null_addrsize( +; CHECK-SAME: i32 [[OFFSET:%.*]]) { +; CHECK-NEXT: ret i32 [[OFFSET]] +; + %gep = getelementptr i8, ptr addrspace(1) null, i32 %offset + %addr = ptrtoaddr ptr addrspace(1) %gep to i32 + ret i32 %addr +} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll index 9f9e3f9..77a7f0d 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll @@ -1,26 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s -; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -S | FileCheck %s - -declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a) -declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a) -declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a) - +; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s define i32 @add_0() { ; CHECK-LABEL: @add_0( @@ -30,6 +10,15 @@ define i32 @add_0() { ret i32 %x } +define i32 @add_0_scalable_vector() { +; CHECK-LABEL: @add_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> zeroinitializer) + ret i32 %x +} + define i32 @add_1() { ; CHECK-LABEL: @add_1( ; CHECK-NEXT: ret i32 8 @@ -38,6 +27,15 @@ define i32 @add_1() { ret i32 %x } +define i32 @add_1_scalable_vector() { +; CHECK-LABEL: 
@add_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> splat (i32 1)) + ret i32 %x +} + define i32 @add_inc() { ; CHECK-LABEL: @add_inc( ; CHECK-NEXT: ret i32 18 @@ -63,8 +61,17 @@ define i32 @add_undef() { ret i32 %x } -define i32 @add_undef1() { -; CHECK-LABEL: @add_undef1( +define i32 @add_undef_scalable_vector() { +; CHECK-LABEL: @add_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef) + ret i32 %x +} + +define i32 @add_undef_elt() { +; CHECK-LABEL: @add_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) ; CHECK-NEXT: ret i32 [[X]] ; @@ -80,8 +87,17 @@ define i32 @add_poison() { ret i32 %x } -define i32 @add_poison1() { -; CHECK-LABEL: @add_poison1( +define i32 @add_poison_scalable_vector() { +; CHECK-LABEL: @add_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> poison) + ret i32 %x +} + +define i32 @add_poison_elt() { +; CHECK-LABEL: @add_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 poison, i32 1, i32 1, i32 42, i32 1, i32 1>) @@ -105,6 +121,15 @@ define i32 @mul_0() { ret i32 %x } +define i32 @mul_0_scalable_vector() { +; CHECK-LABEL: @mul_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> zeroinitializer) + ret i32 %x +} + define i32 @mul_1() { ; CHECK-LABEL: @mul_1( ; CHECK-NEXT: ret i32 1 @@ -113,6 +138,15 @@ define i32 @mul_1() { ret i32 %x } +define i32 @mul_1_scalable_vector() { +; CHECK-LABEL: @mul_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 1)) + ret i32 %x +} + define i32 @mul_inc() { ; CHECK-LABEL: @mul_inc( ; CHECK-NEXT: ret i32 40320 @@ -138,8 +172,17 @@ define i32 @mul_undef() { ret i32 %x } -define i32 @mul_undef1() { -; CHECK-LABEL: @mul_undef1( +define i32 @mul_undef_scalable_vector() { +; CHECK-LABEL: @mul_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> undef) + ret i32 %x +} + +define i32 @mul_undef_elt() { +; CHECK-LABEL: @mul_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) ; CHECK-NEXT: ret i32 [[X]] ; @@ -155,8 +198,17 @@ define i32 @mul_poison() { ret i32 %x } -define i32 @mul_poison1() { -; CHECK-LABEL: @mul_poison1( +define i32 @mul_poison_scalable_vector() { +; CHECK-LABEL: @mul_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale 
x 8 x i32> poison) + ret i32 %x +} + +define i32 @mul_poison_elt() { +; CHECK-LABEL: @mul_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 0, i32 1, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1>) @@ -171,6 +223,15 @@ define i32 @and_0() { ret i32 %x } +define i32 @and_0_scalable_vector() { +; CHECK-LABEL: @and_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> zeroinitializer) + ret i32 %x +} + define i32 @and_1() { ; CHECK-LABEL: @and_1( ; CHECK-NEXT: ret i32 1 @@ -179,6 +240,15 @@ define i32 @and_1() { ret i32 %x } +define i32 @and_1_scalable_vector() { +; CHECK-LABEL: @and_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> splat (i32 1)) + ret i32 %x +} + define i32 @and_inc() { ; CHECK-LABEL: @and_inc( ; CHECK-NEXT: ret i32 0 @@ -204,8 +274,17 @@ define i32 @and_undef() { ret i32 %x } -define i32 @and_undef1() { -; CHECK-LABEL: @and_undef1( +define i32 @and_undef_scalable_vector() { +; CHECK-LABEL: @and_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> undef) + ret i32 %x +} + +define i32 @and_undef_elt() { +; CHECK-LABEL: @and_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) ; CHECK-NEXT: ret i32 [[X]] ; @@ -221,8 +300,17 @@ define i32 @and_poison() { ret i32 %x } -define i32 @and_poison1() { -; CHECK-LABEL: @and_poison1( +define i32 @and_poison_scalable_vector() { +; CHECK-LABEL: @and_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> poison) + ret i32 %x +} + +define i32 @and_poison_elt() { +; CHECK-LABEL: @and_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> <i32 -1, i32 1, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1>) @@ -237,6 +325,15 @@ define i32 @or_0() { ret i32 %x } +define i32 @or_0_scalable_vector() { +; CHECK-LABEL: @or_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> zeroinitializer) + ret i32 %x +} + define i32 @or_1() { ; CHECK-LABEL: @or_1( ; CHECK-NEXT: ret i32 1 @@ -245,6 +342,15 @@ define i32 @or_1() { ret i32 %x } +define i32 @or_1_scalable_vector() { +; CHECK-LABEL: @or_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> splat (i32 1)) + ret i32 %x +} + define i32 @or_inc() { ; CHECK-LABEL: @or_inc( ; CHECK-NEXT: ret i32 -1 @@ -270,8 +376,17 @@ define i32 @or_undef() { ret i32 %x } -define i32 @or_undef1() { -; CHECK-LABEL: @or_undef1( +define i32 @or_undef_scalable_vector() { +; CHECK-LABEL: @or_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 
@llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.or.v8i32(<vscale x 8 x i32> undef) + ret i32 %x +} + +define i32 @or_undef_elt() { +; CHECK-LABEL: @or_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) ; CHECK-NEXT: ret i32 [[X]] ; @@ -287,8 +402,17 @@ define i32 @or_poison() { ret i32 %x } -define i32 @or_poison1() { -; CHECK-LABEL: @or_poison1( +define i32 @or_poison_scalable_vector() { +; CHECK-LABEL: @or_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> poison) + ret i32 %x +} + +define i32 @or_poison_elt() { +; CHECK-LABEL: @or_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 0, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1>) @@ -303,6 +427,15 @@ define i32 @xor_0() { ret i32 %x } +define i32 @xor_0_scalable_vector() { +; CHECK-LABEL: @xor_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> zeroinitializer) + ret i32 %x +} + define i32 @xor_1() { ; CHECK-LABEL: @xor_1( ; CHECK-NEXT: ret i32 0 @@ -311,6 +444,15 @@ define i32 @xor_1() { ret i32 %x } +define i32 @xor_1_scalable_vector() { +; CHECK-LABEL: @xor_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> splat(i32 1)) + ret i32 %x +} + define i32 @xor_inc() { ; CHECK-LABEL: @xor_inc( ; CHECK-NEXT: ret i32 10 @@ -336,8 +478,17 @@ define i32 @xor_undef() { ret i32 %x } -define i32 @xor_undef1() { -; CHECK-LABEL: @xor_undef1( +define i32 @xor_undef_scalable_vector() { +; CHECK-LABEL: @xor_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> undef) + ret i32 %x +} + +define i32 @xor_undef_elt() { +; CHECK-LABEL: @xor_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) ; CHECK-NEXT: ret i32 [[X]] ; @@ -353,8 +504,17 @@ define i32 @xor_poison() { ret i32 %x } -define i32 @xor_poison1() { -; CHECK-LABEL: @xor_poison1( +define i32 @xor_poison_scalable_vector() { +; CHECK-LABEL: @xor_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> poison) + ret i32 %x +} + +define i32 @xor_poison_elt() { +; CHECK-LABEL: @xor_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> <i32 poison, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) @@ -369,6 +529,15 @@ define i32 @smin_0() { ret i32 %x } +define i32 @smin_0_scalable_vector() { +; CHECK-LABEL: @smin_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 
x i32> zeroinitializer) + ret i32 %x +} + define i32 @smin_1() { ; CHECK-LABEL: @smin_1( ; CHECK-NEXT: ret i32 1 @@ -377,6 +546,15 @@ define i32 @smin_1() { ret i32 %x } +define i32 @smin_1_scalable_vector() { +; CHECK-LABEL: @smin_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> splat(i32 1)) + ret i32 %x +} + define i32 @smin_inc() { ; CHECK-LABEL: @smin_inc( ; CHECK-NEXT: ret i32 -6 @@ -402,8 +580,17 @@ define i32 @smin_undef() { ret i32 %x } -define i32 @smin_undef1() { -; CHECK-LABEL: @smin_undef1( +define i32 @smin_undef_scalable_vector() { +; CHECK-LABEL: @smin_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> undef) + ret i32 %x +} + +define i32 @smin_undef_elt() { +; CHECK-LABEL: @smin_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) ; CHECK-NEXT: ret i32 [[X]] ; @@ -419,8 +606,17 @@ define i32 @smin_poison() { ret i32 %x } -define i32 @smin_poison1() { -; CHECK-LABEL: @smin_poison1( +define i32 @smin_poison_scalable_vector() { +; CHECK-LABEL: @smin_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> poison) + ret i32 %x +} + +define i32 @smin_poison_elt() { +; CHECK-LABEL: @smin_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 poison, i32 1, i32 1, i32 1>) @@ -435,6 +631,15 @@ define i32 @smax_0() { ret i32 %x } +define i32 @smax_0_scalable_vector() { +; CHECK-LABEL: @smax_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> zeroinitializer) + ret i32 %x +} + define i32 @smax_1() { ; CHECK-LABEL: @smax_1( ; CHECK-NEXT: ret i32 1 @@ -443,6 +648,15 @@ define i32 @smax_1() { ret i32 %x } +define i32 @smax_1_scalable_vector() { +; CHECK-LABEL: @smax_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> splat(i32 1)) + ret i32 %x +} + define i32 @smax_inc() { ; CHECK-LABEL: @smax_inc( ; CHECK-NEXT: ret i32 8 @@ -468,8 +682,17 @@ define i32 @smax_undef() { ret i32 %x } -define i32 @smax_undef1() { -; CHECK-LABEL: @smax_undef1( +define i32 @smax_undef_scalable_vector() { +; CHECK-LABEL: @smax_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef) + ret i32 %x +} + +define i32 @smax_undef_elt() { +; CHECK-LABEL: @smax_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) ; CHECK-NEXT: ret i32 [[X]] ; @@ -485,8 +708,17 @@ define i32 @smax_poison() { ret i32 %x } -define i32 @smax_poison1() { -; CHECK-LABEL: 
@smax_poison1( +define i32 @smax_poison_scalable_vector() { +; CHECK-LABEL: @smax_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> poison) + ret i32 %x +} + +define i32 @smax_poison_elt() { +; CHECK-LABEL: @smax_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1, i32 poison>) @@ -501,6 +733,15 @@ define i32 @umin_0() { ret i32 %x } +define i32 @umin_0_scalable_vector() { +; CHECK-LABEL: @umin_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> zeroinitializer) + ret i32 %x +} + define i32 @umin_1() { ; CHECK-LABEL: @umin_1( ; CHECK-NEXT: ret i32 1 @@ -509,6 +750,15 @@ define i32 @umin_1() { ret i32 %x } +define i32 @umin_1_scalable_vector() { +; CHECK-LABEL: @umin_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> splat (i32 1)) + ret i32 %x +} + define i32 @umin_inc() { ; CHECK-LABEL: @umin_inc( ; CHECK-NEXT: ret i32 1 @@ -534,8 +784,17 @@ define i32 @umin_undef() { ret i32 %x } -define i32 @umin_undef1() { -; CHECK-LABEL: @umin_undef1( +define i32 @umin_undef_scalable_vector() { +; CHECK-LABEL: @umin_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> undef) + ret i32 %x +} + +define i32 @umin_undef_elt() { +; CHECK-LABEL: @umin_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) ; CHECK-NEXT: ret i32 [[X]] ; @@ -551,8 +810,17 @@ define i32 @umin_poison() { ret i32 %x } -define i32 @umin_poison1() { -; CHECK-LABEL: @umin_poison1( +define i32 @umin_poison_scalable_vector() { +; CHECK-LABEL: @umin_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> poison) + ret i32 %x +} + +define i32 @umin_poison_elt() { +; CHECK-LABEL: @umin_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 -1, i32 poison, i32 1, i32 1, i32 1, i32 1>) @@ -567,6 +835,15 @@ define i32 @umax_0() { ret i32 %x } +define i32 @umax_0_scalable_vector() { +; CHECK-LABEL: @umax_0_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> zeroinitializer) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> zeroinitializer) + ret i32 %x +} + define i32 @umax_1() { ; CHECK-LABEL: @umax_1( ; CHECK-NEXT: ret i32 1 @@ -575,6 +852,15 @@ define i32 @umax_1() { ret i32 %x } +define i32 @umax_1_scalable_vector() { +; CHECK-LABEL: @umax_1_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> 
splat(i32 1)) + ret i32 %x +} + define i32 @umax_inc() { ; CHECK-LABEL: @umax_inc( ; CHECK-NEXT: ret i32 -3 @@ -600,8 +886,17 @@ define i32 @umax_undef() { ret i32 %x } -define i32 @umax_undef1() { -; CHECK-LABEL: @umax_undef1( +define i32 @umax_undef_scalable_vector() { +; CHECK-LABEL: @umax_undef_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> undef) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> undef) + ret i32 %x +} + +define i32 @umax_undef_elt() { +; CHECK-LABEL: @umax_undef_elt( ; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>) ; CHECK-NEXT: ret i32 [[X]] ; @@ -617,8 +912,17 @@ define i32 @umax_poison() { ret i32 %x } -define i32 @umax_poison1() { -; CHECK-LABEL: @umax_poison1( +define i32 @umax_poison_scalable_vector() { +; CHECK-LABEL: @umax_poison_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> poison) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> poison) + ret i32 %x +} + +define i32 @umax_poison_elt() { +; CHECK-LABEL: @umax_poison_elt( ; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 poison, i32 1, i32 1, i32 poison, i32 1, i32 1>) diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index 964a257..fafa82c 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -2800,6 +2800,88 @@ exit: ret i64 %r.0.lcssa } +define i32 @reduction_expression_ext_mulacc_livein(ptr %a, i16 %c) { +; CHECK-LABEL: define i32 @reduction_expression_ext_mulacc_livein( +; CHECK-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = add i32 [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK: [[FOR_EXIT]]: +; CHECK-NEXT: ret i32 [[TMP5]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @reduction_expression_ext_mulacc_livein( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) { +; 
CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i16> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add i32 [[VEC_PHI]], [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret i32 [[BIN_RDX]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i16 + %mul = mul i16 %c, %ext.a + %mul.ext = zext i16 %mul to i32 + %add = add i32 %mul.ext, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + declare float @llvm.fmuladd.f32(float, float, float) !6 = distinct !{!6, !7, !8} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 06b0448..291ada8 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -800,3 +800,545 @@ exit: %r.0.lcssa = phi i64 [ %rdx.next, %loop 
] ret i64 %r.0.lcssa } + +define i32 @print_mulacc_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_mulacc_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32)) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT 
vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<63> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<scalar.ph>: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 63 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +; Constants >= 128 cannot be treated as sign-extended, so the expression shouldn't extend 128 +define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_mulacc_not_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: 
Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul ir<%l.ext>, ir<128>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, 
ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<scalar.ph>: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = sext i8 %l to i32 + %mul = mul i32 %l.ext, 128 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + %red.next.lcssa = phi i32 [ %red.next, %loop ] + ret i32 %red.next.lcssa +} + +define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_ext_mulacc_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64)) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: 
Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST vp<%4> = zext ir<%l> to i64 +; CHECK-NEXT: WIDEN ir<%mul> = mul vp<%4>, ir<63> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%6> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%6> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<scalar.ph>: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, 
ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%6>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 63 + %mul.ext = zext i32 %mul to i64 + %red.next = add i64 %red, %mul.ext + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %red.next +} + +; Constants >= 128 cannot be treated as sign-extended, so the expression shouldn't extend 128 +define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_ext_mulacc_not_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (ir<%mul> sext to i64) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: 
scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> +; CHECK-NEXT: WIDEN-CAST ir<%mul.ext> = sext ir<%mul> to i64 +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul.ext>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<scalar.ph>: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: 
vp<%bc.merge.rdx> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = sext i8 %l to i32 + %mul = mul i32 %l.ext, 128 + %mul.ext = sext i32 %mul to i64 + %red.next = add i64 %red, %mul.ext + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + %red.next.lcssa = phi i64 [ %red.next, %loop ] + ret i64 %red.next.lcssa +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll new file mode 100644 index 0000000..9e96e93 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i64 @test(ptr %arg1, i64 %alloca.promoted344, i8 %load.311.i, i1 %load1.i) { +; CHECK-LABEL: define i64 @test( +; CHECK-SAME: ptr [[ARG1:%.*]], i64 [[ALLOCA_PROMOTED344:%.*]], i8 [[LOAD_311_I:%.*]], i1 [[LOAD1_I:%.*]]) { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> <i8 0, i8 0, i8 0, i8 poison>, i8 [[LOAD_311_I]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> <i8 poison, i8 poison, i8 0, i8 0>, i8 [[LOAD_311_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[ALLOCA_PROMOTED344]], i32 0 +; CHECK-NEXT: br label %[[BB2:.*]] +; CHECK: [[BB2]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[BB]] ], [ [[TMP28:%.*]], %[[BB12_8_I:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <8 x i8> [ zeroinitializer, %[[BB]] ], [ [[TMP29:%.*]], %[[BB12_8_I]] ] +; CHECK-NEXT: br i1 [[LOAD1_I]], label %[[SPAM_EXIT:.*]], label %[[BB4_LR_PH_I:.*]] +; CHECK: [[BB4_LR_PH_I]]: +; CHECK-NEXT: br i1 true, label %[[BB3_I_I_PEEL:.*]], label %[[EGGS_EXIT_I_PEEL:.*]] +; CHECK: [[BB3_I_I_PEEL]]: +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP3]], splat (i64 1) +; CHECK-NEXT: [[LOAD4_I_I_PEEL:%.*]] = load i64, ptr [[ARG1]], align 8 +; CHECK-NEXT: [[SHL_I_I_PEEL:%.*]] = shl i64 [[LOAD4_I_I_PEEL]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 poison, i32 0> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[SHL_I_I_PEEL]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i64> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: br label %[[EGGS_EXIT_I_PEEL]] +; CHECK: [[EGGS_EXIT_I_PEEL]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i64> [ [[TMP10]], %[[BB3_I_I_PEEL]] ], [ zeroinitializer, %[[BB4_LR_PH_I]] ] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> poison, <4 x 
i32> <i32 0, i32 1, i32 0, i32 0> +; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: br label %[[SPAM_EXIT]] +; CHECK: [[SPAM_EXIT]]: +; CHECK-NEXT: [[GETELEMENTPTR_I_I_PROMOTED346:%.*]] = phi i64 [ [[TMP14]], %[[EGGS_EXIT_I_PEEL]] ], [ 0, %[[BB2]] ] +; CHECK-NEXT: [[LOAD_8_I:%.*]] = phi i8 [ 0, %[[EGGS_EXIT_I_PEEL]] ], [ 1, %[[BB2]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i8> [ [[TMP13]], %[[EGGS_EXIT_I_PEEL]] ], [ zeroinitializer, %[[BB2]] ] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: br i1 [[LOAD1_I]], label %[[BB12_8_I]], label %[[BB12_1_THREAD_I:.*]] +; CHECK: [[BB12_1_THREAD_I]]: +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i8> [[TMP4]], i32 0 +; CHECK-NEXT: [[ICMP5_3_I:%.*]] = icmp eq i8 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[ICMP5_3_I]], label %[[BB12_3_I:.*]], label %[[BB8_3_I:.*]] +; CHECK: [[BB8_3_I]]: +; CHECK-NEXT: br label %[[BB12_3_I]] +; CHECK: [[BB12_3_I]]: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i8> [[TMP4]], i32 1 +; CHECK-NEXT: [[ICMP5_4_I:%.*]] = icmp eq i8 [[TMP18]], 0 +; CHECK-NEXT: br i1 [[ICMP5_4_I]], label %[[BB12_4_I:.*]], label %[[BB8_4_I:.*]] +; CHECK: [[BB8_4_I]]: +; CHECK-NEXT: br label %[[BB12_4_I]] +; CHECK: [[BB12_4_I]]: +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i8> [[TMP4]], i32 2 +; CHECK-NEXT: [[ICMP5_5_I:%.*]] = icmp eq i8 [[TMP19]], 0 +; CHECK-NEXT: br i1 [[ICMP5_5_I]], label %[[BB12_5_I:.*]], label %[[BB8_5_I:.*]] +; CHECK: [[BB8_5_I]]: +; CHECK-NEXT: br label %[[BB12_5_I]] +; CHECK: [[BB12_5_I]]: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i8> [[TMP4]], i32 3 +; CHECK-NEXT: [[ICMP5_7_I:%.*]] = icmp eq i8 [[TMP20]], 0 +; CHECK-NEXT: br i1 [[ICMP5_7_I]], label %[[BB12_7_I:.*]], label %[[BB8_7_I:.*]] +; CHECK: [[BB8_7_I]]: +; CHECK-NEXT: br label %[[BB12_7_I]] +; CHECK: [[BB12_7_I]]: +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i8> [[TMP4]], i32 4 +; CHECK-NEXT: [[ICMP5_8_I:%.*]] = icmp eq i8 [[TMP21]], 0 +; CHECK-NEXT: br i1 [[ICMP5_8_I]], label %[[BB12_8_I]], label %[[BB8_8_I:.*]] +; CHECK: [[BB8_8_I]]: +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[LOAD_8_I]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i8> poison, i8 [[LOAD_8_I]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <4 x i32> <i32 poison, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7> +; CHECK-NEXT: br label %[[BB12_8_I]] +; CHECK: [[BB12_8_I]]: +; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i8> [ [[TMP0]], %[[BB12_7_I]] ], [ [[TMP22]], %[[BB8_8_I]] ], [ [[TMP15]], %[[SPAM_EXIT]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i8> [ zeroinitializer, %[[BB12_7_I]] ], [ [[TMP25]], %[[BB8_8_I]] ], [ [[TMP16]], %[[SPAM_EXIT]] ] +; CHECK-NEXT: [[TMP28]] = insertelement <2 x i64> [[TMP2]], i64 [[GETELEMENTPTR_I_I_PROMOTED346]], i32 1 +; CHECK-NEXT: [[TMP29]] = shufflevector <4 x i8> [[TMP26]], <4 x i8> [[TMP27]], <8 x i32> <i32 2, i32 7, i32 5, i32 0, i32 1, i32 3, i32 4, i32 6> +; CHECK-NEXT: br label %[[BB2]] +; +bb: + br label %bb2 + +bb2: + %getelementptr.i.i.promoted = phi i64 [ 0, %bb ], [ %getelementptr.i.i.promoted346, %bb12.8.i ] + %alloca.promoted = phi i64 [ 0, %bb ], [ %alloca.promoted344, %bb12.8.i ] + %load.8.i231 = phi i8 [ 0, %bb ], [ %load.8.i239, %bb12.8.i ] + %load.7.i217 = phi i8 [ 0, 
%bb ], [ %load.7.i225, %bb12.8.i ] + %load.626.i200 = phi i8 [ 0, %bb ], [ %load.626.i208, %bb12.8.i ] + %load.6.i183 = phi i8 [ 0, %bb ], [ %load.6.i191, %bb12.8.i ] + %load.5.i167 = phi i8 [ 0, %bb ], [ %load.5.i175, %bb12.8.i ] + %load.418.i148 = phi i8 [ 0, %bb ], [ %load.418.i156, %bb12.8.i ] + %load.4.i129 = phi i8 [ 0, %bb ], [ %load.4.i137, %bb12.8.i ] + %load.3.i111 = phi i8 [ 0, %bb ], [ %load.3.i119, %bb12.8.i ] + br i1 %load1.i, label %spam.exit, label %bb4.lr.ph.i + +bb4.lr.ph.i: + br i1 true, label %bb3.i.i.peel, label %eggs.exit.i.peel + +bb3.i.i.peel: + %and.i.i.peel = and i64 %alloca.promoted, 1 + %load4.i.i.peel = load i64, ptr %arg1, align 8 + %shl.i.i.peel = shl i64 %load4.i.i.peel, 1 + %or.i.i.peel = or i64 %shl.i.i.peel, %and.i.i.peel + %and6.i.i.peel = and i64 %getelementptr.i.i.promoted, 1 + %xor.i.i.peel = xor i64 %and6.i.i.peel, %alloca.promoted + br label %eggs.exit.i.peel + +eggs.exit.i.peel: + %load5.i.i93.peel = phi i64 [ %xor.i.i.peel, %bb3.i.i.peel ], [ 0, %bb4.lr.ph.i ] + %or.i.i91.peel = phi i64 [ %or.i.i.peel, %bb3.i.i.peel ], [ 0, %bb4.lr.ph.i ] + %0 = trunc i64 %or.i.i91.peel to i8 + %1 = trunc nuw i64 %or.i.i91.peel to i8 + %2 = trunc i64 %load5.i.i93.peel to i8 + br label %spam.exit + +spam.exit: + %getelementptr.i.i.promoted346 = phi i64 [ %load5.i.i93.peel, %eggs.exit.i.peel ], [ 0, %bb2 ] + %load.834.i = phi i8 [ %2, %eggs.exit.i.peel ], [ 0, %bb2 ] + %load.7.i25 = phi i8 [ %1, %eggs.exit.i.peel ], [ 0, %bb2 ] + %load.8.i = phi i8 [ 0, %eggs.exit.i.peel ], [ 1, %bb2 ] + %load.6.i18 = phi i8 [ %0, %eggs.exit.i.peel ], [ 0, %bb2 ] + br i1 %load1.i, label %bb12.8.i, label %bb12.1.thread.i + +bb12.1.thread.i: + %icmp5.3.i = icmp eq i8 %load.3.i111, 0 + br i1 %icmp5.3.i, label %bb12.3.i, label %bb8.3.i + +bb8.3.i: + br label %bb12.3.i + +bb12.3.i: + %icmp5.4.i = icmp eq i8 %load.4.i129, 0 + br i1 %icmp5.4.i, label %bb12.4.i, label %bb8.4.i + +bb8.4.i: + br label %bb12.4.i + +bb12.4.i: + %icmp5.5.i = icmp eq i8 %load.5.i167, 0 + br i1 %icmp5.5.i, label %bb12.5.i, label %bb8.5.i + +bb8.5.i: + br label %bb12.5.i + +bb12.5.i: + %icmp5.7.i = icmp eq i8 %load.7.i217, 0 + br i1 %icmp5.7.i, label %bb12.7.i, label %bb8.7.i + +bb8.7.i: + br label %bb12.7.i + +bb12.7.i: + %icmp5.8.i = icmp eq i8 %load.8.i231, 0 + br i1 %icmp5.8.i, label %bb12.8.i, label %bb8.8.i + +bb8.8.i: + br label %bb12.8.i + +bb12.8.i: + %load.8.i239 = phi i8 [ 0, %bb12.7.i ], [ %load.8.i, %bb8.8.i ], [ %load.834.i, %spam.exit ] + %load.7.i225 = phi i8 [ 0, %bb12.7.i ], [ %load.311.i, %bb8.8.i ], [ %load.7.i25, %spam.exit ] + %load.626.i208 = phi i8 [ 0, %bb12.7.i ], [ %load.8.i, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.6.i191 = phi i8 [ %load.311.i, %bb12.7.i ], [ 0, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.5.i175 = phi i8 [ 0, %bb12.7.i ], [ %load.6.i183, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.418.i156 = phi i8 [ 0, %bb12.7.i ], [ %load.626.i200, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.4.i137 = phi i8 [ 0, %bb12.7.i ], [ %load.418.i148, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.3.i119 = phi i8 [ 0, %bb12.7.i ], [ 0, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + br label %bb2 +} diff --git a/llvm/test/Transforms/SimplifyCFG/pr165088.ll b/llvm/test/Transforms/SimplifyCFG/pr165088.ll new file mode 100644 index 0000000..4514a19 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr165088.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes="simplifycfg<switch-range-to-icmp>" 
< %s | FileCheck %s + +; Avoid getting stuck in the cycle pr165088_cycle_[1-4]. + +define void @pr165088_cycle_1(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_1( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND2:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND2]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %switch = icmp uge i8 %x, 2 + %cond1 = icmp ugt i8 %x, 1 + %or.cond = and i1 %switch, %cond1 + br i1 %or.cond, label %block3, label %block2 + +block1: + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: + br label %block3 + +block3: + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: + ret void +} + +define void @pr165088_cycle_2(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND2:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND2]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + switch i8 %x, label %block3 [ + i8 1, label %block2 + i8 0, label %block2 + ] + +block1: ; preds = %block3 + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: ; preds = %entry, %entry, %block1 + br label %block3 + +block3: ; preds = %entry, %block2, %block1 + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} + +define void @pr165088_cycle_3(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_3( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[BLOCK3:.*]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK3]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + switch i8 %x, label %block1 [ + i8 1, label %block2 + i8 0, label %block2 + ] + +block1: ; preds = %entry, %block3 + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: ; preds = %entry, %entry, %block1 + br label %block3 + +block3: ; preds = %block2, %block1 + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} + +define void @pr165088_cycle_4(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_4( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND2_OLD:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND2_OLD]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 
+; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %switch = icmp ult i8 %x, 2 + br i1 %switch, label %block2, label %block1 + +block1: ; preds = %entry, %block3 + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: ; preds = %entry, %block1 + br label %block3 + +block3: ; preds = %block2, %block1 + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} + +define void @pr165088_original(i8 %x) { +; CHECK-LABEL: define void @pr165088_original( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND3_OLD_OLD:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND3_OLD_OLD]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND4:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND4]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %cond = icmp ne i8 %x, 0 + %cond3 = icmp ne i8 %x, 0 + %or.cond = and i1 %cond, %cond3 + br i1 %or.cond, label %block3, label %block2 + +block1: ; preds = %block3 + %cond3.old = icmp ugt i8 %x, 1 + br i1 %cond3.old, label %block3, label %block2 + +block2: ; preds = %block1, %entry + br label %block3 + +block3: ; preds = %block2, %block1, %entry + %cond4 = icmp eq i8 %x, 0 + br i1 %cond4, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll new file mode 100644 index 0000000..a804225 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -S | FileCheck %s + +; Test whether the UTC format the switch-cases correctly, which requires TWO extra spaces. + +define i8 @testi8(i8 %x) { + switch i8 %x, label %default [ + i8 0, label %case1 + i8 1, label %case2 + i8 2, label %case3 + i8 3, label %case3 + ] +default: + ret i8 0 +case1: + ret i8 1 +case2: + ret i8 2 +case3: + ret i8 3 +} + +define i32 @testi32(i32 %x) { + switch i32 %x, label %default [ + i32 0, label %case1 + i32 1, label %case2 + i32 2, label %case3 + i32 3, label %case3 + ] +default: + ret i32 0 +case1: + ret i32 1 +case2: + ret i32 2 +case3: + ret i32 3 +} + +define i128 @testi128(i128 %x) { + switch i128 %x, label %default [ + i128 0, label %case1 + i128 1, label %case2 + i128 2, label %case3 + i128 3, label %case3 + ] +default: + ret i128 0 +case1: + ret i128 1 +case2: + ret i128 2 +case3: + ret i128 3 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected new file mode 100644 index 0000000..b1977e7 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 7 +; RUN: opt < %s -S | FileCheck %s + +; Test whether the UTC format the switch-cases correctly, which requires TWO extra spaces. 
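(Editorial aside, not part of the patch: a minimal Python sketch of the version-7 indentation rule these switch_case tests exercise. It mirrors the IS_SWITCH_CASE_RE regex added to UpdateTestChecks/common.py and the indent selection added to update_test_checks.py later in this diff; the `reindent` helper name is invented purely for illustration.)

```python
# Minimal sketch of the version-7 re-indentation rule (illustrative only).
import re

# Regexes as added to llvm/utils/UpdateTestChecks/common.py in this patch.
IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\w+")
IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_")
SCRUB_LEADING_WHITESPACE_RE = re.compile(r"^(\s+)")

def reindent(line, version=7):
    # Switch cases (and debug records) get four leading spaces; everything
    # else keeps the usual two, matching the update_test_checks.py change.
    special = IS_DEBUG_RECORD_RE.match(line) or (
        version > 6 and IS_SWITCH_CASE_RE.match(line)
    )
    indent = " " * 4 if special else " " * 2
    return SCRUB_LEADING_WHITESPACE_RE.sub(indent, line)

assert reindent(" i8 3, label %case3") == "    i8 3, label %case3"
assert reindent("   ret i8 0") == "  ret i8 0"
```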
+ +define i8 @testi8(i8 %x) { +; CHECK-LABEL: define i8 @testi8( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: switch i8 [[X]], label %[[DEFAULT:.*]] [ +; CHECK-NEXT: i8 0, label %[[CASE1:.*]] +; CHECK-NEXT: i8 1, label %[[CASE2:.*]] +; CHECK-NEXT: i8 2, label %[[CASE3:.*]] +; CHECK-NEXT: i8 3, label %[[CASE3]] +; CHECK-NEXT: ] +; CHECK: [[DEFAULT]]: +; CHECK-NEXT: ret i8 0 +; CHECK: [[CASE1]]: +; CHECK-NEXT: ret i8 1 +; CHECK: [[CASE2]]: +; CHECK-NEXT: ret i8 2 +; CHECK: [[CASE3]]: +; CHECK-NEXT: ret i8 3 +; + switch i8 %x, label %default [ + i8 0, label %case1 + i8 1, label %case2 + i8 2, label %case3 + i8 3, label %case3 + ] +default: + ret i8 0 +case1: + ret i8 1 +case2: + ret i8 2 +case3: + ret i8 3 +} + +define i32 @testi32(i32 %x) { +; CHECK-LABEL: define i32 @testi32( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: switch i32 [[X]], label %[[DEFAULT:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE1:.*]] +; CHECK-NEXT: i32 1, label %[[CASE2:.*]] +; CHECK-NEXT: i32 2, label %[[CASE3:.*]] +; CHECK-NEXT: i32 3, label %[[CASE3]] +; CHECK-NEXT: ] +; CHECK: [[DEFAULT]]: +; CHECK-NEXT: ret i32 0 +; CHECK: [[CASE1]]: +; CHECK-NEXT: ret i32 1 +; CHECK: [[CASE2]]: +; CHECK-NEXT: ret i32 2 +; CHECK: [[CASE3]]: +; CHECK-NEXT: ret i32 3 +; + switch i32 %x, label %default [ + i32 0, label %case1 + i32 1, label %case2 + i32 2, label %case3 + i32 3, label %case3 + ] +default: + ret i32 0 +case1: + ret i32 1 +case2: + ret i32 2 +case3: + ret i32 3 +} + +define i128 @testi128(i128 %x) { +; CHECK-LABEL: define i128 @testi128( +; CHECK-SAME: i128 [[X:%.*]]) { +; CHECK-NEXT: switch i128 [[X]], label %[[DEFAULT:.*]] [ +; CHECK-NEXT: i128 0, label %[[CASE1:.*]] +; CHECK-NEXT: i128 1, label %[[CASE2:.*]] +; CHECK-NEXT: i128 2, label %[[CASE3:.*]] +; CHECK-NEXT: i128 3, label %[[CASE3]] +; CHECK-NEXT: ] +; CHECK: [[DEFAULT]]: +; CHECK-NEXT: ret i128 0 +; CHECK: [[CASE1]]: +; CHECK-NEXT: ret i128 1 +; CHECK: [[CASE2]]: +; CHECK-NEXT: ret i128 2 +; CHECK: [[CASE3]]: +; CHECK-NEXT: ret i128 3 +; + switch i128 %x, label %default [ + i128 0, label %case1 + i128 1, label %case2 + i128 2, label %case3 + i128 3, label %case3 + ] +default: + ret i128 0 +case1: + ret i128 1 +case2: + ret i128 2 +case3: + ret i128 3 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test new file mode 100644 index 0000000..891dbe0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/switch_case.test @@ -0,0 +1,3 @@ +## switch_case test checking that update_test_checks.py works correctly +# RUN: cp -f %S/Inputs/switch_case.ll %t.ll && %update_test_checks %t.ll --version 7 +# RUN: diff -u %t.ll %S/Inputs/switch_case.ll.expected diff --git a/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s b/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s index becd9d1..519edf04 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s +++ b/llvm/test/tools/llvm-dwarfdump/X86/type_units_split_dwp_v4.s @@ -1,6 +1,12 @@ ## This test uses TU index for type parsing in dwp and makes sure the DWARF4 type is ## successfully retrieved. +## cd to a unique dir so we can refer to the file as just "test.dwo" in the +## assembly test input below. 
+# RUN: rm -rf %t +# RUN: mkdir %t +# RUN: cd %t + # RUN: llvm-mc %s --split-dwarf-file=test.dwo -filetype obj -triple x86_64 -o test.o # RUN: llvm-dwp -e test.o -o test.dwp # RUN: llvm-dwarfdump test.dwp | FileCheck %s diff --git a/llvm/unittests/IR/AbstractCallSiteTest.cpp b/llvm/unittests/IR/AbstractCallSiteTest.cpp index ddb1091..623d1b3 100644 --- a/llvm/unittests/IR/AbstractCallSiteTest.cpp +++ b/llvm/unittests/IR/AbstractCallSiteTest.cpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/AsmParser/Parser.h" #include "llvm/IR/AbstractCallSite.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Argument.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" @@ -51,5 +52,96 @@ TEST(AbstractCallSite, CallbackCall) { EXPECT_TRUE(ACS); EXPECT_TRUE(ACS.isCallbackCall()); EXPECT_TRUE(ACS.isCallee(CallbackUse)); + EXPECT_EQ(ACS.getCalleeUseForCallback(), *CallbackUse); EXPECT_EQ(ACS.getCalledFunction(), Callback); + + // The callback metadata {CallbackNo, Arg0No, ..., isVarArg} = {1, -1, true} + EXPECT_EQ(ACS.getCallArgOperandNoForCallee(), 1); + // Though the callback metadata only specifies ONE unfixed argument No, the + // callback callee is vararg, hence the third arg is also considered as + // another arg for the callback. + EXPECT_EQ(ACS.getNumArgOperands(), 2u); + Argument *Param0 = Callback->getArg(0), *Param1 = Callback->getArg(1); + ASSERT_TRUE(Param0 && Param1); + EXPECT_EQ(ACS.getCallArgOperandNo(*Param0), -1); + EXPECT_EQ(ACS.getCallArgOperandNo(*Param1), 2); +} + +TEST(AbstractCallSite, DirectCall) { + LLVMContext C; + + const char *IR = "declare void @bar(i32 %x, i32 %y)\n" + "define void @foo() {\n" + " call void @bar(i32 1, i32 2)\n" + " ret void\n" + "}\n"; + + std::unique_ptr<Module> M = parseIR(C, IR); + ASSERT_TRUE(M); + + Function *Callee = M->getFunction("bar"); + ASSERT_NE(Callee, nullptr); + + const Use *DirectCallUse = Callee->getSingleUndroppableUse(); + ASSERT_NE(DirectCallUse, nullptr); + + AbstractCallSite ACS(DirectCallUse); + EXPECT_TRUE(ACS); + EXPECT_TRUE(ACS.isDirectCall()); + EXPECT_TRUE(ACS.isCallee(DirectCallUse)); + EXPECT_EQ(ACS.getCalledFunction(), Callee); + EXPECT_EQ(ACS.getNumArgOperands(), 2u); + Argument *ArgX = Callee->getArg(0); + ASSERT_NE(ArgX, nullptr); + Value *CAO1 = ACS.getCallArgOperand(*ArgX); + Value *CAO2 = ACS.getCallArgOperand(0); + ASSERT_NE(CAO2, nullptr); + // The two call arg operands should be the same object, since they are both + // the first argument of the call. 
+ EXPECT_EQ(CAO2, CAO1); + + ConstantInt *FirstArgInt = dyn_cast<ConstantInt>(CAO2); + ASSERT_NE(FirstArgInt, nullptr); + EXPECT_EQ(FirstArgInt->getZExtValue(), 1ull); + + EXPECT_EQ(ACS.getCallArgOperandNo(*ArgX), 0); + EXPECT_EQ(ACS.getCallArgOperandNo(0), 0); + EXPECT_EQ(ACS.getCallArgOperandNo(1), 1); +} + +TEST(AbstractCallSite, IndirectCall) { + LLVMContext C; + + const char *IR = "define void @foo(ptr %0) {\n" + " call void %0(i32 1, i32 2)\n" + " ret void\n" + "}\n"; + + std::unique_ptr<Module> M = parseIR(C, IR); + ASSERT_TRUE(M); + + Function *Fun = M->getFunction("foo"); + ASSERT_NE(Fun, nullptr); + + Argument *ArgAsCallee = Fun->getArg(0); + ASSERT_NE(ArgAsCallee, nullptr); + + const Use *IndCallUse = ArgAsCallee->getSingleUndroppableUse(); + ASSERT_NE(IndCallUse, nullptr); + + AbstractCallSite ACS(IndCallUse); + EXPECT_TRUE(ACS); + EXPECT_TRUE(ACS.isIndirectCall()); + EXPECT_TRUE(ACS.isCallee(IndCallUse)); + EXPECT_EQ(ACS.getCalledFunction(), nullptr); + EXPECT_EQ(ACS.getCalledOperand(), ArgAsCallee); + EXPECT_EQ(ACS.getNumArgOperands(), 2u); + Value *CalledOperand = ACS.getCallArgOperand(0); + ASSERT_NE(CalledOperand, nullptr); + ConstantInt *FirstArgInt = dyn_cast<ConstantInt>(CalledOperand); + ASSERT_NE(FirstArgInt, nullptr); + EXPECT_EQ(FirstArgInt->getZExtValue(), 1ull); + + EXPECT_EQ(ACS.getCallArgOperandNo(0), 0); + EXPECT_EQ(ACS.getCallArgOperandNo(1), 1); } diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index dd17844..8641b93 100644 --- a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -914,7 +914,7 @@ TEST_P(MaybeSparseInstrProfTest, annotate_vp_data) { ASSERT_THAT(ValueData, SizeIs(0)); // Remove the MD_prof metadata - Inst->setMetadata(LLVMContext::MD_prof, 0); + Inst->setMetadata(LLVMContext::MD_prof, nullptr); // Annotate 5 records this time. annotateValueSite(*M, *Inst, R.get(), IPVK_IndirectCallTarget, 0, 5); ValueData = getValueProfDataFromInst(*Inst, IPVK_IndirectCallTarget, 5, T); @@ -932,7 +932,7 @@ TEST_P(MaybeSparseInstrProfTest, annotate_vp_data) { ASSERT_EQ(2U, ValueData[4].Count); // Remove the MD_prof metadata - Inst->setMetadata(LLVMContext::MD_prof, 0); + Inst->setMetadata(LLVMContext::MD_prof, nullptr); // Annotate with 4 records. InstrProfValueData VD0Sorted[] = {{1000, 6}, {2000, 5}, {3000, 4}, {4000, 3}, {5000, 2}, {6000, 1}}; diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index a5e3c39..8cd200c9 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -29,6 +29,7 @@ Version changelog: 'none' and 'all'. 'smart' is the default. 5: Basic block labels are matched by FileCheck expressions 6: The semantics of TBAA checks has been incorporated in the check lines. +7: Indent switch-cases correctly. 
""" DEFAULT_VERSION = 6 @@ -606,6 +607,7 @@ MARCH_ARG_RE = re.compile(r"-march[= ]([^ ]+)") DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)") IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_") +IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\w+") SCRUB_LEADING_WHITESPACE_RE = re.compile(r"^(\s+)") SCRUB_WHITESPACE_RE = re.compile(r"(?!^(| \w))[ \t]+", flags=re.M) diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index 406a728..dff7f78 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -508,7 +508,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--token", type=str, required=True, help="GitHub authentiation token" + "--token", type=str, required=True, help="GitHub authentication token" ) parser.add_argument( "--repo", diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index f883145..9fba96a 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -945,7 +945,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): path = ( cmd_shenv.env["PATH"] if "PATH" in cmd_shenv.env else shenv.env["PATH"] ) - executable = lit.util.which(args[0], shenv.env["PATH"]) + executable = lit.util.which(args[0], path) if not executable: raise InternalShellError(j, "%r: command not found" % args[0]) diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg new file mode 100644 index 0000000..36517f9 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/lit.cfg @@ -0,0 +1,8 @@ +import lit.formats + +config.name = "shtest-env-path" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None +config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt b/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt new file mode 100644 index 0000000..b36e861 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/path.txt @@ -0,0 +1,8 @@ +## Tests env command for setting the PATH variable. + +## Check that test.sh can be found using the configured PATH. +# +# RUN: env PATH=%S test.sh | FileCheck --check-prefix=CHECK %s +# + +# CHECK: TEST-ENV-PATH-123 diff --git a/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh b/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh new file mode 100755 index 0000000..a1e46fc --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-env-path/test.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "TEST-ENV-PATH-123" + diff --git a/llvm/utils/lit/tests/shtest-env-path.py b/llvm/utils/lit/tests/shtest-env-path.py new file mode 100644 index 0000000..bf459ae --- /dev/null +++ b/llvm/utils/lit/tests/shtest-env-path.py @@ -0,0 +1,13 @@ +## Tests env command for setting the PATH variable. + +# The test is using /bin/sh. Limit to system known to have /bin/sh. +# REQUIRES: system-linux + +# RUN: %{lit} -a -v %{inputs}/shtest-env-path/path.txt \ +# RUN: | FileCheck -match-full-lines %s +# +# END. 
+ +# CHECK: -- Testing: 1 tests{{.*}} +# CHECK: PASS: shtest-env-path :: path.txt (1 of 1) +# CHECK: -- diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 3d07b16..aef7c09 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -550,6 +550,7 @@ tools/UpdateTestChecks/update_test_checks/stable_ir_values5.test tools/UpdateTestChecks/update_test_checks/stable_ir_values6.test tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test tools/UpdateTestChecks/update_test_checks/stable_ir_values.test +tools/UpdateTestChecks/update_test_checks/switch_case.test tools/UpdateTestChecks/update_test_checks/tbaa-semantics-checks.test tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll diff --git a/llvm/utils/update_test_checks.py b/llvm/utils/update_test_checks.py index 3b562fb..42227b2 100755 --- a/llvm/utils/update_test_checks.py +++ b/llvm/utils/update_test_checks.py @@ -260,9 +260,17 @@ def update_test(ti: common.TestInfo): skip_same_checks=dropped_previous_line, ): # This input line of the function body will go as-is into the output. - # Except make leading whitespace uniform: 2 spaces. 4 for debug records. + # Except make leading whitespace uniform: 2 spaces. 4 for debug records/switch cases. indent = ( - " " if not common.IS_DEBUG_RECORD_RE.match(input_line) else " " + " " * 4 + if ( + common.IS_DEBUG_RECORD_RE.match(input_line) + or ( + ti.args.version > 6 + and common.IS_SWITCH_CASE_RE.match(input_line) + ) + ) + else " " * 2 ) input_line = common.SCRUB_LEADING_WHITESPACE_RE.sub(indent, input_line) output_lines.append(input_line) diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h index 8bcfe51..3c87c45 100644 --- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -397,7 +397,7 @@ protected: /// itself. virtual void visitRegionBranchControlFlowTransfer( RegionBranchOpInterface branch, RegionBranchPoint regionFrom, - RegionBranchPoint regionTo, const AbstractDenseLattice &after, + RegionSuccessor regionTo, const AbstractDenseLattice &after, AbstractDenseLattice *before) { meet(before, after); } @@ -526,7 +526,7 @@ public: /// and "to" regions. virtual void visitRegionBranchControlFlowTransfer( RegionBranchOpInterface branch, RegionBranchPoint regionFrom, - RegionBranchPoint regionTo, const LatticeT &after, LatticeT *before) { + RegionSuccessor regionTo, const LatticeT &after, LatticeT *before) { AbstractDenseBackwardDataFlowAnalysis::visitRegionBranchControlFlowTransfer( branch, regionFrom, regionTo, after, before); } @@ -571,7 +571,7 @@ protected: } void visitRegionBranchControlFlowTransfer( RegionBranchOpInterface branch, RegionBranchPoint regionForm, - RegionBranchPoint regionTo, const AbstractDenseLattice &after, + RegionSuccessor regionTo, const AbstractDenseLattice &after, AbstractDenseLattice *before) final { visitRegionBranchControlFlowTransfer(branch, regionForm, regionTo, static_cast<const LatticeT &>(after), diff --git a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h index 1a33ecf..9855734 100644 --- a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h @@ -286,7 +286,7 @@ private: /// and propagating therefrom. 
virtual void visitRegionSuccessors(ProgramPoint *point, RegionBranchOpInterface branch, - RegionBranchPoint successor, + RegionSuccessor successor, ArrayRef<AbstractSparseLattice *> lattices); }; diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 37db096..45cb67f 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -912,9 +912,10 @@ def ScaledMFMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[32], [F8E5M2, F8E4M3FN VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>; def ScaledMFMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 16], [F32]>]>; // wmma -def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>, - VectorOfLengthAndType<[4, 8, 16], [I8, SI8, UI8]>, - VectorOfLengthAndType<[4, 8], [F8E4M3FN, F8E5M2]>, +def WMMAInTypes : AnyTypeOf<[VectorOfLengthAndType<[2], [F32]>, + VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>, + VectorOfLengthAndType<[4, 8, 16, 32], [I8, SI8, UI8]>, + VectorOfLengthAndType<[4, 8, 32, 64], [F8E4M3FN, F8E5M2]>, VectorOfLengthAndType<[4, 8, 16], [I<4>, SI<4>, UI<4>]>]>; def WMMAOutTypes : AnyTypeOf<[VectorOfLengthAndType<[4, 8], [F32, I32]>, VectorOfLengthAndType<[4, 8, 16], [F16, BF16]>]>; @@ -992,7 +993,7 @@ def AMDGPU_WMMAOp : Arguments<(ins ConfinedAttr<I32Attr, [IntIsOneOf<[16]>]>:$m, ConfinedAttr<I32Attr, [IntIsOneOf<[16]>]>:$n, - ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$k, + ConfinedAttr<I32Attr, [IntIsOneOf<[4, 16, 32, 64, 128]>]>:$k, WMMAInTypes:$sourceA, WMMAInTypes:$sourceB, WMMAOutTypes:$destC, @@ -1005,8 +1006,14 @@ def AMDGPU_WMMAOp : let description = [{ The `amdgpu.wmma` op is an MLIR wrapper around intrinsics for various `wmma` instructions in the AMDGPU architecture, which perform matrix multiplication. - Note that all wmma intrinsics have M=N=16 dimensions but vary by in allowed K - dimensions. + + On gfx11/RDNA3, wmma intrinsics have M=N=K=16 dimensions. + + On gfx12/RDNA4, wmma intrinsics have M=N=16 dimensions and support K=16 for + all element types, and K=32 for i4 sources. + + On gfx1250, wmma intrinsics have M=N=16 and K dimensions of 4, 32, 64, or 128, + depending on the element types. On gfx11/RDNA3, emitting f16->f16 (or bf16->bf16) wmma the output is a 16xf16 (or 16xbf16) vector containing only 8 valid values: @@ -1022,7 +1029,13 @@ def AMDGPU_WMMAOp : Example: ```mlir - %0 = amdgpu.wmma 16x16x16 %matA * %matB + %matC : vector<16xf16>, vector<16xf16>, vector<8xf16> + %0 = amdgpu.wmma 16x16x16 %matA * %matB + %matC : vector<8xf16>, vector<8xf16>, vector<8xf16> + + %1 = amdgpu.wmma 16x16x64 %matD * %matE + %matF : vector<32xi8>, vector<8xf32>, vector<8xf32> + + %2 = amdgpu.wmma 16x16x128 %matG * %matH + %matI : vector<64xf4E2M1FN>, vector<64xf4E2M1FN>, vector<8xf32> + + %3 = amdgpu.wmma 16x16x4 %matJ * %matK + %matL : vector<2xf32>, vector<2xf32>, vector<8xf32> ``` }]; let assemblyFormat = [{ diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index 12a7935..409bd05 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -714,7 +714,7 @@ def AffineParallelOp : Affine_Op<"parallel", operand_range getUpperBoundsOperands(); AffineValueMap getUpperBoundsValueMap(); - /// Sets elements fo the loop upper bound. + /// Sets elements of the loop upper bound. 
void setUpperBounds(ValueRange operands, AffineMap map); void setSteps(ArrayRef<int64_t> newSteps); @@ -999,7 +999,7 @@ def AffineVectorStoreOp : AffineStoreOpBase<"vector_store"> { elemental type, supplied as its second operand. The index for each memref dimension is an affine expression of loop induction variables and symbols. These indices determine the start position - of the write within the memref. The shape of th input vector determines the + of the write within the memref. The shape of the input vector determines the shape of the slice written to the memref. This slice is contiguous along the respective dimensions of the shape. Strided vector stores will be supported in the future. @@ -1188,7 +1188,7 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", If all `N` basis elements are provided, the linearize_index operation is said to "have an outer bound". - As a convenience, and for symmetry with `getPaddedBasis()`, ifg the first + As a convenience, and for symmetry with `getPaddedBasis()`, if the first element of a set of `OpFoldResult`s passed to the builders of this operation is `nullptr`, that element is ignored. diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index d2df244..5241f9a 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -147,6 +147,35 @@ class ROCDL_DimGetterFunctionOp<string mnemonic, string device_function, } //===----------------------------------------------------------------------===// +// ROCDL vector types definitions +//===----------------------------------------------------------------------===// + +class ROCDL_ConcreteVector<Type elem, int length> : + FixedVectorOfLengthAndType<[length], [elem]>, + BuildableType< + "::mlir::VectorType::get({" # length # "} ," + # elem.builderCall # ")">; + +def ROCDL_V2I16Type : ROCDL_ConcreteVector<I16, 2>; +def ROCDL_V2F16Type : ROCDL_ConcreteVector<F16, 2>; +def ROCDL_V2I32Type : ROCDL_ConcreteVector<I32, 2>; +def ROCDL_V2BF16Type : ROCDL_ConcreteVector<BF16, 2>; +def ROCDL_V2F32Type : ROCDL_ConcreteVector<F32, 2>; +def ROCDL_V3I32Type : ROCDL_ConcreteVector<I32, 3>; +def ROCDL_V4I32Type : ROCDL_ConcreteVector<I32, 4>; +def ROCDL_V6I32Type : ROCDL_ConcreteVector<I32, 6>; +def ROCDL_V8I32Type : ROCDL_ConcreteVector<I32, 8>; +def ROCDL_V8BF16Type : ROCDL_ConcreteVector<BF16, 8>; +def ROCDL_V8F16Type : ROCDL_ConcreteVector<F16, 8>; +def ROCDL_V8F32Type : ROCDL_ConcreteVector<F32, 8>; +def ROCDL_V16BF16Type : ROCDL_ConcreteVector<BF16, 16>; +def ROCDL_V16F16Type : ROCDL_ConcreteVector<F16, 16>; +def ROCDL_V16F32Type : ROCDL_ConcreteVector<F32, 16>; +def ROCDL_V32F16Type : ROCDL_ConcreteVector<F16, 32>; +def ROCDL_V32BF16Type : ROCDL_ConcreteVector<BF16, 32>; +def ROCDL_V32F32Type : ROCDL_ConcreteVector<F32, 32>; + +//===----------------------------------------------------------------------===// // Wave-level primitives //===----------------------------------------------------------------------===// @@ -664,6 +693,68 @@ def ROCDL_GlobalLoadLDSOp : } //===---------------------------------------------------------------------===// +// Tensor load/store intrinsics (available in GFX1250) +//===---------------------------------------------------------------------===// + +// Base class for tensor load/store operations with 4 descriptor groups. 
+class ROCDL_TensorLDSIntrOp<string mnemonic> : + ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [4], ["cachePolicy"]> { + dag args = (ins ROCDL_V4I32Type:$dgroup0, ROCDL_V8I32Type:$dgroup1, + ROCDL_V4I32Type:$dgroup2, ROCDL_V4I32Type:$dgroup3, + I32Attr:$cachePolicy); + let arguments = !con(args, baseArgs); + let summary = "Base class for ROCDL tensor load/store to/from LDS."; + let description = [{ + Moves tiles of tensor data between global memory and LDS. The tile is + described by the $dgroup descriptors. 4 $dgroup descriptors allows for + movement of up to 5D tensors. $cachePolicy describes the memory scope and an + indicator of expected data re-use. + + This op is for gfx1250+ architectures. + }]; + let assemblyFormat = [{ + attr-dict operands `cachepolicy` $cachePolicy `:` type($dgroup0) `,` type($dgroup1) + }]; + let extraClassDefinition = [{ + SmallVector<Value> $cppClass::getAccessedOperands() { + return {getDgroup0(), getDgroup1(), getDgroup2(), getDgroup3()}; + } + }]; +} + +// Base class for tensor load/store operations with 2 descriptor groups +// (D2 variant). +class ROCDL_TensorLDSIntrD2Op<string mnemonic> : + ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [2], ["cachePolicy"]> { + dag args = (ins ROCDL_V4I32Type:$dgroup0, ROCDL_V8I32Type:$dgroup1, + I32Attr:$cachePolicy); + let arguments = !con(args, baseArgs); + let summary = "Base class for ROCDL tensor load/store to/from LDS (D2 variant)."; + let description = [{ + Moves tiles of tensor data between global memory and LDS. The tile is + described by the $dgroup descriptors. 2 $dgroup descriptors allows for + movement of up to 2D tensors. $cachePolicy describes the memory scope and an + indicator of expected data re-use. + + This op is for gfx1250+ architectures. + }]; + let assemblyFormat = [{ + attr-dict operands `cachepolicy` $cachePolicy `:` type($dgroup0) `,` type($dgroup1) + }]; + let extraClassDefinition = [{ + SmallVector<Value> $cppClass::getAccessedOperands() { + return {getDgroup0(), getDgroup1()}; + } + }]; +} + +// Tensor load and store operations +def ROCDL_TensorLoadToLDSOp : ROCDL_TensorLDSIntrOp<"tensor.load.to.lds">; +def ROCDL_TensorStoreFromLDSOp : ROCDL_TensorLDSIntrOp<"tensor.store.from.lds">; +def ROCDL_TensorLoadToLDSD2Op : ROCDL_TensorLDSIntrD2Op<"tensor.load.to.lds.d2">; +def ROCDL_TensorStoreFromLDSD2Op : ROCDL_TensorLDSIntrD2Op<"tensor.store.from.lds.d2">; + +//===---------------------------------------------------------------------===// // Operations on raw buffer resources (stride of 0, bounds checks either off or in // raw buffer mode). 
//===---------------------------------------------------------------------===// @@ -932,30 +1023,6 @@ def ROCDL_Permlane32SwapOp : ROCDL_IntrOp<"permlane32.swap", [], [], }]; } -class ROCDL_ConcreteVector<Type elem, int length> : - FixedVectorOfLengthAndType<[length], [elem]>, - BuildableType< - "::mlir::VectorType::get({" # length # "} ," - # elem.builderCall # ")">; - -def ROCDL_V2I16Type : ROCDL_ConcreteVector<I16, 2>; -def ROCDL_V2F16Type : ROCDL_ConcreteVector<F16, 2>; -def ROCDL_V2I32Type : ROCDL_ConcreteVector<I32, 2>; -def ROCDL_V2BF16Type : ROCDL_ConcreteVector<BF16, 2>; -def ROCDL_V2F32Type : ROCDL_ConcreteVector<F32, 2>; -def ROCDL_V3I32Type : ROCDL_ConcreteVector<I32, 3>; -def ROCDL_V6I32Type : ROCDL_ConcreteVector<I32, 6>; -def ROCDL_V8I32Type : ROCDL_ConcreteVector<I32, 8>; -def ROCDL_V8BF16Type : ROCDL_ConcreteVector<BF16, 8>; -def ROCDL_V8F16Type : ROCDL_ConcreteVector<F16, 8>; -def ROCDL_V8F32Type : ROCDL_ConcreteVector<F32, 8>; -def ROCDL_V16BF16Type : ROCDL_ConcreteVector<BF16, 16>; -def ROCDL_V16F16Type : ROCDL_ConcreteVector<F16, 16>; -def ROCDL_V16F32Type : ROCDL_ConcreteVector<F32, 16>; -def ROCDL_V32F16Type : ROCDL_ConcreteVector<F16, 32>; -def ROCDL_V32BF16Type : ROCDL_ConcreteVector<BF16, 32>; -def ROCDL_V32F32Type : ROCDL_ConcreteVector<F32, 32>; - //===---------------------------------------------------------------------===// // 16-bit float intrinsics //===---------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 2f87975..a18c18a 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2117,6 +2117,56 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels", } //===----------------------------------------------------------------------===// +// acc.kernel_environment +//===----------------------------------------------------------------------===// + +def OpenACC_KernelEnvironmentOp : OpenACC_Op<"kernel_environment", + [AttrSizedOperandSegments, RecursiveMemoryEffects, SingleBlock, + NoTerminator, + MemoryEffects<[MemWrite<OpenACC_ConstructResource>, + MemRead<OpenACC_CurrentDeviceIdResource>]>]> { + let summary = "Decomposition of compute constructs to capture data mapping " + "and asynchronous behavior information"; + let description = [{ + The `acc.kernel_environment` operation represents a decomposition of + any OpenACC compute construct (acc.kernels, acc.parallel, or + acc.serial) that captures data mapping and asynchronous behavior: + - data clause operands + - async clause operands + - wait clause operands + + This allows kernel execution parallelism and privatization to be + handled separately, facilitating eventual lowering to GPU dialect where + kernel launching and compute offloading are handled separately. 
+ }]; + + let arguments = (ins + Variadic<AnyType>:$dataClauseOperands, + Variadic<IntOrIndex>:$asyncOperands, + OptionalAttr<DeviceTypeArrayAttr>:$asyncOperandsDeviceType, + OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly, + Variadic<IntOrIndex>:$waitOperands, + OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments, + OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType, + OptionalAttr<BoolArrayAttr>:$hasWaitDevnum, + OptionalAttr<DeviceTypeArrayAttr>:$waitOnly); + + let regions = (region SizedRegion<1>:$region); + + let assemblyFormat = [{ + oilist( + `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` + | `async` `` custom<DeviceTypeOperandsWithKeywordOnly>($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly) + | `wait` `` custom<WaitClause>($waitOperands, type($waitOperands), + $waitOperandsDeviceType, $waitOperandsSegments, $hasWaitDevnum, + $waitOnly) + ) + $region attr-dict + }]; +} + +//===----------------------------------------------------------------------===// // 2.6.5 data Construct //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td index fadd3fc..cd033c1 100644 --- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td @@ -77,7 +77,7 @@ def ConditionOp : SCF_Op<"condition", [ //===----------------------------------------------------------------------===// def ExecuteRegionOp : SCF_Op<"execute_region", [ - DeclareOpInterfaceMethods<RegionBranchOpInterface>]> { + DeclareOpInterfaceMethods<RegionBranchOpInterface>, RecursiveMemoryEffects]> { let summary = "operation that executes its region exactly once"; let description = [{ The `scf.execute_region` operation is used to allow multiple blocks within SCF @@ -644,6 +644,13 @@ def ForallOp : SCF_Op<"forall", [ /// Returns true if the mapping specified for this forall op is linear. 
bool usesLinearMapping(); + + /// RegionBranchOpInterface + + OperandRange getEntrySuccessorOperands(RegionSuccessor successor) { + return getInits(); + } + }]; } diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index 62e66b3..ed69287 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -25,7 +25,7 @@ include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td" def AlternativesOp : TransformDialectOp<"alternatives", [DeclareOpInterfaceMethods<RegionBranchOpInterface, - ["getEntrySuccessorOperands", "getSuccessorRegions", + ["getEntrySuccessorOperands", "getRegionInvocationBounds"]>, DeclareOpInterfaceMethods<TransformOpInterface>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, @@ -624,7 +624,7 @@ def ForeachOp : TransformDialectOp<"foreach", [DeclareOpInterfaceMethods<TransformOpInterface>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<RegionBranchOpInterface, [ - "getSuccessorRegions", "getEntrySuccessorOperands"]>, + "getEntrySuccessorOperands"]>, SingleBlockImplicitTerminator<"::mlir::transform::YieldOp"> ]> { let summary = "Executes the body for each element of the payload"; @@ -1237,7 +1237,7 @@ def SelectOp : TransformDialectOp<"select", def SequenceOp : TransformDialectOp<"sequence", [DeclareOpInterfaceMethods<RegionBranchOpInterface, - ["getEntrySuccessorOperands", "getSuccessorRegions", + ["getEntrySuccessorOperands", "getRegionInvocationBounds"]>, MatchOpInterface, DeclareOpInterfaceMethods<TransformOpInterface>, diff --git a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td index d095659..4079848 100644 --- a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td +++ b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td @@ -63,7 +63,7 @@ def KnobOp : Op<Transform_Dialect, "tune.knob", [ def AlternativesOp : Op<Transform_Dialect, "tune.alternatives", [ DeclareOpInterfaceMethods<RegionBranchOpInterface, - ["getEntrySuccessorOperands", "getSuccessorRegions", + ["getEntrySuccessorOperands", "getRegionInvocationBounds"]>, DeclareOpInterfaceMethods<TransformOpInterface>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h index 7ff718a..a0a99f4 100644 --- a/mlir/include/mlir/IR/Diagnostics.h +++ b/mlir/include/mlir/IR/Diagnostics.h @@ -29,6 +29,7 @@ class MLIRContext; class Operation; class OperationName; class OpPrintingFlags; +class OpWithFlags; class Type; class Value; @@ -199,6 +200,7 @@ public: /// Stream in an Operation. Diagnostic &operator<<(Operation &op); + Diagnostic &operator<<(OpWithFlags op); Diagnostic &operator<<(Operation *op) { return *this << *op; } /// Append an operation with the given printing flags. 
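For orientation, a minimal sketch of how the `OpWithFlags` wrapper wired up in this patch (the `Diagnostic::operator<<(OpWithFlags)` overload above and the `getOperation()` accessor below) is meant to be used from diagnostics and debug logging; the function name, the `DEBUG_TYPE` value, and the remark text are placeholders, not part of the patch:

    #define DEBUG_TYPE "my-pass" // placeholder debug category for LDBG()
    #include "mlir/IR/Diagnostics.h"
    #include "mlir/IR/Operation.h"
    #include "llvm/Support/DebugLog.h"

    // Print an operation without its nested regions, both into a diagnostic
    // (via the new Diagnostic overload) and into the LDBG() debug stream,
    // mirroring the usage added throughout this patch.
    static void logWithoutRegions(mlir::Operation *op) {
      mlir::OpWithFlags skipBody(op, mlir::OpPrintingFlags().skipRegions());
      op->emitRemark() << "visiting " << skipBody;
      LDBG() << "visiting " << skipBody;
    }
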
Diagnostic &appendOp(Operation &op, const OpPrintingFlags &flags); diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h index 5569392c..b201957 100644 --- a/mlir/include/mlir/IR/Operation.h +++ b/mlir/include/mlir/IR/Operation.h @@ -1114,6 +1114,7 @@ public: : op(op), theFlags(flags) {} OpPrintingFlags &flags() { return theFlags; } const OpPrintingFlags &flags() const { return theFlags; } + Operation *getOperation() const { return op; } private: Operation *op; diff --git a/mlir/include/mlir/IR/Region.h b/mlir/include/mlir/IR/Region.h index 1fcb316..53d461d 100644 --- a/mlir/include/mlir/IR/Region.h +++ b/mlir/include/mlir/IR/Region.h @@ -379,6 +379,8 @@ private: friend RangeBaseT; }; +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, Region ®ion); + } // namespace mlir #endif // MLIR_IR_REGION_H diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h index d63800c..47afd25 100644 --- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h +++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h @@ -15,10 +15,16 @@ #define MLIR_INTERFACES_CONTROLFLOWINTERFACES_H #include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Operation.h" +#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/DebugLog.h" +#include "llvm/Support/raw_ostream.h" namespace mlir { class BranchOpInterface; class RegionBranchOpInterface; +class RegionBranchTerminatorOpInterface; /// This class models how operands are forwarded to block arguments in control /// flow. It consists of a number, denoting how many of the successors block @@ -186,27 +192,40 @@ class RegionSuccessor { public: /// Initialize a successor that branches to another region of the parent /// operation. + /// TODO: the default value for the regionInputs is somehow broken. + /// A region successor should have its input correctly set. RegionSuccessor(Region *region, Block::BlockArgListType regionInputs = {}) - : region(region), inputs(regionInputs) {} + : successor(region), inputs(regionInputs) { + assert(region && "Region must not be null"); + } /// Initialize a successor that branches back to/out of the parent operation. - RegionSuccessor(Operation::result_range results) - : inputs(ValueRange(results)) {} - /// Constructor with no arguments. - RegionSuccessor() : inputs(ValueRange()) {} + /// The target must be one of the recursive parent operations. + RegionSuccessor(Operation *successorOp, Operation::result_range results) + : successor(successorOp), inputs(ValueRange(results)) { + assert(successorOp && "Successor op must not be null"); + } /// Return the given region successor. Returns nullptr if the successor is the /// parent operation. - Region *getSuccessor() const { return region; } + Region *getSuccessor() const { return dyn_cast<Region *>(successor); } /// Return true if the successor is the parent operation. - bool isParent() const { return region == nullptr; } + bool isParent() const { return isa<Operation *>(successor); } /// Return the inputs to the successor that are remapped by the exit values of /// the current region. 
ValueRange getSuccessorInputs() const { return inputs; } + bool operator==(RegionSuccessor rhs) const { + return successor == rhs.successor && inputs == rhs.inputs; + } + + friend bool operator!=(RegionSuccessor lhs, RegionSuccessor rhs) { + return !(lhs == rhs); + } + private: - Region *region{nullptr}; + llvm::PointerUnion<Region *, Operation *> successor{nullptr}; ValueRange inputs; }; @@ -214,64 +233,67 @@ private: /// `RegionBranchOpInterface`. /// One can branch from one of two kinds of places: /// * The parent operation (aka the `RegionBranchOpInterface` implementation) -/// * A region within the parent operation. +/// * A RegionBranchTerminatorOpInterface inside a region within the parent +// operation. class RegionBranchPoint { public: /// Returns an instance of `RegionBranchPoint` representing the parent /// operation. static constexpr RegionBranchPoint parent() { return RegionBranchPoint(); } - /// Creates a `RegionBranchPoint` that branches from the given region. - /// The pointer must not be null. - RegionBranchPoint(Region *region) : maybeRegion(region) { - assert(region && "Region must not be null"); - } - - RegionBranchPoint(Region ®ion) : RegionBranchPoint(®ion) {} + /// Creates a `RegionBranchPoint` that branches from the given terminator. + inline RegionBranchPoint(RegionBranchTerminatorOpInterface predecessor); /// Explicitly stops users from constructing with `nullptr`. RegionBranchPoint(std::nullptr_t) = delete; - /// Constructs a `RegionBranchPoint` from the the target of a - /// `RegionSuccessor` instance. - RegionBranchPoint(RegionSuccessor successor) { - if (successor.isParent()) - maybeRegion = nullptr; - else - maybeRegion = successor.getSuccessor(); - } - - /// Assigns a region being branched from. - RegionBranchPoint &operator=(Region ®ion) { - maybeRegion = ®ion; - return *this; - } - /// Returns true if branching from the parent op. - bool isParent() const { return maybeRegion == nullptr; } + bool isParent() const { return predecessor == nullptr; } - /// Returns the region if branching from a region. + /// Returns the terminator if branching from a region. /// A null pointer otherwise. - Region *getRegionOrNull() const { return maybeRegion; } + Operation *getTerminatorPredecessorOrNull() const { return predecessor; } /// Returns true if the two branch points are equal. friend bool operator==(RegionBranchPoint lhs, RegionBranchPoint rhs) { - return lhs.maybeRegion == rhs.maybeRegion; + return lhs.predecessor == rhs.predecessor; } private: // Private constructor to encourage the use of `RegionBranchPoint::parent`. - constexpr RegionBranchPoint() : maybeRegion(nullptr) {} + constexpr RegionBranchPoint() = default; /// Internal encoding. Uses nullptr for representing branching from the parent - /// op and the region being branched from otherwise. - Region *maybeRegion; + /// op and the region terminator being branched from otherwise. 
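To make the new encoding concrete, a hedged sketch of how `RegionSuccessor` and `RegionBranchPoint` are constructed after this change, modeled on the scf/affine/async call sites updated later in the patch; `scf::ForOp` and the helper name are only familiar stand-ins for any `RegionBranchOpInterface` op:

    #include "mlir/Dialect/SCF/IR/SCF.h"
    #include "mlir/Interfaces/ControlFlowInterfaces.h"
    #include <cassert>
    using namespace mlir;

    static void sketchSuccessorEncoding(
        scf::ForOp loopOp, RegionBranchTerminatorOpInterface bodyTerminator) {
      // Successor that (re-)enters the loop body, forwarding iter_args to the
      // entry block arguments.
      RegionSuccessor intoBody(&loopOp.getRegion(), loopOp.getRegionIterArgs());
      // Successor that leaves the op: the parent is now named explicitly
      // together with its results (the result-only and default constructors
      // are gone).
      RegionSuccessor toParent(loopOp.getOperation(), loopOp->getResults());
      // A branch point is either the parent op...
      RegionBranchPoint fromParent = RegionBranchPoint::parent();
      // ...or a concrete terminator inside one of the nested regions; it is no
      // longer just the region being branched from.
      RegionBranchPoint fromBody(bodyTerminator);
      assert(!fromBody.isParent() &&
             fromBody.getTerminatorPredecessorOrNull()->getParentRegion() ==
                 &loopOp.getRegion());
      (void)intoBody;
      (void)toParent;
      (void)fromParent;
    }
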
+ Operation *predecessor = nullptr; }; inline bool operator!=(RegionBranchPoint lhs, RegionBranchPoint rhs) { return !(lhs == rhs); } +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &os, + RegionBranchPoint point) { + if (point.isParent()) + return os << "<from parent>"; + return os << "<region #" + << point.getTerminatorPredecessorOrNull() + ->getParentRegion() + ->getRegionNumber() + << ", terminator " + << OpWithFlags(point.getTerminatorPredecessorOrNull(), + OpPrintingFlags().skipRegions()) + << ">"; +} + +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &os, + RegionSuccessor successor) { + if (successor.isParent()) + return os << "<to parent>"; + return os << "<to region #" << successor.getSuccessor()->getRegionNumber() + << " with " << successor.getSuccessorInputs().size() << " inputs>"; +} + /// This class represents upper and lower bounds on the number of times a region /// of a `RegionBranchOpInterface` can be invoked. The lower bound is at least /// zero, but the upper bound may not be known. @@ -348,4 +370,10 @@ struct ReturnLike : public TraitBase<ConcreteType, ReturnLike> { /// Include the generated interface declarations. #include "mlir/Interfaces/ControlFlowInterfaces.h.inc" +namespace mlir { +inline RegionBranchPoint::RegionBranchPoint( + RegionBranchTerminatorOpInterface predecessor) + : predecessor(predecessor.getOperation()) {} +} // namespace mlir + #endif // MLIR_INTERFACES_CONTROLFLOWINTERFACES_H diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td index b8d08cc..94242e3 100644 --- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td +++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td @@ -117,7 +117,7 @@ def BranchOpInterface : OpInterface<"BranchOpInterface"> { def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { let description = [{ - This interface provides information for region operations that exhibit + This interface provides information for region-holding operations that exhibit branching behavior between held regions. I.e., this interface allows for expressing control flow information for region holding operations. @@ -126,12 +126,12 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { be side-effect free. A "region branch point" indicates a point from which a branch originates. It - can indicate either a region of this op or `RegionBranchPoint::parent()`. In - the latter case, the branch originates from outside of the op, i.e., when - first executing this op. + can indicate either a terminator in any of the immediately nested region of + this op or `RegionBranchPoint::parent()`. In the latter case, the branch + originates from outside of the op, i.e., when first executing this op. A "region successor" indicates the target of a branch. It can indicate - either a region of this op or this op. In the former case, the region + either a region of this op or this op itself. In the former case, the region successor is a region pointer and a range of block arguments to which the "successor operands" are forwarded to. In the latter case, the control flow leaves this op and the region successor is a range of results of this op to @@ -151,10 +151,10 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { } ``` - `scf.for` has one region. The region has two region successors: the region - itself and the `scf.for` op. %b is an entry successor operand. %c is a - successor operand. %a is a successor block argument. 
%r is a successor - result. + `scf.for` has one region. The `scf.yield` has two region successors: the + region body itself and the `scf.for` op. `%b` is an entry successor + operand. `%c` is a successor operand. `%a` is a successor block argument. + `%r` is a successor result. }]; let cppNamespace = "::mlir"; @@ -162,16 +162,16 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { InterfaceMethod<[{ Returns the operands of this operation that are forwarded to the region successor's block arguments or this operation's results when branching - to `point`. `point` is guaranteed to be among the successors that are + to `successor`. `successor` is guaranteed to be among the successors that are returned by `getEntrySuccessorRegions`/`getSuccessorRegions(parent())`. Example: In the above example, this method returns the operand %b of the - `scf.for` op, regardless of the value of `point`. I.e., this op always + `scf.for` op, regardless of the value of `successor`. I.e., this op always forwards the same operands, regardless of whether the loop has 0 or more iterations. }], "::mlir::OperandRange", "getEntrySuccessorOperands", - (ins "::mlir::RegionBranchPoint":$point), [{}], + (ins "::mlir::RegionSuccessor":$successor), [{}], /*defaultImplementation=*/[{ auto operandEnd = this->getOperation()->operand_end(); return ::mlir::OperandRange(operandEnd, operandEnd); @@ -225,6 +225,80 @@ def RegionBranchOpInterface : OpInterface<"RegionBranchOpInterface"> { "::llvm::SmallVectorImpl<::mlir::RegionSuccessor> &":$regions) >, InterfaceMethod<[{ + Returns the potential region successors when branching from any + terminator in `region`. + These are the regions that may be selected during the flow of control. + }], + "void", "getSuccessorRegions", + (ins "::mlir::Region&":$region, + "::llvm::SmallVectorImpl<::mlir::RegionSuccessor> &":$regions), + [{}], + /*defaultImplementation=*/[{ + for (::mlir::Block &block : region) { + if (block.empty()) + continue; + if (auto terminator = + dyn_cast<RegionBranchTerminatorOpInterface>(block.back())) + $_op.getSuccessorRegions(RegionBranchPoint(terminator), + regions); + } + }]>, + InterfaceMethod<[{ + Returns the potential branching point (predecessors) for a given successor. + }], + "void", "getPredecessors", + (ins "::mlir::RegionSuccessor":$successor, + "::llvm::SmallVectorImpl<::mlir::RegionBranchPoint> &":$predecessors), + [{}], + /*defaultImplementation=*/[{ + ::llvm::SmallVector<::mlir::RegionSuccessor> successors; + $_op.getSuccessorRegions(RegionBranchPoint::parent(), + successors); + if (llvm::any_of(successors, [&] (const RegionSuccessor & succ) { + return succ.getSuccessor() == successor.getSuccessor() || + (succ.isParent() && successor.isParent()); + })) + predecessors.push_back(RegionBranchPoint::parent()); + for (Region ®ion : $_op->getRegions()) { + for (::mlir::Block &block : region) { + if (block.empty()) + continue; + if (auto terminator = + dyn_cast<RegionBranchTerminatorOpInterface>(block.back())) { + ::llvm::SmallVector<::mlir::RegionSuccessor> successors; + $_op.getSuccessorRegions(RegionBranchPoint(terminator), + successors); + if (llvm::any_of(successors, [&] (const RegionSuccessor & succ) { + return succ.getSuccessor() == successor.getSuccessor() || + (succ.isParent() && successor.isParent()); + })) + predecessors.push_back(terminator); + } + } + } + }]>, + InterfaceMethod<[{ + Returns the potential values across all (predecessors) for a given successor + input, modeled by its index (its position in the list of values). 
+ }], + "void", "getPredecessorValues", + (ins "::mlir::RegionSuccessor":$successor, + "int":$index, + "::llvm::SmallVectorImpl<::mlir::Value> &":$predecessorValues), + [{}], + /*defaultImplementation=*/[{ + ::llvm::SmallVector<::mlir::RegionBranchPoint> predecessors; + $_op.getPredecessors(successor, predecessors); + for (auto predecessor : predecessors) { + if (predecessor.isParent()) { + predecessorValues.push_back($_op.getEntrySuccessorOperands(successor)[index]); + continue; + } + auto terminator = cast<RegionBranchTerminatorOpInterface>(predecessor.getTerminatorPredecessorOrNull()); + predecessorValues.push_back(terminator.getSuccessorOperands(successor)[index]); + } + }]>, + InterfaceMethod<[{ Populates `invocationBounds` with the minimum and maximum number of times this operation will invoke the attached regions (assuming the regions yield normally, i.e. do not abort or invoke an infinite loop). @@ -298,7 +372,7 @@ def RegionBranchTerminatorOpInterface : passing them to the region successor indicated by `point`. }], "::mlir::MutableOperandRange", "getMutableSuccessorOperands", - (ins "::mlir::RegionBranchPoint":$point) + (ins "::mlir::RegionSuccessor":$point) >, InterfaceMethod<[{ Returns the potential region successors that are branched to after this @@ -317,7 +391,7 @@ def RegionBranchTerminatorOpInterface : /*defaultImplementation=*/[{ ::mlir::Operation *op = $_op; ::llvm::cast<::mlir::RegionBranchOpInterface>(op->getParentOp()) - .getSuccessorRegions(op->getParentRegion(), regions); + .getSuccessorRegions(::llvm::cast<::mlir::RegionBranchTerminatorOpInterface>(op), regions); }] >, ]; @@ -337,8 +411,8 @@ def RegionBranchTerminatorOpInterface : // them to the region successor given by `index`. If `index` is None, this // function returns the operands that are passed as a result to the parent // operation. 
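A hedged sketch of how an analysis can use the new predecessor queries declared above, in the spirit of the LocalAliasAnalysis rewrite further down; the helper name `possibleResultSources` is illustrative only:

    #include "mlir/Interfaces/ControlFlowInterfaces.h"
    using namespace mlir;

    /// Collect every value that may flow into result `resultIndex` of `branch`,
    /// whether it comes from the entry edge or from a terminator in one of the
    /// nested regions, relying on the default getPredecessors /
    /// getPredecessorValues implementations declared above.
    static SmallVector<Value>
    possibleResultSources(RegionBranchOpInterface branch, unsigned resultIndex) {
      // The op's own results are modeled as the "parent" successor, now spelled
      // with the op itself plus its result range.
      RegionSuccessor toParent(branch.getOperation(), branch->getResults());
      SmallVector<Value> sources;
      branch.getPredecessorValues(toParent, /*index=*/resultIndex, sources);
      return sources;
    }
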
- ::mlir::OperandRange getSuccessorOperands(::mlir::RegionBranchPoint point) { - return getMutableSuccessorOperands(point); + ::mlir::OperandRange getSuccessorOperands(::mlir::RegionSuccessor successor) { + return getMutableSuccessorOperands(successor); } }]; } @@ -504,7 +578,7 @@ def ReturnLike : TraitList<[ /*extraOpDeclaration=*/"", /*extraOpDefinition=*/[{ ::mlir::MutableOperandRange $cppClass::getMutableSuccessorOperands( - ::mlir::RegionBranchPoint point) { + ::mlir::RegionSuccessor successor) { return ::mlir::MutableOperandRange(*this); } }] diff --git a/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp b/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp index a84d10d..24cb123 100644 --- a/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp +++ b/mlir/lib/Analysis/AliasAnalysis/LocalAliasAnalysis.cpp @@ -16,19 +16,21 @@ #include "mlir/IR/Operation.h" #include "mlir/IR/Region.h" #include "mlir/IR/Value.h" -#include "mlir/IR/ValueRange.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" #include "mlir/Support/LLVM.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/DebugLog.h" #include <cassert> #include <optional> #include <utility> using namespace mlir; +#define DEBUG_TYPE "local-alias-analysis" + //===----------------------------------------------------------------------===// // Underlying Address Computation //===----------------------------------------------------------------------===// @@ -42,81 +44,47 @@ static void collectUnderlyingAddressValues(Value value, unsigned maxDepth, DenseSet<Value> &visited, SmallVectorImpl<Value> &output); -/// Given a successor (`region`) of a RegionBranchOpInterface, collect all of -/// the underlying values being addressed by one of the successor inputs. If the -/// provided `region` is null, as per `RegionBranchOpInterface` this represents -/// the parent operation. -static void collectUnderlyingAddressValues(RegionBranchOpInterface branch, - Region *region, Value inputValue, - unsigned inputIndex, - unsigned maxDepth, - DenseSet<Value> &visited, - SmallVectorImpl<Value> &output) { - // Given the index of a region of the branch (`predIndex`), or std::nullopt to - // represent the parent operation, try to return the index into the outputs of - // this region predecessor that correspond to the input values of `region`. If - // an index could not be found, std::nullopt is returned instead. - auto getOperandIndexIfPred = - [&](RegionBranchPoint pred) -> std::optional<unsigned> { - SmallVector<RegionSuccessor, 2> successors; - branch.getSuccessorRegions(pred, successors); - for (RegionSuccessor &successor : successors) { - if (successor.getSuccessor() != region) - continue; - // Check that the successor inputs map to the given input value. 
- ValueRange inputs = successor.getSuccessorInputs(); - if (inputs.empty()) { - output.push_back(inputValue); - break; - } - unsigned firstInputIndex, lastInputIndex; - if (region) { - firstInputIndex = cast<BlockArgument>(inputs[0]).getArgNumber(); - lastInputIndex = cast<BlockArgument>(inputs.back()).getArgNumber(); - } else { - firstInputIndex = cast<OpResult>(inputs[0]).getResultNumber(); - lastInputIndex = cast<OpResult>(inputs.back()).getResultNumber(); - } - if (firstInputIndex > inputIndex || lastInputIndex < inputIndex) { - output.push_back(inputValue); - break; - } - return inputIndex - firstInputIndex; - } - return std::nullopt; - }; - - // Check branches from the parent operation. - auto branchPoint = RegionBranchPoint::parent(); - if (region) - branchPoint = region; - - if (std::optional<unsigned> operandIndex = - getOperandIndexIfPred(/*predIndex=*/RegionBranchPoint::parent())) { - collectUnderlyingAddressValues( - branch.getEntrySuccessorOperands(branchPoint)[*operandIndex], maxDepth, - visited, output); +/// Given a RegionBranchOpInterface operation (`branch`), a Value`inputValue` +/// which is an input for the provided successor (`initialSuccessor`), try to +/// find the possible sources for the value along the control flow edges. +static void collectUnderlyingAddressValues2( + RegionBranchOpInterface branch, RegionSuccessor initialSuccessor, + Value inputValue, unsigned inputIndex, unsigned maxDepth, + DenseSet<Value> &visited, SmallVectorImpl<Value> &output) { + LDBG() << "collectUnderlyingAddressValues2: " + << OpWithFlags(branch.getOperation(), OpPrintingFlags().skipRegions()); + LDBG() << " with initialSuccessor " << initialSuccessor; + LDBG() << " inputValue: " << inputValue; + LDBG() << " inputIndex: " << inputIndex; + LDBG() << " maxDepth: " << maxDepth; + ValueRange inputs = initialSuccessor.getSuccessorInputs(); + if (inputs.empty()) { + LDBG() << " input is empty, enqueue value"; + output.push_back(inputValue); + return; } - // Check branches from each child region. - Operation *op = branch.getOperation(); - for (Region ®ion : op->getRegions()) { - if (std::optional<unsigned> operandIndex = getOperandIndexIfPred(region)) { - for (Block &block : region) { - // Try to determine possible region-branch successor operands for the - // current region. - if (auto term = dyn_cast<RegionBranchTerminatorOpInterface>( - block.getTerminator())) { - collectUnderlyingAddressValues( - term.getSuccessorOperands(branchPoint)[*operandIndex], maxDepth, - visited, output); - } else if (block.getNumSuccessors()) { - // Otherwise, if this terminator may exit the region we can't make - // any assumptions about which values get passed. - output.push_back(inputValue); - return; - } - } - } + unsigned firstInputIndex, lastInputIndex; + if (isa<BlockArgument>(inputs[0])) { + firstInputIndex = cast<BlockArgument>(inputs[0]).getArgNumber(); + lastInputIndex = cast<BlockArgument>(inputs.back()).getArgNumber(); + } else { + firstInputIndex = cast<OpResult>(inputs[0]).getResultNumber(); + lastInputIndex = cast<OpResult>(inputs.back()).getResultNumber(); + } + if (firstInputIndex > inputIndex || lastInputIndex < inputIndex) { + LDBG() << " !! 
Input index " << inputIndex << " out of range " + << firstInputIndex << " to " << lastInputIndex + << ", adding input value to output"; + output.push_back(inputValue); + return; + } + SmallVector<Value> predecessorValues; + branch.getPredecessorValues(initialSuccessor, inputIndex - firstInputIndex, + predecessorValues); + LDBG() << " Found " << predecessorValues.size() << " predecessor values"; + for (Value predecessorValue : predecessorValues) { + LDBG() << " Processing predecessor value: " << predecessorValue; + collectUnderlyingAddressValues(predecessorValue, maxDepth, visited, output); } } @@ -124,22 +92,28 @@ static void collectUnderlyingAddressValues(RegionBranchOpInterface branch, static void collectUnderlyingAddressValues(OpResult result, unsigned maxDepth, DenseSet<Value> &visited, SmallVectorImpl<Value> &output) { + LDBG() << "collectUnderlyingAddressValues (OpResult): " << result; + LDBG() << " maxDepth: " << maxDepth; + Operation *op = result.getOwner(); // If this is a view, unwrap to the source. if (ViewLikeOpInterface view = dyn_cast<ViewLikeOpInterface>(op)) { if (result == view.getViewDest()) { + LDBG() << " Unwrapping view to source: " << view.getViewSource(); return collectUnderlyingAddressValues(view.getViewSource(), maxDepth, visited, output); } } // Check to see if we can reason about the control flow of this op. if (auto branch = dyn_cast<RegionBranchOpInterface>(op)) { - return collectUnderlyingAddressValues(branch, /*region=*/nullptr, result, - result.getResultNumber(), maxDepth, - visited, output); + LDBG() << " Processing region branch operation"; + return collectUnderlyingAddressValues2( + branch, RegionSuccessor(op, op->getResults()), result, + result.getResultNumber(), maxDepth, visited, output); } + LDBG() << " Adding result to output: " << result; output.push_back(result); } @@ -148,14 +122,23 @@ static void collectUnderlyingAddressValues(OpResult result, unsigned maxDepth, static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth, DenseSet<Value> &visited, SmallVectorImpl<Value> &output) { + LDBG() << "collectUnderlyingAddressValues (BlockArgument): " << arg; + LDBG() << " maxDepth: " << maxDepth; + LDBG() << " argNumber: " << arg.getArgNumber(); + LDBG() << " isEntryBlock: " << arg.getOwner()->isEntryBlock(); + Block *block = arg.getOwner(); unsigned argNumber = arg.getArgNumber(); // Handle the case of a non-entry block. if (!block->isEntryBlock()) { + LDBG() << " Processing non-entry block with " + << std::distance(block->pred_begin(), block->pred_end()) + << " predecessors"; for (auto it = block->pred_begin(), e = block->pred_end(); it != e; ++it) { auto branch = dyn_cast<BranchOpInterface>((*it)->getTerminator()); if (!branch) { + LDBG() << " Cannot analyze control flow, adding argument to output"; // We can't analyze the control flow, so bail out early. output.push_back(arg); return; @@ -165,10 +148,12 @@ static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth, unsigned index = it.getSuccessorIndex(); Value operand = branch.getSuccessorOperands(index)[argNumber]; if (!operand) { + LDBG() << " No operand found for argument, adding to output"; // We can't analyze the control flow, so bail out early. 
output.push_back(arg); return; } + LDBG() << " Processing operand from predecessor: " << operand; collectUnderlyingAddressValues(operand, maxDepth, visited, output); } return; @@ -178,10 +163,35 @@ static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth, Region *region = block->getParent(); Operation *op = region->getParentOp(); if (auto branch = dyn_cast<RegionBranchOpInterface>(op)) { - return collectUnderlyingAddressValues(branch, region, arg, argNumber, - maxDepth, visited, output); + LDBG() << " Processing region branch operation for entry block"; + // We have to find the successor matching the region, so that the input + // arguments are correctly set. + // TODO: this isn't comprehensive: the successor may not be reachable from + // the entry block. + SmallVector<RegionSuccessor> successors; + branch.getSuccessorRegions(RegionBranchPoint::parent(), successors); + RegionSuccessor regionSuccessor(region); + bool found = false; + for (RegionSuccessor &successor : successors) { + if (successor.getSuccessor() == region) { + LDBG() << " Found matching region successor: " << successor; + found = true; + regionSuccessor = successor; + break; + } + } + if (!found) { + LDBG() + << " No matching region successor found, adding argument to output"; + output.push_back(arg); + return; + } + return collectUnderlyingAddressValues2( + branch, regionSuccessor, arg, argNumber, maxDepth, visited, output); } + LDBG() + << " Cannot reason about underlying address, adding argument to output"; // We can't reason about the underlying address of this argument. output.push_back(arg); } @@ -190,17 +200,26 @@ static void collectUnderlyingAddressValues(BlockArgument arg, unsigned maxDepth, static void collectUnderlyingAddressValues(Value value, unsigned maxDepth, DenseSet<Value> &visited, SmallVectorImpl<Value> &output) { + LDBG() << "collectUnderlyingAddressValues: " << value; + LDBG() << " maxDepth: " << maxDepth; + // Check that we don't infinitely recurse. - if (!visited.insert(value).second) + if (!visited.insert(value).second) { + LDBG() << " Value already visited, skipping"; return; + } if (maxDepth == 0) { + LDBG() << " Max depth reached, adding value to output"; output.push_back(value); return; } --maxDepth; - if (BlockArgument arg = dyn_cast<BlockArgument>(value)) + if (BlockArgument arg = dyn_cast<BlockArgument>(value)) { + LDBG() << " Processing as BlockArgument"; return collectUnderlyingAddressValues(arg, maxDepth, visited, output); + } + LDBG() << " Processing as OpResult"; collectUnderlyingAddressValues(cast<OpResult>(value), maxDepth, visited, output); } @@ -208,9 +227,11 @@ static void collectUnderlyingAddressValues(Value value, unsigned maxDepth, /// Given a value, collect all of the underlying values being addressed. static void collectUnderlyingAddressValues(Value value, SmallVectorImpl<Value> &output) { + LDBG() << "collectUnderlyingAddressValues: " << value; DenseSet<Value> visited; collectUnderlyingAddressValues(value, maxUnderlyingValueSearchDepth, visited, output); + LDBG() << " Collected " << output.size() << " underlying values"; } //===----------------------------------------------------------------------===// @@ -227,19 +248,33 @@ static LogicalResult getAllocEffectFor(Value value, std::optional<MemoryEffects::EffectInstance> &effect, Operation *&allocScopeOp) { + LDBG() << "getAllocEffectFor: " << value; + // Try to get a memory effect interface for the parent operation. 
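As a standalone illustration of the effect query that `getAllocEffectFor` performs (only the logging is new here, the query itself is pre-existing); the helper `definesFreshAllocation` is hypothetical and not part of the patch:

    #include "mlir/IR/Value.h"
    #include "mlir/Interfaces/SideEffectInterfaces.h"
    using namespace mlir;

    /// Return true if the operation that produces `value` (or, for a block
    /// argument, the parent op of its block) reports an Allocate effect on it.
    static bool definesFreshAllocation(Value value) {
      Operation *op = isa<BlockArgument>(value)
                          ? cast<BlockArgument>(value).getOwner()->getParentOp()
                          : cast<OpResult>(value).getOwner();
      if (!op)
        return false;
      auto iface = dyn_cast<MemoryEffectOpInterface>(op);
      if (!iface)
        return false;
      return iface.getEffectOnValue<MemoryEffects::Allocate>(value).has_value();
    }
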
Operation *op; - if (BlockArgument arg = dyn_cast<BlockArgument>(value)) + if (BlockArgument arg = dyn_cast<BlockArgument>(value)) { op = arg.getOwner()->getParentOp(); - else + LDBG() << " BlockArgument, parent op: " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); + } else { op = cast<OpResult>(value).getOwner(); + LDBG() << " OpResult, owner op: " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); + } + MemoryEffectOpInterface interface = dyn_cast<MemoryEffectOpInterface>(op); - if (!interface) + if (!interface) { + LDBG() << " No memory effect interface found"; return failure(); + } // Try to find an allocation effect on the resource. - if (!(effect = interface.getEffectOnValue<MemoryEffects::Allocate>(value))) + if (!(effect = interface.getEffectOnValue<MemoryEffects::Allocate>(value))) { + LDBG() << " No allocation effect found on value"; return failure(); + } + + LDBG() << " Found allocation effect"; // If we found an allocation effect, try to find a scope for the allocation. // If the resource of this allocation is automatically scoped, find the parent @@ -247,6 +282,12 @@ getAllocEffectFor(Value value, if (llvm::isa<SideEffects::AutomaticAllocationScopeResource>( effect->getResource())) { allocScopeOp = op->getParentWithTrait<OpTrait::AutomaticAllocationScope>(); + if (allocScopeOp) { + LDBG() << " Automatic allocation scope found: " + << OpWithFlags(allocScopeOp, OpPrintingFlags().skipRegions()); + } else { + LDBG() << " Automatic allocation scope found: null"; + } return success(); } @@ -255,6 +296,12 @@ getAllocEffectFor(Value value, // For now assume allocation scope to the function scope (we don't care if // pointer escape outside function). allocScopeOp = op->getParentOfType<FunctionOpInterface>(); + if (allocScopeOp) { + LDBG() << " Function scope found: " + << OpWithFlags(allocScopeOp, OpPrintingFlags().skipRegions()); + } else { + LDBG() << " Function scope found: null"; + } return success(); } @@ -293,33 +340,44 @@ static std::optional<AliasResult> checkDistinctObjects(Value lhs, Value rhs) { /// Given the two values, return their aliasing behavior. AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) { - if (lhs == rhs) + LDBG() << "aliasImpl: " << lhs << " vs " << rhs; + + if (lhs == rhs) { + LDBG() << " Same value, must alias"; return AliasResult::MustAlias; + } + Operation *lhsAllocScope = nullptr, *rhsAllocScope = nullptr; std::optional<MemoryEffects::EffectInstance> lhsAlloc, rhsAlloc; // Handle the case where lhs is a constant. Attribute lhsAttr, rhsAttr; if (matchPattern(lhs, m_Constant(&lhsAttr))) { + LDBG() << " lhs is constant"; // TODO: This is overly conservative. Two matching constants don't // necessarily map to the same address. For example, if the two values // correspond to different symbols that both represent a definition. - if (matchPattern(rhs, m_Constant(&rhsAttr))) + if (matchPattern(rhs, m_Constant(&rhsAttr))) { + LDBG() << " rhs is also constant, may alias"; return AliasResult::MayAlias; + } // Try to find an alloc effect on rhs. If an effect was found we can't // alias, otherwise we might. - return succeeded(getAllocEffectFor(rhs, rhsAlloc, rhsAllocScope)) - ? AliasResult::NoAlias - : AliasResult::MayAlias; + bool rhsHasAlloc = + succeeded(getAllocEffectFor(rhs, rhsAlloc, rhsAllocScope)); + LDBG() << " rhs has alloc effect: " << rhsHasAlloc; + return rhsHasAlloc ? AliasResult::NoAlias : AliasResult::MayAlias; } // Handle the case where rhs is a constant. 
if (matchPattern(rhs, m_Constant(&rhsAttr))) { + LDBG() << " rhs is constant"; // Try to find an alloc effect on lhs. If an effect was found we can't // alias, otherwise we might. - return succeeded(getAllocEffectFor(lhs, lhsAlloc, lhsAllocScope)) - ? AliasResult::NoAlias - : AliasResult::MayAlias; + bool lhsHasAlloc = + succeeded(getAllocEffectFor(lhs, lhsAlloc, lhsAllocScope)); + LDBG() << " lhs has alloc effect: " << lhsHasAlloc; + return lhsHasAlloc ? AliasResult::NoAlias : AliasResult::MayAlias; } if (std::optional<AliasResult> result = checkDistinctObjects(lhs, rhs)) @@ -329,9 +387,14 @@ AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) { // an allocation effect. bool lhsHasAlloc = succeeded(getAllocEffectFor(lhs, lhsAlloc, lhsAllocScope)); bool rhsHasAlloc = succeeded(getAllocEffectFor(rhs, rhsAlloc, rhsAllocScope)); + LDBG() << " lhs has alloc effect: " << lhsHasAlloc; + LDBG() << " rhs has alloc effect: " << rhsHasAlloc; + if (lhsHasAlloc == rhsHasAlloc) { // If both values have an allocation effect we know they don't alias, and if // neither have an effect we can't make an assumptions. + LDBG() << " Both have same alloc status: " + << (lhsHasAlloc ? "NoAlias" : "MayAlias"); return lhsHasAlloc ? AliasResult::NoAlias : AliasResult::MayAlias; } @@ -339,6 +402,7 @@ AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) { // and one without. Move the one with the effect to the lhs to make the next // checks simpler. if (rhsHasAlloc) { + LDBG() << " Swapping lhs and rhs to put alloc effect on lhs"; std::swap(lhs, rhs); lhsAlloc = rhsAlloc; lhsAllocScope = rhsAllocScope; @@ -347,49 +411,74 @@ AliasResult LocalAliasAnalysis::aliasImpl(Value lhs, Value rhs) { // If the effect has a scoped allocation region, check to see if the // non-effect value is defined above that scope. if (lhsAllocScope) { + LDBG() << " Checking allocation scope: " + << OpWithFlags(lhsAllocScope, OpPrintingFlags().skipRegions()); // If the parent operation of rhs is an ancestor of the allocation scope, or // if rhs is an entry block argument of the allocation scope we know the two // values can't alias. Operation *rhsParentOp = rhs.getParentRegion()->getParentOp(); - if (rhsParentOp->isProperAncestor(lhsAllocScope)) + if (rhsParentOp->isProperAncestor(lhsAllocScope)) { + LDBG() << " rhs parent is ancestor of alloc scope, no alias"; return AliasResult::NoAlias; + } if (rhsParentOp == lhsAllocScope) { BlockArgument rhsArg = dyn_cast<BlockArgument>(rhs); - if (rhsArg && rhs.getParentBlock()->isEntryBlock()) + if (rhsArg && rhs.getParentBlock()->isEntryBlock()) { + LDBG() << " rhs is entry block arg of alloc scope, no alias"; return AliasResult::NoAlias; + } } } // If we couldn't reason about the relationship between the two values, // conservatively assume they might alias. + LDBG() << " Cannot reason about relationship, may alias"; return AliasResult::MayAlias; } /// Given the two values, return their aliasing behavior. AliasResult LocalAliasAnalysis::alias(Value lhs, Value rhs) { - if (lhs == rhs) + LDBG() << "alias: " << lhs << " vs " << rhs; + + if (lhs == rhs) { + LDBG() << " Same value, must alias"; return AliasResult::MustAlias; + } // Get the underlying values being addressed. 
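For context, a minimal usage sketch of the two public entry points being instrumented in this file; the driver function `reportAliasing` and the remark text are placeholders, and constructing `LocalAliasAnalysis` directly like this is assumed to be acceptable for a one-off query:

    #include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h"
    using namespace mlir;

    static void reportAliasing(Operation *op, Value a, Value b) {
      LocalAliasAnalysis analysis;
      // alias() first collects the underlying addresses of both values, then
      // merges the pairwise aliasImpl() results.
      AliasResult aliases = analysis.alias(a, b);
      if (aliases.isNo())
        return;
      // Ask how `op` interacts with the memory behind `a` (Mod, Ref, both, or
      // neither).
      ModRefResult modRef = analysis.getModRef(op, a);
      if (modRef.isModAndRef())
        op->emitRemark() << "may read and write a location aliased with the"
                            " other value";
    }
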
SmallVector<Value, 8> lhsValues, rhsValues; collectUnderlyingAddressValues(lhs, lhsValues); collectUnderlyingAddressValues(rhs, rhsValues); + LDBG() << " lhs underlying values: " << lhsValues.size(); + LDBG() << " rhs underlying values: " << rhsValues.size(); + // If we failed to collect for either of the values somehow, conservatively // assume they may alias. - if (lhsValues.empty() || rhsValues.empty()) + if (lhsValues.empty() || rhsValues.empty()) { + LDBG() << " Failed to collect underlying values, may alias"; return AliasResult::MayAlias; + } // Check the alias results against each of the underlying values. std::optional<AliasResult> result; for (Value lhsVal : lhsValues) { for (Value rhsVal : rhsValues) { + LDBG() << " Checking underlying values: " << lhsVal << " vs " << rhsVal; AliasResult nextResult = aliasImpl(lhsVal, rhsVal); + LDBG() << " Result: " + << (nextResult == AliasResult::MustAlias ? "MustAlias" + : nextResult == AliasResult::NoAlias ? "NoAlias" + : "MayAlias"); result = result ? result->merge(nextResult) : nextResult; } } // We should always have a valid result here. + LDBG() << " Final result: " + << (result->isMust() ? "MustAlias" + : result->isNo() ? "NoAlias" + : "MayAlias"); return *result; } @@ -398,8 +487,12 @@ AliasResult LocalAliasAnalysis::alias(Value lhs, Value rhs) { //===----------------------------------------------------------------------===// ModRefResult LocalAliasAnalysis::getModRef(Operation *op, Value location) { + LDBG() << "getModRef: " << OpWithFlags(op, OpPrintingFlags().skipRegions()) + << " on location " << location; + // Check to see if this operation relies on nested side effects. if (op->hasTrait<OpTrait::HasRecursiveMemoryEffects>()) { + LDBG() << " Operation has recursive memory effects, returning ModAndRef"; // TODO: To check recursive operations we need to check all of the nested // operations, which can result in a quadratic number of queries. We should // introduce some caching of some kind to help alleviate this, especially as @@ -410,38 +503,64 @@ ModRefResult LocalAliasAnalysis::getModRef(Operation *op, Value location) { // Otherwise, check to see if this operation has a memory effect interface. MemoryEffectOpInterface interface = dyn_cast<MemoryEffectOpInterface>(op); - if (!interface) + if (!interface) { + LDBG() << " No memory effect interface, returning ModAndRef"; return ModRefResult::getModAndRef(); + } // Build a ModRefResult by merging the behavior of the effects of this // operation. SmallVector<MemoryEffects::EffectInstance> effects; interface.getEffects(effects); + LDBG() << " Found " << effects.size() << " memory effects"; ModRefResult result = ModRefResult::getNoModRef(); for (const MemoryEffects::EffectInstance &effect : effects) { - if (isa<MemoryEffects::Allocate, MemoryEffects::Free>(effect.getEffect())) + if (isa<MemoryEffects::Allocate, MemoryEffects::Free>(effect.getEffect())) { + LDBG() << " Skipping alloc/free effect"; continue; + } // Check for an alias between the effect and our memory location. // TODO: Add support for checking an alias with a symbol reference. AliasResult aliasResult = AliasResult::MayAlias; - if (Value effectValue = effect.getValue()) + if (Value effectValue = effect.getValue()) { + LDBG() << " Checking alias between effect value " << effectValue + << " and location " << location; aliasResult = alias(effectValue, location); + LDBG() << " Alias result: " + << (aliasResult.isMust() ? "MustAlias" + : aliasResult.isNo() ? 
"NoAlias" + : "MayAlias"); + } else { + LDBG() << " No effect value, assuming MayAlias"; + } // If we don't alias, ignore this effect. - if (aliasResult.isNo()) + if (aliasResult.isNo()) { + LDBG() << " No alias, ignoring effect"; continue; + } // Merge in the corresponding mod or ref for this effect. if (isa<MemoryEffects::Read>(effect.getEffect())) { + LDBG() << " Adding Ref to result"; result = result.merge(ModRefResult::getRef()); } else { assert(isa<MemoryEffects::Write>(effect.getEffect())); + LDBG() << " Adding Mod to result"; result = result.merge(ModRefResult::getMod()); } - if (result.isModAndRef()) + if (result.isModAndRef()) { + LDBG() << " Result is now ModAndRef, breaking"; break; + } } + + LDBG() << " Final ModRef result: " + << (result.isModAndRef() ? "ModAndRef" + : result.isMod() ? "Mod" + : result.isRef() ? "Ref" + : "NoModRef"); return result; } diff --git a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp index 377f7eb..0fc5b44 100644 --- a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp @@ -501,11 +501,10 @@ void DeadCodeAnalysis::visitRegionTerminator(Operation *op, return; SmallVector<RegionSuccessor> successors; - if (auto terminator = dyn_cast<RegionBranchTerminatorOpInterface>(op)) - terminator.getSuccessorRegions(*operands, successors); - else - branch.getSuccessorRegions(op->getParentRegion(), successors); - + auto terminator = dyn_cast<RegionBranchTerminatorOpInterface>(op); + if (!terminator) + return; + terminator.getSuccessorRegions(*operands, successors); visitRegionBranchEdges(branch, op, successors); } diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp index daa3db5..0682e5f 100644 --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -588,7 +588,9 @@ void AbstractDenseBackwardDataFlowAnalysis::visitBlock(Block *block) { // flow, propagate the lattice back along the control flow edge. if (auto branch = dyn_cast<RegionBranchOpInterface>(block->getParentOp())) { LDBG() << " Exit block of region branch operation"; - visitRegionBranchOperation(point, branch, block->getParent(), before); + auto terminator = + cast<RegionBranchTerminatorOpInterface>(block->getTerminator()); + visitRegionBranchOperation(point, branch, terminator, before); return; } diff --git a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp index 0d2e2ed..8e63ae8 100644 --- a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp @@ -130,7 +130,7 @@ AbstractSparseForwardDataFlowAnalysis::visitOperation(Operation *op) { // The results of a region branch operation are determined by control-flow. 
if (auto branch = dyn_cast<RegionBranchOpInterface>(op)) { visitRegionSuccessors(getProgramPointAfter(branch), branch, - /*successor=*/RegionBranchPoint::parent(), + /*successor=*/{branch, branch->getResults()}, resultLattices); return success(); } @@ -279,7 +279,7 @@ void AbstractSparseForwardDataFlowAnalysis::visitCallableOperation( void AbstractSparseForwardDataFlowAnalysis::visitRegionSuccessors( ProgramPoint *point, RegionBranchOpInterface branch, - RegionBranchPoint successor, ArrayRef<AbstractSparseLattice *> lattices) { + RegionSuccessor successor, ArrayRef<AbstractSparseLattice *> lattices) { const auto *predecessors = getOrCreateFor<PredecessorState>(point, point); assert(predecessors->allPredecessorsKnown() && "unexpected unresolved region successors"); @@ -314,7 +314,7 @@ void AbstractSparseForwardDataFlowAnalysis::visitRegionSuccessors( visitNonControlFlowArgumentsImpl( branch, RegionSuccessor( - branch->getResults().slice(firstIndex, inputs.size())), + branch, branch->getResults().slice(firstIndex, inputs.size())), lattices, firstIndex); } else { if (!inputs.empty()) diff --git a/mlir/lib/Analysis/SliceWalk.cpp b/mlir/lib/Analysis/SliceWalk.cpp index 817d71a..863f260 100644 --- a/mlir/lib/Analysis/SliceWalk.cpp +++ b/mlir/lib/Analysis/SliceWalk.cpp @@ -114,7 +114,7 @@ mlir::getControlFlowPredecessors(Value value) { if (!regionOp) return std::nullopt; // Add the control flow predecessor operands to the work list. - RegionSuccessor region(regionOp->getResults()); + RegionSuccessor region(regionOp, regionOp->getResults()); SmallVector<Value> predecessorOperands = getRegionPredecessorOperands( regionOp, region, opResult.getResultNumber()); return predecessorOperands; diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 478b6aa..1eca43d 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -989,21 +989,17 @@ mfmaOpToScaledIntrinsic(ScaledMFMAOp smfma, Chipset chipset) { smfma.getN(), smfma.getK(), 1u, chipset); } -/// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma` -/// if one exists. This includes checking to ensure the intrinsic is supported -/// on the architecture you are compiling for. -static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma, - Chipset chipset) { - auto sourceVectorType = cast<VectorType>(wmma.getSourceA().getType()); - auto sourceBVectorType = cast<VectorType>(wmma.getSourceB().getType()); - auto destVectorType = cast<VectorType>(wmma.getDestC().getType()); - Type elemSourceType = sourceVectorType.getElementType(); - Type elemBSourceType = sourceBVectorType.getElementType(); - Type elemDestType = destVectorType.getElementType(); - - const uint32_t k = wmma.getK(); - +/// Returns the `rocdl` intrinsic corresponding to a WMMA operation `wmma` +/// for RDNA3/4 architectures. +static std::optional<StringRef> +wmmaOpToIntrinsicRDNA(Type elemSourceType, Type elemBSourceType, + Type elemDestType, uint32_t k, bool isRDNA3) { + using fp8 = Float8E4M3FNType; + using bf8 = Float8E5M2Type; + + // Handle k == 16 for RDNA3/4. if (k == 16) { + // Common patterns for RDNA3 and RDNA4. 
if (elemSourceType.isF16() && elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_f16::getOperationName(); if (elemSourceType.isBF16() && elemDestType.isF32()) @@ -1014,39 +1010,160 @@ static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma, return ROCDL::wmma_bf16_16x16x16_bf16::getOperationName(); if (elemSourceType.isInteger(8) && elemDestType.isInteger(32)) return ROCDL::wmma_i32_16x16x16_iu8::getOperationName(); - if (chipset.majorVersion == 11) { + + // RDNA3 specific patterns. + if (isRDNA3) { if (elemSourceType.isInteger(4) && elemDestType.isInteger(32)) return ROCDL::wmma_i32_16x16x16_iu4::getOperationName(); + return std::nullopt; } - } - if (chipset.majorVersion < 12) - return std::nullopt; - // gfx12+ - if (k == 16) { - if (isa<Float8E4M3FNType>(elemSourceType) && - isa<Float8E4M3FNType>(elemBSourceType) && elemDestType.isF32()) + // RDNA4 specific patterns (fp8/bf8). + if (isa<fp8>(elemSourceType) && isa<fp8>(elemBSourceType) && + elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_fp8_fp8::getOperationName(); - if (isa<Float8E4M3FNType>(elemSourceType) && - isa<Float8E5M2Type>(elemBSourceType) && elemDestType.isF32()) + if (isa<fp8>(elemSourceType) && isa<bf8>(elemBSourceType) && + elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_fp8_bf8::getOperationName(); - if (isa<Float8E5M2Type>(elemSourceType) && - isa<Float8E5M2Type>(elemBSourceType) && elemDestType.isF32()) + if (isa<bf8>(elemSourceType) && isa<bf8>(elemBSourceType) && + elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_bf8_bf8::getOperationName(); - if (isa<Float8E5M2Type>(elemSourceType) && - isa<Float8E4M3FNType>(elemBSourceType) && elemDestType.isF32()) + if (isa<bf8>(elemSourceType) && isa<fp8>(elemBSourceType) && + elemDestType.isF32()) return ROCDL::wmma_f32_16x16x16_bf8_fp8::getOperationName(); if (elemSourceType.isInteger(4) && elemDestType.isInteger(32)) return ROCDL::wmma_i32_16x16x16_iu4::getOperationName(); return std::nullopt; } - if (k == 32) { + + // Handle k == 32 for RDNA4. + if (k == 32 && !isRDNA3) { if (elemSourceType.isInteger(4) && elemDestType.isInteger(32)) return ROCDL::wmma_i32_16x16x32_iu4::getOperationName(); + } + + llvm_unreachable("Unsupported k value"); +} + +/// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma` +/// for the gfx1250 architecture. 
+static std::optional<StringRef> wmmaOpToIntrinsicGfx1250(Type elemSourceType, + Type elemBSourceType, + Type elemDestType, + uint32_t k) { + using fp8 = Float8E4M3FNType; + using bf8 = Float8E5M2Type; + + if (k == 4) { + if (elemSourceType.isF32() && elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x4_f32::getOperationName(); + return std::nullopt; } + if (k == 32) { + if (elemSourceType.isF16() && elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x32_f16::getOperationName(); + if (elemSourceType.isBF16() && elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x32_bf16::getOperationName(); + if (elemSourceType.isF16() && elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x32_f16::getOperationName(); + if (elemSourceType.isBF16() && elemDestType.isBF16()) + return ROCDL::wmma_bf16_16x16x32_bf16::getOperationName(); + + return std::nullopt; + } + + if (k == 64) { + if (isa<fp8>(elemSourceType) && isa<fp8>(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x64_fp8_fp8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x64_fp8_fp8::getOperationName(); + } + if (isa<fp8>(elemSourceType) && isa<bf8>(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x64_fp8_bf8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x64_fp8_bf8::getOperationName(); + } + if (isa<bf8>(elemSourceType) && isa<bf8>(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x64_bf8_bf8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x64_bf8_bf8::getOperationName(); + } + if (isa<bf8>(elemSourceType) && isa<fp8>(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x64_bf8_fp8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x64_bf8_fp8::getOperationName(); + } + if (elemSourceType.isInteger(8) && elemDestType.isInteger(32)) + return ROCDL::wmma_i32_16x16x64_iu8::getOperationName(); + + return std::nullopt; + } + + if (k == 128) { + if (isa<fp8>(elemSourceType) && isa<fp8>(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x128_fp8_fp8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x128_fp8_fp8::getOperationName(); + } + if (isa<fp8>(elemSourceType) && isa<bf8>(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x128_fp8_bf8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x128_fp8_bf8::getOperationName(); + } + if (isa<bf8>(elemSourceType) && isa<bf8>(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x128_bf8_bf8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x128_bf8_bf8::getOperationName(); + } + if (isa<bf8>(elemSourceType) && isa<fp8>(elemBSourceType)) { + if (elemDestType.isF32()) + return ROCDL::wmma_f32_16x16x128_bf8_fp8::getOperationName(); + if (elemDestType.isF16()) + return ROCDL::wmma_f16_16x16x128_bf8_fp8::getOperationName(); + } + + return std::nullopt; + } + + llvm_unreachable("Unsupported k value"); +} + +/// Returns the `rocdl` intrinsic corresponding to a WMMA operation `wmma` +/// if one exists. This includes checking to ensure the intrinsic is supported +/// on the architecture you are compiling for. 
+static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma, + Chipset chipset) { + auto sourceVectorType = cast<VectorType>(wmma.getSourceA().getType()); + auto sourceBVectorType = cast<VectorType>(wmma.getSourceB().getType()); + auto destVectorType = cast<VectorType>(wmma.getDestC().getType()); + Type elemSourceType = sourceVectorType.getElementType(); + Type elemBSourceType = sourceBVectorType.getElementType(); + Type elemDestType = destVectorType.getElementType(); + + const uint32_t k = wmma.getK(); + const bool isRDNA3 = chipset.majorVersion == 11; + const bool isRDNA4 = chipset.majorVersion == 12 && chipset.minorVersion == 0; + + // Handle RDNA3 and RDNA4. + if (isRDNA3 || isRDNA4) + return wmmaOpToIntrinsicRDNA(elemSourceType, elemBSourceType, elemDestType, + k, isRDNA3); + + // Handle gfx1250. + if (chipset == Chipset{12, 5, 0}) + return wmmaOpToIntrinsicGfx1250(elemSourceType, elemBSourceType, + elemDestType, k); + llvm_unreachable("unhandled WMMA case"); } diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index 0fe7239..9e46b7d 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -313,25 +313,53 @@ private: struct ExpOpConversion : public OpConversionPattern<complex::ExpOp> { using OpConversionPattern<complex::ExpOp>::OpConversionPattern; + // exp(x+I*y) = exp(x)*(cos(y)+I*sin(y)) + // Handle special cases as StableHLO implementation does: + // 1. When b == 0, set imag(exp(z)) = 0 + // 2. When exp(x) == inf, use exp(x/2)*(cos(y)+I*sin(y))*exp(x/2) LogicalResult matchAndRewrite(complex::ExpOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto type = cast<ComplexType>(adaptor.getComplex().getType()); - auto elementType = cast<FloatType>(type.getElementType()); - arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr(); - - Value real = - complex::ReOp::create(rewriter, loc, elementType, adaptor.getComplex()); - Value imag = - complex::ImOp::create(rewriter, loc, elementType, adaptor.getComplex()); - Value expReal = math::ExpOp::create(rewriter, loc, real, fmf.getValue()); - Value cosImag = math::CosOp::create(rewriter, loc, imag, fmf.getValue()); + auto ET = cast<FloatType>(type.getElementType()); + arith::FastMathFlags fmf = op.getFastMathFlagsAttr().getValue(); + const auto &floatSemantics = ET.getFloatSemantics(); + ImplicitLocOpBuilder b(loc, rewriter); + + Value x = complex::ReOp::create(b, ET, adaptor.getComplex()); + Value y = complex::ImOp::create(b, ET, adaptor.getComplex()); + Value zero = arith::ConstantOp::create(b, ET, b.getZeroAttr(ET)); + Value half = arith::ConstantOp::create(b, ET, b.getFloatAttr(ET, 0.5)); + Value inf = arith::ConstantOp::create( + b, ET, b.getFloatAttr(ET, APFloat::getInf(floatSemantics))); + + Value exp = math::ExpOp::create(b, x, fmf); + Value xHalf = arith::MulFOp::create(b, x, half, fmf); + Value expHalf = math::ExpOp::create(b, xHalf, fmf); + Value cos = math::CosOp::create(b, y, fmf); + Value sin = math::SinOp::create(b, y, fmf); + + Value expIsInf = + arith::CmpFOp::create(b, arith::CmpFPredicate::OEQ, exp, inf, fmf); + Value yIsZero = + arith::CmpFOp::create(b, arith::CmpFPredicate::OEQ, y, zero); + + // Real path: select between exp(x)*cos(y) and exp(x/2)*cos(y)*exp(x/2) + Value realNormal = arith::MulFOp::create(b, exp, cos, fmf); + Value expHalfCos = arith::MulFOp::create(b, expHalf, cos, fmf); + Value 
realOverflow = arith::MulFOp::create(b, expHalfCos, expHalf, fmf); Value resultReal = - arith::MulFOp::create(rewriter, loc, expReal, cosImag, fmf.getValue()); - Value sinImag = math::SinOp::create(rewriter, loc, imag, fmf.getValue()); - Value resultImag = - arith::MulFOp::create(rewriter, loc, expReal, sinImag, fmf.getValue()); + arith::SelectOp::create(b, expIsInf, realOverflow, realNormal); + + // Imaginary part: if y == 0 return 0 else select between exp(x)*sin(y) and + // exp(x/2)*sin(y)*exp(x/2) + Value imagNormal = arith::MulFOp::create(b, exp, sin, fmf); + Value expHalfSin = arith::MulFOp::create(b, expHalf, sin, fmf); + Value imagOverflow = arith::MulFOp::create(b, expHalfSin, expHalf, fmf); + Value imagNonZero = + arith::SelectOp::create(b, expIsInf, imagOverflow, imagNormal); + Value resultImag = arith::SelectOp::create(b, yIsZero, zero, imagNonZero); rewriter.replaceOpWithNewOp<complex::CreateOp>(op, type, resultReal, resultImag); diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 585b6da..df955fc 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -399,13 +399,15 @@ LogicalResult WMMAOp::verify() { if (!sourceAElemType.isFloat(8) && sourceAElemType != sourceBElemType) { return emitOpError( - "source element types much match (except for fp8) but have ") + "source element types must match (except for fp8/bf8) but have ") << sourceAType << " and " << sourceBType; } - if (!sourceAElemType.isInteger(4) && getK() != 16) { - return emitOpError("K dimension must be 16 for source element type ") - << sourceAElemType; + if (isSrcFloat) { + if (getClamp()) + return emitOpError("clamp flag is not supported for float types"); + if (getUnsignedA() || getUnsignedB()) + return emitOpError("unsigned flags are not supported for float types"); } return success(); } diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index e0a53cd..0c35921 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -2716,8 +2716,9 @@ LogicalResult AffineForOp::fold(FoldAdaptor adaptor, return success(folded); } -OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert((point.isParent() || point == getRegion()) && "invalid region point"); +OperandRange AffineForOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert((successor.isParent() || successor.getSuccessor() == &getRegion()) && + "invalid region point"); // The initial operands map to the loop arguments after the induction // variable or are forwarded to the results when the trip count is zero. @@ -2726,34 +2727,41 @@ OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) { void AffineForOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { - assert((point.isParent() || point == getRegion()) && "expected loop region"); + assert((point.isParent() || + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getRegion()) && + "expected loop region"); // The loop may typically branch back to its body or to the parent operation. // If the predecessor is the parent op and the trip count is known to be at // least one, branch into the body using the iterator arguments. And in cases // we know the trip count is zero, it can only branch back to its parent. 
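A small numeric aside on the overflow special case in the `complex.exp` lowering above: when `exp(x)` overflows but `|cos(y)|` (or `|sin(y)|`) is tiny, the mathematically finite product would otherwise become `inf`, while splitting the exponential into two halves keeps it finite. A standalone check with assumed values, in plain C++:

    #include <cmath>
    #include <cstdio>

    int main() {
      // exp(710) overflows double (max ~1.8e308), but the true product with a
      // tiny cosine-like factor is finite.
      double x = 710.0, c = 1e-300;
      std::printf("%g\n", std::exp(x) * c);                       // inf
      std::printf("%g\n", std::exp(x / 2) * c * std::exp(x / 2)); // ~2.2e8
      return 0;
    }
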
std::optional<uint64_t> tripCount = getTrivialConstantTripCount(*this); - if (point.isParent() && tripCount.has_value()) { - if (tripCount.value() > 0) { - regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); - return; - } - if (tripCount.value() == 0) { - regions.push_back(RegionSuccessor(getResults())); - return; + if (tripCount.has_value()) { + if (!point.isParent()) { + // From the loop body, if the trip count is one, we can only branch back + // to the parent. + if (tripCount == 1) { + regions.push_back(RegionSuccessor(getOperation(), getResults())); + return; + } + if (tripCount == 0) + return; + } else { + if (tripCount.value() > 0) { + regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); + return; + } + if (tripCount.value() == 0) { + regions.push_back(RegionSuccessor(getOperation(), getResults())); + return; + } } } - // From the loop body, if the trip count is one, we can only branch back to - // the parent. - if (!point.isParent() && tripCount == 1) { - regions.push_back(RegionSuccessor(getResults())); - return; - } - // In all other cases, the loop may branch back to itself or the parent // operation. regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } AffineBound AffineForOp::getLowerBound() { @@ -3142,7 +3150,7 @@ void AffineIfOp::getSuccessorRegions( RegionSuccessor(&getThenRegion(), getThenRegion().getArguments())); // If the "else" region is empty, branch bach into parent. if (getElseRegion().empty()) { - regions.push_back(getResults()); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } else { regions.push_back( RegionSuccessor(&getElseRegion(), getElseRegion().getArguments())); @@ -3152,7 +3160,7 @@ void AffineIfOp::getSuccessorRegions( // If the predecessor is the `else`/`then` region, then branching into parent // op is valid. - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } LogicalResult AffineIfOp::verify() { diff --git a/mlir/lib/Dialect/Async/IR/Async.cpp b/mlir/lib/Dialect/Async/IR/Async.cpp index dc7b07d..8e4a49d 100644 --- a/mlir/lib/Dialect/Async/IR/Async.cpp +++ b/mlir/lib/Dialect/Async/IR/Async.cpp @@ -36,8 +36,9 @@ void AsyncDialect::initialize() { constexpr char kOperandSegmentSizesAttr[] = "operandSegmentSizes"; -OperandRange ExecuteOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(point == getBodyRegion() && "invalid region index"); +OperandRange ExecuteOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(successor.getSuccessor() == &getBodyRegion() && + "invalid region index"); return getBodyOperands(); } @@ -53,8 +54,10 @@ bool ExecuteOp::areTypesCompatible(Type lhs, Type rhs) { void ExecuteOp::getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { // The `body` region branch back to the parent operation. 
- if (point == getBodyRegion()) { - regions.push_back(RegionSuccessor(getBodyResults())); + if (!point.isParent() && + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getBodyRegion()) { + regions.push_back(RegionSuccessor(getOperation(), getBodyResults())); return; } diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp index b593cca..36a759c 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp @@ -562,8 +562,11 @@ LogicalResult BufferDeallocation::updateFunctionSignature(FunctionOpInterface op) { SmallVector<TypeRange> returnOperandTypes(llvm::map_range( op.getFunctionBody().getOps<RegionBranchTerminatorOpInterface>(), - [](RegionBranchTerminatorOpInterface op) { - return op.getSuccessorOperands(RegionBranchPoint::parent()).getTypes(); + [&](RegionBranchTerminatorOpInterface branchOp) { + return branchOp + .getSuccessorOperands(RegionSuccessor( + op.getOperation(), op.getOperation()->getResults())) + .getTypes(); })); if (!llvm::all_equal(returnOperandTypes)) return op->emitError( @@ -942,8 +945,8 @@ BufferDeallocation::handleInterface(RegionBranchTerminatorOpInterface op) { // about, but we would need to check how many successors there are and under // which condition they are taken, etc. - MutableOperandRange operands = - op.getMutableSuccessorOperands(RegionBranchPoint::parent()); + MutableOperandRange operands = op.getMutableSuccessorOperands( + RegionSuccessor(op.getOperation(), op.getOperation()->getResults())); SmallVector<Value> updatedOwnerships; auto result = deallocation_impl::insertDeallocOpForReturnLike( diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index 4754f0b..0992ce14 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -845,7 +845,8 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { // The `then` and the `else` region branch back to the parent operation. if (!point.isParent()) { - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); return; } @@ -854,7 +855,8 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point, // Don't consider the else region if it is empty. 
Region *elseRegion = &this->getElseRegion(); if (elseRegion->empty()) - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); else regions.push_back(RegionSuccessor(elseRegion)); } @@ -871,7 +873,7 @@ void IfOp::getEntrySuccessorRegions(ArrayRef<Attribute> operands, if (!getElseRegion().empty()) regions.emplace_back(&getElseRegion()); else - regions.emplace_back(); + regions.emplace_back(getOperation(), getOperation()->getResults()); } } diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index b5f8dda..6c6d8d2 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -2399,7 +2399,7 @@ ParseResult WarpExecuteOnLane0Op::parse(OpAsmParser &parser, void WarpExecuteOnLane0Op::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { if (!point.isParent()) { - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index eb2d825..bd25e94 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -495,13 +495,14 @@ FailureOr<PackResult> linalg::pack(RewriterBase &rewriter, if (failed(maybePackedDimForEachOperand)) return failure(); packedOperandsDims.packedDimForEachOperand = *maybePackedDimForEachOperand; - listOfPackedOperandsDim.pushBack(std::move(packedOperandsDims)); LDBG() << "++++ After pack size #" << i << ": " << packedSizes[i]; LDBG() << "maps: " << llvm::interleaved(indexingMaps); LDBG() << "iterators: " << llvm::interleaved(iteratorTypes); LDBG() << "packedDimForEachOperand: " << llvm::interleaved(packedOperandsDims.packedDimForEachOperand); + + listOfPackedOperandsDim.pushBack(std::move(packedOperandsDims)); } // Step 2. Propagate packing to all LinalgOp operands. diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index c551fba..1c21a2f 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -405,7 +405,7 @@ ParseResult AllocaScopeOp::parse(OpAsmParser &parser, OperationState &result) { void AllocaScopeOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { if (!point.isParent()) { - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } diff --git a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp index 6fa8ce4..69afbca 100644 --- a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp @@ -98,6 +98,27 @@ struct RankOpInterface } }; +struct CollapseShapeOpInterface + : public ValueBoundsOpInterface::ExternalModel<CollapseShapeOpInterface, + memref::CollapseShapeOp> { + void populateBoundsForShapedValueDim(Operation *op, Value value, int64_t dim, + ValueBoundsConstraintSet &cstr) const { + auto collapseOp = cast<memref::CollapseShapeOp>(op); + assert(value == collapseOp.getResult() && "invalid value"); + + // Multiply the expressions for the dimensions in the reassociation group. 
+ const ReassociationIndices reassocIndices = + collapseOp.getReassociationIndices()[dim]; + AffineExpr productExpr = + cstr.getExpr(collapseOp.getSrc(), reassocIndices[0]); + for (size_t i = 1; i < reassocIndices.size(); ++i) { + productExpr = + productExpr * cstr.getExpr(collapseOp.getSrc(), reassocIndices[i]); + } + cstr.bound(value)[dim] == productExpr; + } +}; + struct SubViewOpInterface : public ValueBoundsOpInterface::ExternalModel<SubViewOpInterface, SubViewOp> { @@ -134,6 +155,8 @@ void mlir::memref::registerValueBoundsOpInterfaceExternalModels( memref::AllocOpInterface<memref::AllocaOp>>(*ctx); memref::CastOp::attachInterface<memref::CastOpInterface>(*ctx); memref::DimOp::attachInterface<memref::DimOpInterface>(*ctx); + memref::CollapseShapeOp::attachInterface<memref::CollapseShapeOpInterface>( + *ctx); memref::ExpandShapeOp::attachInterface<memref::ExpandShapeOpInterface>( *ctx); memref::GetGlobalOp::attachInterface<memref::GetGlobalOpInterface>(*ctx); diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 1ab01d8..2946b53 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -397,7 +397,7 @@ void ExecuteRegionOp::getSuccessorRegions( } // Otherwise, the region branches back to the parent operation. - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } //===----------------------------------------------------------------------===// @@ -405,10 +405,11 @@ void ExecuteRegionOp::getSuccessorRegions( //===----------------------------------------------------------------------===// MutableOperandRange -ConditionOp::getMutableSuccessorOperands(RegionBranchPoint point) { - assert((point.isParent() || point == getParentOp().getAfter()) && - "condition op can only exit the loop or branch to the after" - "region"); +ConditionOp::getMutableSuccessorOperands(RegionSuccessor point) { + assert( + (point.isParent() || point.getSuccessor() == &getParentOp().getAfter()) && + "condition op can only exit the loop or branch to the after" + "region"); // Pass all operands except the condition to the successor region. return getArgsMutable(); } @@ -426,7 +427,7 @@ void ConditionOp::getSuccessorRegions( regions.emplace_back(&whileOp.getAfter(), whileOp.getAfter().getArguments()); if (!boolAttr || !boolAttr.getValue()) - regions.emplace_back(whileOp.getResults()); + regions.emplace_back(whileOp.getOperation(), whileOp.getResults()); } //===----------------------------------------------------------------------===// @@ -749,7 +750,7 @@ ForOp mlir::scf::getForInductionVarOwner(Value val) { return dyn_cast_or_null<ForOp>(containingOp); } -OperandRange ForOp::getEntrySuccessorOperands(RegionBranchPoint point) { +OperandRange ForOp::getEntrySuccessorOperands(RegionSuccessor successor) { return getInitArgs(); } @@ -759,7 +760,7 @@ void ForOp::getSuccessorRegions(RegionBranchPoint point, // back into the operation itself. It is possible for loop not to enter the // body. regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } SmallVector<Region *> ForallOp::getLoopRegions() { return {&getRegion()}; } @@ -2053,9 +2054,10 @@ void ForallOp::getSuccessorRegions(RegionBranchPoint point, // parallel by multiple threads. We should not expect to branch back into // the forall body after the region's execution is complete. 
if (point.isParent()) - regions.push_back(RegionSuccessor(&getRegion())); + regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); else - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); } //===----------------------------------------------------------------------===// @@ -2333,9 +2335,10 @@ void IfOp::print(OpAsmPrinter &p) { void IfOp::getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { - // The `then` and the `else` region branch back to the parent operation. + // The `then` and the `else` region branch back to the parent operation or one + // of the recursive parent operations (early exit case). if (!point.isParent()) { - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } @@ -2344,7 +2347,8 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point, // Don't consider the else region if it is empty. Region *elseRegion = &this->getElseRegion(); if (elseRegion->empty()) - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation(), getOperation()->getResults())); else regions.push_back(RegionSuccessor(elseRegion)); } @@ -2361,7 +2365,7 @@ void IfOp::getEntrySuccessorRegions(ArrayRef<Attribute> operands, if (!getElseRegion().empty()) regions.emplace_back(&getElseRegion()); else - regions.emplace_back(getResults()); + regions.emplace_back(getOperation(), getResults()); } } @@ -3385,7 +3389,8 @@ void ParallelOp::getSuccessorRegions( // back into the operation itself. It is possible for loop not to enter the // body. regions.push_back(RegionSuccessor(&getRegion())); - regions.push_back(RegionSuccessor()); + regions.push_back(RegionSuccessor( + getOperation(), ResultRange{getResults().end(), getResults().end()})); } //===----------------------------------------------------------------------===// @@ -3431,7 +3436,7 @@ LogicalResult ReduceOp::verifyRegions() { } MutableOperandRange -ReduceOp::getMutableSuccessorOperands(RegionBranchPoint point) { +ReduceOp::getMutableSuccessorOperands(RegionSuccessor point) { // No operands are forwarded to the next iteration. return MutableOperandRange(getOperation(), /*start=*/0, /*length=*/0); } @@ -3514,8 +3519,8 @@ Block::BlockArgListType WhileOp::getRegionIterArgs() { return getBeforeArguments(); } -OperandRange WhileOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(point == getBefore() && +OperandRange WhileOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(successor.getSuccessor() == &getBefore() && "WhileOp is expected to branch only to the first region"); return getInits(); } @@ -3528,15 +3533,18 @@ void WhileOp::getSuccessorRegions(RegionBranchPoint point, return; } - assert(llvm::is_contained({&getAfter(), &getBefore()}, point) && + assert(llvm::is_contained( + {&getAfter(), &getBefore()}, + point.getTerminatorPredecessorOrNull()->getParentRegion()) && "there are only two regions in a WhileOp"); // The body region always branches back to the condition region. 
- if (point == getAfter()) { + if (point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getAfter()) { regions.emplace_back(&getBefore(), getBefore().getArguments()); return; } - regions.emplace_back(getResults()); + regions.emplace_back(getOperation(), getResults()); regions.emplace_back(&getAfter(), getAfter().getArguments()); } @@ -4445,7 +4453,7 @@ void IndexSwitchOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &successors) { // All regions branch back to the parent op. if (!point.isParent()) { - successors.emplace_back(getResults()); + successors.emplace_back(getOperation(), getResults()); return; } diff --git a/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp b/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp index ae52af5..ddcbda8 100644 --- a/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ForToWhile.cpp @@ -23,7 +23,6 @@ namespace mlir { #include "mlir/Dialect/SCF/Transforms/Passes.h.inc" } // namespace mlir -using namespace llvm; using namespace mlir; using scf::ForOp; using scf::WhileOp; diff --git a/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp index a2f03f1..00bef70 100644 --- a/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp @@ -21,7 +21,6 @@ namespace mlir { #include "mlir/Dialect/SCF/Transforms/Passes.h.inc" } // namespace mlir -using namespace llvm; using namespace mlir; using scf::LoopNest; diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index 5ba8289..f0f22e5 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -346,7 +346,7 @@ void AssumingOp::getSuccessorRegions( // parent, so return the correct RegionSuccessor purely based on the index // being None or 0. if (!point.isParent()) { - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 1a9d9e1..3962e3e 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -2597,7 +2597,7 @@ std::optional<MutableArrayRef<OpOperand>> IterateOp::getYieldedValuesMutable() { std::optional<ResultRange> IterateOp::getLoopResults() { return getResults(); } -OperandRange IterateOp::getEntrySuccessorOperands(RegionBranchPoint point) { +OperandRange IterateOp::getEntrySuccessorOperands(RegionSuccessor successor) { return getInitArgs(); } @@ -2607,7 +2607,7 @@ void IterateOp::getSuccessorRegions(RegionBranchPoint point, // or back into the operation itself. regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); // It is possible for loop not to enter the body. 
- regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } void CoIterateOp::build(OpBuilder &builder, OperationState &odsState, diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp index f53d272..ffa8b40 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp @@ -152,19 +152,20 @@ IterationGraphSorter IterationGraphSorter::fromGenericOp( } IterationGraphSorter::IterationGraphSorter( - SmallVector<Value> &&ins, SmallVector<AffineMap> &&loop2InsLvl, Value out, - AffineMap loop2OutLvl, SmallVector<utils::IteratorType> &&iterTypes, + SmallVector<Value> &&insArg, SmallVector<AffineMap> &&loop2InsLvlArg, + Value out, AffineMap loop2OutLvl, + SmallVector<utils::IteratorType> &&iterTypesArg, sparse_tensor::LoopOrderingStrategy strategy) - : ins(std::move(ins)), loop2InsLvl(std::move(loop2InsLvl)), out(out), - loop2OutLvl(loop2OutLvl), iterTypes(std::move(iterTypes)), + : ins(std::move(insArg)), loop2InsLvl(std::move(loop2InsLvlArg)), out(out), + loop2OutLvl(loop2OutLvl), iterTypes(std::move(iterTypesArg)), strategy(strategy) { // One map per tensor. - assert(this->loop2InsLvl.size() == this->ins.size()); + assert(loop2InsLvl.size() == ins.size()); // All the affine maps have the same number of dimensions (loops). assert(llvm::all_equal(llvm::map_range( - this->loop2InsLvl, [](AffineMap m) { return m.getNumDims(); }))); + loop2InsLvl, [](AffineMap m) { return m.getNumDims(); }))); // The number of results of the map should match the rank of the tensor. - assert(llvm::all_of(llvm::zip(this->loop2InsLvl, this->ins), [](auto mvPair) { + assert(llvm::all_of(llvm::zip(loop2InsLvl, ins), [](auto mvPair) { auto [m, v] = mvPair; // For ranked types the rank must match. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h index b2a16e9..35e58ed 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.h @@ -59,10 +59,10 @@ public: private: // Private constructor. 
- IterationGraphSorter(SmallVector<Value> &&ins, - SmallVector<AffineMap> &&loop2InsLvl, Value out, + IterationGraphSorter(SmallVector<Value> &&insArg, + SmallVector<AffineMap> &&loop2InsLvlArg, Value out, AffineMap loop2OutLvl, - SmallVector<utils::IteratorType> &&iterTypes, + SmallVector<utils::IteratorType> &&iterTypesArg, sparse_tensor::LoopOrderingStrategy strategy = sparse_tensor::LoopOrderingStrategy::kDefault); diff --git a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp index 1e3b377..549ac7a 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/SwapExtractSliceWithProducerPatterns.cpp @@ -77,7 +77,7 @@ FailureOr<TilingResult> tensor::replaceInsertSlicesWithTiledConsumer( dyn_cast<TilingInterface>(consumerOperands.front()->getOwner()); if (!consumerOp) return failure(); - for (auto opOperand : consumerOperands.drop_front()) { + for (auto *opOperand : consumerOperands.drop_front()) { if (opOperand->getOwner() != consumerOp) { LLVM_DEBUG({ llvm::dbgs() diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index 365afab..062606e 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -96,9 +96,9 @@ ensurePayloadIsSeparateFromTransform(transform::TransformOpInterface transform, // AlternativesOp //===----------------------------------------------------------------------===// -OperandRange -transform::AlternativesOp::getEntrySuccessorOperands(RegionBranchPoint point) { - if (!point.isParent() && getOperation()->getNumOperands() == 1) +OperandRange transform::AlternativesOp::getEntrySuccessorOperands( + RegionSuccessor successor) { + if (!successor.isParent() && getOperation()->getNumOperands() == 1) return getOperation()->getOperands(); return OperandRange(getOperation()->operand_end(), getOperation()->operand_end()); @@ -107,15 +107,18 @@ transform::AlternativesOp::getEntrySuccessorOperands(RegionBranchPoint point) { void transform::AlternativesOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) { for (Region &alternative : llvm::drop_begin( - getAlternatives(), - point.isParent() ? 0 - : point.getRegionOrNull()->getRegionNumber() + 1)) { + getAlternatives(), point.isParent() + ? 0 + : point.getTerminatorPredecessorOrNull() + ->getParentRegion() + ->getRegionNumber() + + 1)) { regions.emplace_back(&alternative, !getOperands().empty() ? alternative.getArguments() : Block::BlockArgListType()); } if (!point.isParent()) - regions.emplace_back(getOperation()->getResults()); + regions.emplace_back(getOperation(), getOperation()->getResults()); } void transform::AlternativesOp::getRegionInvocationBounds( @@ -1740,16 +1743,18 @@ void transform::ForeachOp::getSuccessorRegions( } // Branch back to the region or the parent. 
- assert(point == getBody() && "unexpected region index"); + assert(point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getBody() && + "unexpected region index"); regions.emplace_back(bodyRegion, bodyRegion->getArguments()); - regions.emplace_back(); + regions.emplace_back(getOperation(), getOperation()->getResults()); } OperandRange -transform::ForeachOp::getEntrySuccessorOperands(RegionBranchPoint point) { +transform::ForeachOp::getEntrySuccessorOperands(RegionSuccessor successor) { // Each block argument handle is mapped to a subset (one op to be precise) // of the payload of the corresponding `targets` operand of ForeachOp. - assert(point == getBody() && "unexpected region index"); + assert(successor.getSuccessor() == &getBody() && "unexpected region index"); return getOperation()->getOperands(); } @@ -2948,8 +2953,8 @@ void transform::SequenceOp::getEffects( } OperandRange -transform::SequenceOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(point == getBody() && "unexpected region index"); +transform::SequenceOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(successor.getSuccessor() == &getBody() && "unexpected region index"); if (getOperation()->getNumOperands() > 0) return getOperation()->getOperands(); return OperandRange(getOperation()->operand_end(), @@ -2966,8 +2971,10 @@ void transform::SequenceOp::getSuccessorRegions( return; } - assert(point == getBody() && "unexpected region index"); - regions.emplace_back(getOperation()->getResults()); + assert(point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getBody() && + "unexpected region index"); + regions.emplace_back(getOperation(), getOperation()->getResults()); } void transform::SequenceOp::getRegionInvocationBounds( diff --git a/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp b/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp index c627158..f727118 100644 --- a/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp +++ b/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp @@ -9,6 +9,7 @@ #include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" #include "llvm/Support/Debug.h" #include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h" @@ -112,7 +113,7 @@ static void printAlternativesOpSelectedRegion(OpAsmPrinter &printer, } OperandRange transform::tune::AlternativesOp::getEntrySuccessorOperands( - RegionBranchPoint point) { + RegionSuccessor successor) { // No operands will be forwarded to the region(s). 
return getOperands().slice(0, 0); } @@ -128,7 +129,7 @@ void transform::tune::AlternativesOp::getSuccessorRegions( for (Region &alternative : getAlternatives()) regions.emplace_back(&alternative, Block::BlockArgListType()); else - regions.emplace_back(getOperation()->getResults()); + regions.emplace_back(getOperation(), getOperation()->getResults()); } void transform::tune::AlternativesOp::getRegionInvocationBounds( diff --git a/mlir/lib/IR/Diagnostics.cpp b/mlir/lib/IR/Diagnostics.cpp index 776b5c6..f4c9242 100644 --- a/mlir/lib/IR/Diagnostics.cpp +++ b/mlir/lib/IR/Diagnostics.cpp @@ -138,6 +138,10 @@ Diagnostic &Diagnostic::operator<<(Operation &op) { return appendOp(op, OpPrintingFlags()); } +Diagnostic &Diagnostic::operator<<(OpWithFlags op) { + return appendOp(*op.getOperation(), op.flags()); +} + Diagnostic &Diagnostic::appendOp(Operation &op, const OpPrintingFlags &flags) { std::string str; llvm::raw_string_ostream os(str); diff --git a/mlir/lib/IR/Region.cpp b/mlir/lib/IR/Region.cpp index 46b6298..15a941f 100644 --- a/mlir/lib/IR/Region.cpp +++ b/mlir/lib/IR/Region.cpp @@ -253,6 +253,21 @@ void Region::OpIterator::skipOverBlocksWithNoOps() { operation = block->begin(); } +llvm::raw_ostream &mlir::operator<<(llvm::raw_ostream &os, Region ®ion) { + if (!region.getParentOp()) { + os << "Region has no parent op"; + } else { + os << "Region #" << region.getRegionNumber() << " in operation " + << region.getParentOp()->getName(); + } + for (auto it : llvm::enumerate(region.getBlocks())) { + os << "\n Block #" << it.index() << ":"; + for (Operation &op : it.value().getOperations()) + os << "\n " << OpWithFlags(&op, OpPrintingFlags().skipRegions()); + } + return os; +} + //===----------------------------------------------------------------------===// // RegionRange //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp index ca3f766..1e56810 100644 --- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp +++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp @@ -9,7 +9,9 @@ #include <utility> #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" +#include "llvm/Support/DebugLog.h" using namespace mlir; @@ -38,20 +40,31 @@ SuccessorOperands::SuccessorOperands(unsigned int producedOperandCount, std::optional<BlockArgument> detail::getBranchSuccessorArgument(const SuccessorOperands &operands, unsigned operandIndex, Block *successor) { + LDBG() << "Getting branch successor argument for operand index " + << operandIndex << " in successor block"; + OperandRange forwardedOperands = operands.getForwardedOperands(); // Check that the operands are valid. - if (forwardedOperands.empty()) + if (forwardedOperands.empty()) { + LDBG() << "No forwarded operands, returning nullopt"; return std::nullopt; + } // Check to ensure that this operand is within the range. unsigned operandsStart = forwardedOperands.getBeginOperandIndex(); if (operandIndex < operandsStart || - operandIndex >= (operandsStart + forwardedOperands.size())) + operandIndex >= (operandsStart + forwardedOperands.size())) { + LDBG() << "Operand index " << operandIndex << " out of range [" + << operandsStart << ", " + << (operandsStart + forwardedOperands.size()) + << "), returning nullopt"; return std::nullopt; + } // Index the successor. 
unsigned argIndex = operands.getProducedOperandCount() + operandIndex - operandsStart; + LDBG() << "Computed argument index " << argIndex << " for successor block"; return successor->getArgument(argIndex); } @@ -59,9 +72,15 @@ detail::getBranchSuccessorArgument(const SuccessorOperands &operands, LogicalResult detail::verifyBranchSuccessorOperands(Operation *op, unsigned succNo, const SuccessorOperands &operands) { + LDBG() << "Verifying branch successor operands for successor #" << succNo + << " in operation " << op->getName(); + // Check the count. unsigned operandCount = operands.size(); Block *destBB = op->getSuccessor(succNo); + LDBG() << "Branch has " << operandCount << " operands, target block has " + << destBB->getNumArguments() << " arguments"; + if (operandCount != destBB->getNumArguments()) return op->emitError() << "branch has " << operandCount << " operands for successor #" << succNo @@ -69,13 +88,22 @@ detail::verifyBranchSuccessorOperands(Operation *op, unsigned succNo, << destBB->getNumArguments(); // Check the types. + LDBG() << "Checking type compatibility for " + << (operandCount - operands.getProducedOperandCount()) + << " forwarded operands"; for (unsigned i = operands.getProducedOperandCount(); i != operandCount; ++i) { - if (!cast<BranchOpInterface>(op).areTypesCompatible( - operands[i].getType(), destBB->getArgument(i).getType())) + Type operandType = operands[i].getType(); + Type argType = destBB->getArgument(i).getType(); + LDBG() << "Checking type compatibility: operand type " << operandType + << " vs argument type " << argType; + + if (!cast<BranchOpInterface>(op).areTypesCompatible(operandType, argType)) return op->emitError() << "type mismatch for bb argument #" << i << " of successor #" << succNo; } + + LDBG() << "Branch successor operand verification successful"; return success(); } @@ -126,15 +154,15 @@ LogicalResult detail::verifyRegionBranchWeights(Operation *op) { static InFlightDiagnostic &printRegionEdgeName(InFlightDiagnostic &diag, RegionBranchPoint sourceNo, - RegionBranchPoint succRegionNo) { + RegionSuccessor succRegionNo) { diag << "from "; - if (Region *region = sourceNo.getRegionOrNull()) - diag << "Region #" << region->getRegionNumber(); + if (Operation *op = sourceNo.getTerminatorPredecessorOrNull()) + diag << "Operation " << op->getName(); else diag << "parent operands"; diag << " to "; - if (Region *region = succRegionNo.getRegionOrNull()) + if (Region *region = succRegionNo.getSuccessor()) diag << "Region #" << region->getRegionNumber(); else diag << "parent results"; @@ -145,13 +173,12 @@ static InFlightDiagnostic &printRegionEdgeName(InFlightDiagnostic &diag, /// `sourcePoint`. `getInputsTypesForRegion` is a function that returns the /// types of the inputs that flow to a successor region. 
static LogicalResult -verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, - function_ref<FailureOr<TypeRange>(RegionBranchPoint)> +verifyTypesAlongAllEdges(RegionBranchOpInterface branchOp, + RegionBranchPoint sourcePoint, + function_ref<FailureOr<TypeRange>(RegionSuccessor)> getInputsTypesForRegion) { - auto regionInterface = cast<RegionBranchOpInterface>(op); - SmallVector<RegionSuccessor, 2> successors; - regionInterface.getSuccessorRegions(sourcePoint, successors); + branchOp.getSuccessorRegions(sourcePoint, successors); for (RegionSuccessor &succ : successors) { FailureOr<TypeRange> sourceTypes = getInputsTypesForRegion(succ); @@ -160,10 +187,14 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, TypeRange succInputsTypes = succ.getSuccessorInputs().getTypes(); if (sourceTypes->size() != succInputsTypes.size()) { - InFlightDiagnostic diag = op->emitOpError("region control flow edge "); + InFlightDiagnostic diag = + branchOp->emitOpError("region control flow edge "); + std::string succStr; + llvm::raw_string_ostream os(succStr); + os << succ; return printRegionEdgeName(diag, sourcePoint, succ) << ": source has " << sourceTypes->size() - << " operands, but target successor needs " + << " operands, but target successor " << os.str() << " needs " << succInputsTypes.size(); } @@ -171,8 +202,10 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, llvm::enumerate(llvm::zip(*sourceTypes, succInputsTypes))) { Type sourceType = std::get<0>(typesIdx.value()); Type inputType = std::get<1>(typesIdx.value()); - if (!regionInterface.areTypesCompatible(sourceType, inputType)) { - InFlightDiagnostic diag = op->emitOpError("along control flow edge "); + + if (!branchOp.areTypesCompatible(sourceType, inputType)) { + InFlightDiagnostic diag = + branchOp->emitOpError("along control flow edge "); return printRegionEdgeName(diag, sourcePoint, succ) << ": source type #" << typesIdx.index() << " " << sourceType << " should match input type #" << typesIdx.index() << " " @@ -180,6 +213,7 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, } } } + return success(); } @@ -187,34 +221,18 @@ verifyTypesAlongAllEdges(Operation *op, RegionBranchPoint sourcePoint, LogicalResult detail::verifyTypesAlongControlFlowEdges(Operation *op) { auto regionInterface = cast<RegionBranchOpInterface>(op); - auto inputTypesFromParent = [&](RegionBranchPoint point) -> TypeRange { - return regionInterface.getEntrySuccessorOperands(point).getTypes(); + auto inputTypesFromParent = [&](RegionSuccessor successor) -> TypeRange { + return regionInterface.getEntrySuccessorOperands(successor).getTypes(); }; // Verify types along control flow edges originating from the parent. - if (failed(verifyTypesAlongAllEdges(op, RegionBranchPoint::parent(), - inputTypesFromParent))) + if (failed(verifyTypesAlongAllEdges( + regionInterface, RegionBranchPoint::parent(), inputTypesFromParent))) return failure(); - auto areTypesCompatible = [&](TypeRange lhs, TypeRange rhs) { - if (lhs.size() != rhs.size()) - return false; - for (auto types : llvm::zip(lhs, rhs)) { - if (!regionInterface.areTypesCompatible(std::get<0>(types), - std::get<1>(types))) { - return false; - } - } - return true; - }; - // Verify types along control flow edges originating from each region. 
for (Region ®ion : op->getRegions()) { - - // Since there can be multiple terminators implementing the - // `RegionBranchTerminatorOpInterface`, all should have the same operand - // types when passing them to the same region. - + // Collect all return-like terminators in the region. SmallVector<RegionBranchTerminatorOpInterface> regionReturnOps; for (Block &block : region) if (!block.empty()) @@ -227,33 +245,20 @@ LogicalResult detail::verifyTypesAlongControlFlowEdges(Operation *op) { if (regionReturnOps.empty()) continue; - auto inputTypesForRegion = - [&](RegionBranchPoint point) -> FailureOr<TypeRange> { - std::optional<OperandRange> regionReturnOperands; - for (RegionBranchTerminatorOpInterface regionReturnOp : regionReturnOps) { - auto terminatorOperands = regionReturnOp.getSuccessorOperands(point); - - if (!regionReturnOperands) { - regionReturnOperands = terminatorOperands; - continue; - } - - // Found more than one ReturnLike terminator. Make sure the operand - // types match with the first one. - if (!areTypesCompatible(regionReturnOperands->getTypes(), - terminatorOperands.getTypes())) { - InFlightDiagnostic diag = op->emitOpError("along control flow edge"); - return printRegionEdgeName(diag, region, point) - << " operands mismatch between return-like terminators"; - } - } - - // All successors get the same set of operand types. - return TypeRange(regionReturnOperands->getTypes()); - }; - - if (failed(verifyTypesAlongAllEdges(op, region, inputTypesForRegion))) - return failure(); + // Verify types along control flow edges originating from each return-like + // terminator. + for (RegionBranchTerminatorOpInterface regionReturnOp : regionReturnOps) { + + auto inputTypesForRegion = + [&](RegionSuccessor successor) -> FailureOr<TypeRange> { + OperandRange terminatorOperands = + regionReturnOp.getSuccessorOperands(successor); + return TypeRange(terminatorOperands.getTypes()); + }; + if (failed(verifyTypesAlongAllEdges(regionInterface, regionReturnOp, + inputTypesForRegion))) + return failure(); + } } return success(); @@ -272,31 +277,74 @@ using StopConditionFn = function_ref<bool(Region *, ArrayRef<bool> visited)>; static bool traverseRegionGraph(Region *begin, StopConditionFn stopConditionFn) { auto op = cast<RegionBranchOpInterface>(begin->getParentOp()); + LDBG() << "Starting region graph traversal from region #" + << begin->getRegionNumber() << " in operation " << op->getName(); + SmallVector<bool> visited(op->getNumRegions(), false); visited[begin->getRegionNumber()] = true; + LDBG() << "Initialized visited array with " << op->getNumRegions() + << " regions"; // Retrieve all successors of the region and enqueue them in the worklist. 
SmallVector<Region *> worklist; auto enqueueAllSuccessors = [&](Region *region) { - SmallVector<RegionSuccessor> successors; - op.getSuccessorRegions(region, successors); - for (RegionSuccessor successor : successors) - if (!successor.isParent()) - worklist.push_back(successor.getSuccessor()); + LDBG() << "Enqueuing successors for region #" << region->getRegionNumber(); + SmallVector<Attribute> operandAttributes(op->getNumOperands()); + for (Block &block : *region) { + if (block.empty()) + continue; + auto terminator = + dyn_cast<RegionBranchTerminatorOpInterface>(block.back()); + if (!terminator) + continue; + SmallVector<RegionSuccessor> successors; + operandAttributes.resize(terminator->getNumOperands()); + terminator.getSuccessorRegions(operandAttributes, successors); + LDBG() << "Found " << successors.size() + << " successors from terminator in block"; + for (RegionSuccessor successor : successors) { + if (!successor.isParent()) { + worklist.push_back(successor.getSuccessor()); + LDBG() << "Added region #" + << successor.getSuccessor()->getRegionNumber() + << " to worklist"; + } else { + LDBG() << "Skipping parent successor"; + } + } + } }; enqueueAllSuccessors(begin); + LDBG() << "Initial worklist size: " << worklist.size(); // Process all regions in the worklist via DFS. while (!worklist.empty()) { Region *nextRegion = worklist.pop_back_val(); - if (stopConditionFn(nextRegion, visited)) + LDBG() << "Processing region #" << nextRegion->getRegionNumber() + << " from worklist (remaining: " << worklist.size() << ")"; + + if (stopConditionFn(nextRegion, visited)) { + LDBG() << "Stop condition met for region #" + << nextRegion->getRegionNumber() << ", returning true"; return true; - if (visited[nextRegion->getRegionNumber()]) + } + llvm::dbgs() << "Region: " << nextRegion << "\n"; + if (!nextRegion->getParentOp()) { + llvm::errs() << "Region " << *nextRegion << " has no parent op\n"; + return false; + } + if (visited[nextRegion->getRegionNumber()]) { + LDBG() << "Region #" << nextRegion->getRegionNumber() + << " already visited, skipping"; continue; + } visited[nextRegion->getRegionNumber()] = true; + LDBG() << "Marking region #" << nextRegion->getRegionNumber() + << " as visited"; enqueueAllSuccessors(nextRegion); } + LDBG() << "Traversal completed, returning false"; return false; } @@ -322,18 +370,26 @@ static bool isRegionReachable(Region *begin, Region *r) { /// mutually exclusive if they are not reachable from each other as per /// RegionBranchOpInterface::getSuccessorRegions. bool mlir::insideMutuallyExclusiveRegions(Operation *a, Operation *b) { + LDBG() << "Checking if operations are in mutually exclusive regions: " + << a->getName() << " and " << b->getName(); + assert(a && "expected non-empty operation"); assert(b && "expected non-empty operation"); auto branchOp = a->getParentOfType<RegionBranchOpInterface>(); while (branchOp) { + LDBG() << "Checking branch operation " << branchOp->getName(); + // Check if b is inside branchOp. (We already know that a is.) if (!branchOp->isProperAncestor(b)) { + LDBG() << "Operation b is not inside branchOp, checking next ancestor"; // Check next enclosing RegionBranchOpInterface. branchOp = branchOp->getParentOfType<RegionBranchOpInterface>(); continue; } + LDBG() << "Both operations are inside branchOp, finding their regions"; + // b is contained in branchOp. Retrieve the regions in which `a` and `b` // are contained. 
Region *regionA = nullptr, *regionB = nullptr; @@ -341,63 +397,136 @@ bool mlir::insideMutuallyExclusiveRegions(Operation *a, Operation *b) { if (r.findAncestorOpInRegion(*a)) { assert(!regionA && "already found a region for a"); regionA = &r; + LDBG() << "Found region #" << r.getRegionNumber() << " for operation a"; } if (r.findAncestorOpInRegion(*b)) { assert(!regionB && "already found a region for b"); regionB = &r; + LDBG() << "Found region #" << r.getRegionNumber() << " for operation b"; } } assert(regionA && regionB && "could not find region of op"); + LDBG() << "Region A: #" << regionA->getRegionNumber() << ", Region B: #" + << regionB->getRegionNumber(); + // `a` and `b` are in mutually exclusive regions if both regions are // distinct and neither region is reachable from the other region. - return regionA != regionB && !isRegionReachable(regionA, regionB) && - !isRegionReachable(regionB, regionA); + bool regionsAreDistinct = (regionA != regionB); + bool aNotReachableFromB = !isRegionReachable(regionA, regionB); + bool bNotReachableFromA = !isRegionReachable(regionB, regionA); + + LDBG() << "Regions distinct: " << regionsAreDistinct + << ", A not reachable from B: " << aNotReachableFromB + << ", B not reachable from A: " << bNotReachableFromA; + + bool mutuallyExclusive = + regionsAreDistinct && aNotReachableFromB && bNotReachableFromA; + LDBG() << "Operations are mutually exclusive: " << mutuallyExclusive; + + return mutuallyExclusive; } // Could not find a common RegionBranchOpInterface among a's and b's // ancestors. + LDBG() << "No common RegionBranchOpInterface found, operations are not " + "mutually exclusive"; return false; } bool RegionBranchOpInterface::isRepetitiveRegion(unsigned index) { + LDBG() << "Checking if region #" << index << " is repetitive in operation " + << getOperation()->getName(); + Region *region = &getOperation()->getRegion(index); - return isRegionReachable(region, region); + bool isRepetitive = isRegionReachable(region, region); + + LDBG() << "Region #" << index << " is repetitive: " << isRepetitive; + return isRepetitive; } bool RegionBranchOpInterface::hasLoop() { + LDBG() << "Checking if operation " << getOperation()->getName() + << " has loops"; + SmallVector<RegionSuccessor> entryRegions; getSuccessorRegions(RegionBranchPoint::parent(), entryRegions); - for (RegionSuccessor successor : entryRegions) - if (!successor.isParent() && - traverseRegionGraph(successor.getSuccessor(), - [](Region *nextRegion, ArrayRef<bool> visited) { - // Interrupt traversal if the region was already - // visited. - return visited[nextRegion->getRegionNumber()]; - })) - return true; + LDBG() << "Found " << entryRegions.size() << " entry regions"; + + for (RegionSuccessor successor : entryRegions) { + if (!successor.isParent()) { + LDBG() << "Checking entry region #" + << successor.getSuccessor()->getRegionNumber() << " for loops"; + + bool hasLoop = + traverseRegionGraph(successor.getSuccessor(), + [](Region *nextRegion, ArrayRef<bool> visited) { + // Interrupt traversal if the region was already + // visited. 
+ return visited[nextRegion->getRegionNumber()]; + }); + + if (hasLoop) { + LDBG() << "Found loop in entry region #" + << successor.getSuccessor()->getRegionNumber(); + return true; + } + } else { + LDBG() << "Skipping parent successor"; + } + } + + LDBG() << "No loops found in operation"; return false; } Region *mlir::getEnclosingRepetitiveRegion(Operation *op) { + LDBG() << "Finding enclosing repetitive region for operation " + << op->getName(); + while (Region *region = op->getParentRegion()) { + LDBG() << "Checking region #" << region->getRegionNumber() + << " in operation " << region->getParentOp()->getName(); + op = region->getParentOp(); - if (auto branchOp = dyn_cast<RegionBranchOpInterface>(op)) - if (branchOp.isRepetitiveRegion(region->getRegionNumber())) + if (auto branchOp = dyn_cast<RegionBranchOpInterface>(op)) { + LDBG() + << "Found RegionBranchOpInterface, checking if region is repetitive"; + if (branchOp.isRepetitiveRegion(region->getRegionNumber())) { + LDBG() << "Found repetitive region #" << region->getRegionNumber(); return region; + } + } else { + LDBG() << "Parent operation does not implement RegionBranchOpInterface"; + } } + + LDBG() << "No enclosing repetitive region found"; return nullptr; } Region *mlir::getEnclosingRepetitiveRegion(Value value) { + LDBG() << "Finding enclosing repetitive region for value"; + Region *region = value.getParentRegion(); while (region) { + LDBG() << "Checking region #" << region->getRegionNumber() + << " in operation " << region->getParentOp()->getName(); + Operation *op = region->getParentOp(); - if (auto branchOp = dyn_cast<RegionBranchOpInterface>(op)) - if (branchOp.isRepetitiveRegion(region->getRegionNumber())) + if (auto branchOp = dyn_cast<RegionBranchOpInterface>(op)) { + LDBG() + << "Found RegionBranchOpInterface, checking if region is repetitive"; + if (branchOp.isRepetitiveRegion(region->getRegionNumber())) { + LDBG() << "Found repetitive region #" << region->getRegionNumber(); return region; + } + } else { + LDBG() << "Parent operation does not implement RegionBranchOpInterface"; + } region = op->getParentRegion(); } + + LDBG() << "No enclosing repetitive region found for value"; return nullptr; } diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp index e0c65b0..41f3f9d 100644 --- a/mlir/lib/Transforms/RemoveDeadValues.cpp +++ b/mlir/lib/Transforms/RemoveDeadValues.cpp @@ -432,8 +432,7 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, // Return the successors of `region` if the latter is not null. Else return // the successors of `regionBranchOp`. - auto getSuccessors = [&](Region *region = nullptr) { - auto point = region ? region : RegionBranchPoint::parent(); + auto getSuccessors = [&](RegionBranchPoint point) { SmallVector<RegionSuccessor> successors; regionBranchOp.getSuccessorRegions(point, successors); return successors; @@ -456,7 +455,8 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, // `nonForwardedOperands`. 
auto markNonForwardedOperands = [&](BitVector &nonForwardedOperands) { nonForwardedOperands.resize(regionBranchOp->getNumOperands(), true); - for (const RegionSuccessor &successor : getSuccessors()) { + for (const RegionSuccessor &successor : + getSuccessors(RegionBranchPoint::parent())) { for (OpOperand *opOperand : getForwardedOpOperands(successor)) nonForwardedOperands.reset(opOperand->getOperandNumber()); } @@ -469,10 +469,13 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, for (Region ®ion : regionBranchOp->getRegions()) { if (region.empty()) continue; + // TODO: this isn't correct in face of multiple terminators. Operation *terminator = region.front().getTerminator(); nonForwardedRets[terminator] = BitVector(terminator->getNumOperands(), true); - for (const RegionSuccessor &successor : getSuccessors(®ion)) { + for (const RegionSuccessor &successor : + getSuccessors(RegionBranchPoint( + cast<RegionBranchTerminatorOpInterface>(terminator)))) { for (OpOperand *opOperand : getForwardedOpOperands(successor, terminator)) nonForwardedRets[terminator].reset(opOperand->getOperandNumber()); @@ -489,8 +492,13 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, DenseMap<Region *, BitVector> &argsToKeep, Region *region = nullptr) { Operation *terminator = region ? region->front().getTerminator() : nullptr; + RegionBranchPoint point = + terminator + ? RegionBranchPoint( + cast<RegionBranchTerminatorOpInterface>(terminator)) + : RegionBranchPoint::parent(); - for (const RegionSuccessor &successor : getSuccessors(region)) { + for (const RegionSuccessor &successor : getSuccessors(point)) { Region *successorRegion = successor.getSuccessor(); for (auto [opOperand, input] : llvm::zip(getForwardedOpOperands(successor, terminator), @@ -517,7 +525,8 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, resultsOrArgsToKeepChanged = false; // Recompute `resultsToKeep` and `argsToKeep` based on `operandsToKeep`. 
- for (const RegionSuccessor &successor : getSuccessors()) { + for (const RegionSuccessor &successor : + getSuccessors(RegionBranchPoint::parent())) { Region *successorRegion = successor.getSuccessor(); for (auto [opOperand, input] : llvm::zip(getForwardedOpOperands(successor), @@ -551,7 +560,9 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp, if (region.empty()) continue; Operation *terminator = region.front().getTerminator(); - for (const RegionSuccessor &successor : getSuccessors(®ion)) { + for (const RegionSuccessor &successor : + getSuccessors(RegionBranchPoint( + cast<RegionBranchTerminatorOpInterface>(terminator)))) { Region *successorRegion = successor.getSuccessor(); for (auto [opOperand, input] : llvm::zip(getForwardedOpOperands(successor, terminator), diff --git a/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir new file mode 100644 index 0000000..bcbdef0 --- /dev/null +++ b/mlir/test/Conversion/AMDGPUToROCDL/wmma-gfx1250.mlir @@ -0,0 +1,89 @@ +// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --allow-unregistered-dialect | FileCheck %s + +// CHECK-LABEL: @wmma_k4 +func.func @wmma_k4(%arg0 : vector<2xf32>, %arg1 : vector<8xf32>) { + // CHECK: rocdl.wmma.f32.16x16x4.f32 %arg0, %arg0, %arg1 + amdgpu.wmma 16x16x4 %arg0 * %arg0 + %arg1 : vector<2xf32>, vector<2xf32>, vector<8xf32> + func.return +} + +// CHECK-LABEL: @wmma_k32 +func.func @wmma_k32(%arg0 : vector<16xf16>, %arg1 : vector<16xbf16>, %arg2 : vector<8xf32>, + %arg3 : vector<8xf16>, %arg4 : vector<8xbf16>) { + // CHECK: rocdl.wmma.f32.16x16x32.f16 %arg0, %arg0, %arg2 + amdgpu.wmma 16x16x32 %arg0 * %arg0 + %arg2 : vector<16xf16>, vector<16xf16>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x32.f16 %arg0, %arg0, {{.*}} : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) + amdgpu.wmma 16x16x32 %arg0 * %arg0 + %arg3 : vector<16xf16>, vector<16xf16>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x32.bf16 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x32 %arg1 * %arg1 + %arg2 : vector<16xbf16>, vector<16xbf16>, vector<8xf32> + + // CHECK: rocdl.wmma.bf16.16x16x32.bf16 {{.*}}, {{.*}}, {{.*}}, {{.*}} : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) + amdgpu.wmma 16x16x32 %arg1 * %arg1 + %arg4 : vector<16xbf16>, vector<16xbf16>, vector<8xbf16> + + func.return +} + +// CHECK-LABEL: @wmma_k64 +func.func @wmma_k64(%arg0 : vector<32xi8>, %arg1 : vector<32xf8E4M3FN>, %arg2 : vector<32xf8E5M2>, + %arg3 : vector<8xi32>, %arg4 : vector<8xf32>, %arg5 : vector<8xf16>) { + // CHECK: rocdl.wmma.i32.16x16x64.iu8 {{.*}}, {{.*}}, {{.*}}, {{.*}}, %arg3, {{.*}} + amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg3 {clamp} : vector<32xi8>, vector<32xi8>, vector<8xi32> + + // CHECK: rocdl.wmma.f32.16x16x64.fp8_fp8 {{.*}}, {{.*}}, %arg4 + amdgpu.wmma 16x16x64 %arg1 * %arg1 + %arg4 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x64.fp8_fp8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x64 %arg1 * %arg1 + %arg5 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x64.fp8_bf8 {{.*}}, {{.*}}, %arg4 + amdgpu.wmma 16x16x64 %arg1 * %arg2 + %arg4 : vector<32xf8E4M3FN>, vector<32xf8E5M2>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x64.fp8_bf8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x64 %arg1 * %arg2 + %arg5 : vector<32xf8E4M3FN>, vector<32xf8E5M2>, vector<8xf16> + + 
// CHECK: rocdl.wmma.f32.16x16x64.bf8_bf8 {{.*}}, {{.*}}, %arg4 + amdgpu.wmma 16x16x64 %arg2 * %arg2 + %arg4 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x64.bf8_bf8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x64 %arg2 * %arg2 + %arg5 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x64.bf8_fp8 {{.*}}, {{.*}}, %arg4 + amdgpu.wmma 16x16x64 %arg2 * %arg1 + %arg4 : vector<32xf8E5M2>, vector<32xf8E4M3FN>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x64.bf8_fp8 {{.*}}, {{.*}}, %arg5, {{.*}} : (vector<8xi32>, vector<8xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x64 %arg2 * %arg1 + %arg5 : vector<32xf8E5M2>, vector<32xf8E4M3FN>, vector<8xf16> + + func.return +} + +// CHECK-LABEL: @wmma_k128 +func.func @wmma_k128(%arg0 : vector<64xf8E4M3FN>, %arg1 : vector<64xf8E5M2>, + %arg2 : vector<8xf32>, %arg3 : vector<8xf16>) { + // CHECK: rocdl.wmma.f32.16x16x128.fp8_fp8 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x128 %arg0 * %arg0 + %arg2 : vector<64xf8E4M3FN>, vector<64xf8E4M3FN>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x128.fp8_fp8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x128 %arg0 * %arg0 + %arg3 : vector<64xf8E4M3FN>, vector<64xf8E4M3FN>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x128.fp8_bf8 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x128 %arg0 * %arg1 + %arg2 : vector<64xf8E4M3FN>, vector<64xf8E5M2>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x128.fp8_bf8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x128 %arg0 * %arg1 + %arg3 : vector<64xf8E4M3FN>, vector<64xf8E5M2>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x128.bf8_bf8 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x128 %arg1 * %arg1 + %arg2 : vector<64xf8E5M2>, vector<64xf8E5M2>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x128.bf8_bf8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x128 %arg1 * %arg1 + %arg3 : vector<64xf8E5M2>, vector<64xf8E5M2>, vector<8xf16> + + // CHECK: rocdl.wmma.f32.16x16x128.bf8_fp8 {{.*}}, {{.*}}, %arg2 + amdgpu.wmma 16x16x128 %arg1 * %arg0 + %arg2 : vector<64xf8E5M2>, vector<64xf8E4M3FN>, vector<8xf32> + + // CHECK: rocdl.wmma.f16.16x16x128.bf8_fp8 {{.*}}, {{.*}}, %arg3, {{.*}} : (vector<16xi32>, vector<16xi32>, vector<8xf16>, i1) + amdgpu.wmma 16x16x128 %arg1 * %arg0 + %arg3 : vector<64xf8E5M2>, vector<64xf8E4M3FN>, vector<8xf16> + + func.return +} diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir index dec62f9..7a82236 100644 --- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir +++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir @@ -211,11 +211,25 @@ func.func @complex_exp(%arg: complex<f32>) -> complex<f32> { } // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32> // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32> -// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] : f32 +// CHECK-DAG: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 : f32 +// CHECK-DAG: %[[INF:.*]] = arith.constant 0x7F800000 : f32 // CHECK-DAG: %[[EXP_REAL:.*]] = math.exp %[[REAL]] : f32 -// CHECK-DAG: %[[RESULT_REAL:.]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] : f32 +// CHECK-DAG: %[[REAL_HALF:.*]] = arith.mulf 
%[[REAL]], %[[HALF]] : f32 +// CHECK-DAG: %[[EXP_HALF:.*]] = math.exp %[[REAL_HALF]] : f32 +// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] : f32 // CHECK-DAG: %[[SIN_IMAG:.*]] = math.sin %[[IMAG]] : f32 -// CHECK-DAG: %[[RESULT_IMAG:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] : f32 +// CHECK-DAG: %[[IS_INF:.*]] = arith.cmpf oeq, %[[EXP_REAL]], %[[INF]] : f32 +// CHECK-DAG: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK-DAG: %[[REAL_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] : f32 +// CHECK-DAG: %[[EXP_HALF_COS:.*]] = arith.mulf %[[EXP_HALF]], %[[COS_IMAG]] : f32 +// CHECK-DAG: %[[REAL_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_COS]], %[[EXP_HALF]] : f32 +// CHECK: %[[RESULT_REAL:.*]] = arith.select %[[IS_INF]], %[[REAL_OVERFLOW]], %[[REAL_NORMAL]] : f32 +// CHECK-DAG: %[[IMAG_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] : f32 +// CHECK-DAG: %[[EXP_HALF_SIN:.*]] = arith.mulf %[[EXP_HALF]], %[[SIN_IMAG]] : f32 +// CHECK-DAG: %[[IMAG_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_SIN]], %[[EXP_HALF]] : f32 +// CHECK-DAG: %[[IMAG_NONZERO:.*]] = arith.select %[[IS_INF]], %[[IMAG_OVERFLOW]], %[[IMAG_NORMAL]] : f32 +// CHECK: %[[RESULT_IMAG:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[ZERO]], %[[IMAG_NONZERO]] : f32 // CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : complex<f32> // CHECK: return %[[RESULT]] : complex<f32> @@ -832,11 +846,25 @@ func.func @complex_exp_with_fmf(%arg: complex<f32>) -> complex<f32> { } // CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32> // CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32> -// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 : f32 +// CHECK-DAG: %[[INF:.*]] = arith.constant 0x7F800000 : f32 // CHECK-DAG: %[[EXP_REAL:.*]] = math.exp %[[REAL]] fastmath<nnan,contract> : f32 -// CHECK-DAG: %[[RESULT_REAL:.]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[REAL_HALF:.*]] = arith.mulf %[[REAL]], %[[HALF]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[EXP_HALF:.*]] = math.exp %[[REAL_HALF]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[COS_IMAG:.*]] = math.cos %[[IMAG]] fastmath<nnan,contract> : f32 // CHECK-DAG: %[[SIN_IMAG:.*]] = math.sin %[[IMAG]] fastmath<nnan,contract> : f32 -// CHECK-DAG: %[[RESULT_IMAG:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[IS_INF:.*]] = arith.cmpf oeq, %[[EXP_REAL]], %[[INF]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[IS_IMAG_ZERO:.*]] = arith.cmpf oeq, %[[IMAG]], %[[ZERO]] : f32 +// CHECK-DAG: %[[REAL_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[COS_IMAG]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[EXP_HALF_COS:.*]] = arith.mulf %[[EXP_HALF]], %[[COS_IMAG]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[REAL_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_COS]], %[[EXP_HALF]] fastmath<nnan,contract> : f32 +// CHECK: %[[RESULT_REAL:.*]] = arith.select %[[IS_INF]], %[[REAL_OVERFLOW]], %[[REAL_NORMAL]] : f32 +// CHECK-DAG: %[[IMAG_NORMAL:.*]] = arith.mulf %[[EXP_REAL]], %[[SIN_IMAG]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[EXP_HALF_SIN:.*]] = arith.mulf %[[EXP_HALF]], %[[SIN_IMAG]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[IMAG_OVERFLOW:.*]] = arith.mulf %[[EXP_HALF_SIN]], %[[EXP_HALF]] fastmath<nnan,contract> : f32 +// CHECK-DAG: %[[IMAG_NONZERO:.*]] = arith.select %[[IS_INF]], 
%[[IMAG_OVERFLOW]], %[[IMAG_NORMAL]] : f32 +// CHECK: %[[RESULT_IMAG:.*]] = arith.select %[[IS_IMAG_ZERO]], %[[ZERO]], %[[IMAG_NONZERO]] : f32 // CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : complex<f32> // CHECK: return %[[RESULT]] : complex<f32> diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 5784764..4c6f62a 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -156,14 +156,6 @@ func.func @wmma_no_k_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector // ----- -func.func @wmma_wrong_m_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> { - // expected-error@+1 {{'amdgpu.wmma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16}}} - %0 = amdgpu.wmma 32x16x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32> - func.return %0 : vector<8xi32> -} - -// ----- - func.func @wmma_wrong_n_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> { // expected-error@+1 {{'amdgpu.wmma' op attribute 'n' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16}}} %0 = amdgpu.wmma 16x32x16 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32> @@ -173,14 +165,62 @@ func.func @wmma_wrong_n_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vec // ----- func.func @wmma_wrong_k_dim(%arg0 : vector<16xi8>, %arg1 : vector<8xi32>) -> vector<8xi32> { - // expected-error@+1 {{'amdgpu.wmma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}} + // expected-error@+1 {{'amdgpu.wmma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {4, 16, 32, 64, 128}}} %0 = amdgpu.wmma 16x16x24 %arg0 * %arg0 + %arg1 : vector<16xi8>, vector<16xi8>, vector<8xi32> func.return %0 : vector<8xi32> } // ----- -// Missinng `resetOffset` +func.func @wmma_source_length_mismatch(%arg0 : vector<8xf16>, %arg1 : vector<16xf16>, %arg2 : vector<8xf32>) -> vector<8xf32> { + // expected-error@+1 {{'amdgpu.wmma' op source vectors have different lengths}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg1 + %arg2 : vector<8xf16>, vector<16xf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +func.func @wmma_mismatched_float_types(%arg0 : vector<8xf16>, %arg1 : vector<8xbf16>, %arg2 : vector<8xf32>) -> vector<8xf32> { + // expected-error@+1 {{'amdgpu.wmma' op source element types must match (except for fp8/bf8)}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg1 + %arg2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +func.func @wmma_mismatched_int_types(%arg0 : vector<8xi8>, %arg1 : vector<8xi4>, %arg2 : vector<8xi32>) -> vector<8xi32> { + // expected-error@+1 {{'amdgpu.wmma' op source element types must match (except for fp8/bf8)}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg1 + %arg2 : vector<8xi8>, vector<8xi4>, vector<8xi32> + func.return %0 : vector<8xi32> +} + +// ----- + +func.func @wmma_clamp_float(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // expected-error@+1 {{'amdgpu.wmma' op clamp flag is not supported for float types}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {clamp} : vector<8xf16>, vector<8xf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +func.func @wmma_unsignedA_float(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // 
expected-error@+1 {{'amdgpu.wmma' op unsigned flags are not supported for float types}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {unsignedA} : vector<8xf16>, vector<8xf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +func.func @wmma_unsignedB_float(%arg0 : vector<8xf16>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // expected-error@+1 {{'amdgpu.wmma' op unsigned flags are not supported for float types}} + %0 = amdgpu.wmma 16x16x16 %arg0 * %arg0 + %arg1 {unsignedB} : vector<8xf16>, vector<8xf16>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// ----- + +// Missing `resetOffset` func.func @fat_raw_buffer_cast_stripped_offset(%m: memref<8xi32, strided<[1], offset: ?>, #gpu.address_space<global>>) -> memref<8xi32, #amdgpu.address_space<fat_raw_buffer>> { // expected-error@+1 {{'amdgpu.fat_raw_buffer_cast' op expected result type to be 'memref<8xi32, strided<[1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>' but got 'memref<8xi32, #amdgpu.address_space<fat_raw_buffer>>'}} %ret = amdgpu.fat_raw_buffer_cast %m : memref<8xi32, strided<[1], offset: ?>, #gpu.address_space<global>> to memref<8xi32, #amdgpu.address_space<fat_raw_buffer>> diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index a330967..09134cb 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -586,6 +586,41 @@ func.func @wmma_i32_16x16x32_i4(%arg0 : vector<16xi4>, %arg1 : vector<8xi32>) -> func.return %0 : vector<8xi32> } +// CHECK-LABEL: func @wmma_f32_16x16x4_f32 +func.func @wmma_f32_16x16x4_f32(%arg0 : vector<2xf32>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // CHECK: amdgpu.wmma 16x16x4 + %0 = amdgpu.wmma 16x16x4 %arg0 * %arg0 + %arg1 : vector<2xf32>, vector<2xf32>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// CHECK-LABEL: func @wmma_f32_16x16x64_f8 +func.func @wmma_f32_16x16x64_f8(%arg0 : vector<32xf8E4M3FN>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // CHECK: amdgpu.wmma 16x16x64 + %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// CHECK-LABEL: func @wmma_f32_16x16x64_bf8 +func.func @wmma_f32_16x16x64_bf8(%arg0 : vector<32xf8E5M2>, %arg1 : vector<8xf32>) -> vector<8xf32> { + // CHECK: amdgpu.wmma 16x16x64 + %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf32> + func.return %0 : vector<8xf32> +} + +// CHECK-LABEL: func @wmma_f16_16x16x64_bf8 +func.func @wmma_f16_16x16x64_bf8(%arg0 : vector<32xf8E5M2>, %arg1 : vector<8xf16>) -> vector<8xf16> { + // CHECK: amdgpu.wmma 16x16x64 + %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<8xf16> + func.return %0 : vector<8xf16> +} + +// CHECK-LABEL: func @wmma_f16_16x16x64_f8 +func.func @wmma_f16_16x16x64_f8(%arg0 : vector<32xf8E4M3FN>, %arg1 : vector<8xf16>) -> vector<8xf16> { + // CHECK: amdgpu.wmma 16x16x64 + %0 = amdgpu.wmma 16x16x64 %arg0 * %arg0 + %arg1 : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf16> + func.return %0 : vector<8xf16> +} + // CHECK-LABEL: func @swizzle_bitmode func.func @swizzle_bitmode(%arg0 : f32) -> f32 { // CHECK: amdgpu.swizzle_bitmode diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir index 40a57b9..e8bb0c0 100644 --- 
a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-memoryeffect-interface.mlir @@ -156,3 +156,24 @@ func.func @manual_deallocation(%c: i1, %f: f32, %idx: index) -> f32 { // CHECK: cf.assert %[[true]], "expected that the block does not have ownership" // CHECK: memref.dealloc %[[manual_alloc]] // CHECK: bufferization.dealloc (%[[managed_alloc]] : memref<5xf32>) if (%[[true]]) + +// ----- + +// CHECK-LABEL: func.func private @properly_creates_deallocations_in_execute_region( +// CHECK: %[[true:.*]] = arith.constant true +// CHECK: scf.execute_region no_inline { +// CHECK: %[[alloc:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x63x378x16xui8> +// CHECK: bufferization.dealloc (%[[alloc]] : memref<1x63x378x16xui8>) if (%[[true]]) + +func.func private @properly_creates_deallocations_in_execute_region(%arg1: memref<1x16x252x380xui8> ) -> (memref<1x250x378x16xui8> ) { + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x250x378x16xui8> + scf.execute_region no_inline { + %subview = memref.subview %arg1[0, 0, 0, 0] [1, 16, 65, 380] [1, 1, 1, 1] : memref<1x16x252x380xui8> to memref<1x16x65x380xui8, strided<[1532160, 95760, 380, 1]>> + %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x63x378x16xui8> + test.buffer_based in(%subview: memref<1x16x65x380xui8, strided<[1532160, 95760, 380, 1]>>) out(%alloc_3: memref<1x63x378x16xui8>) + %subview_7 = memref.subview %alloc[0, 0, 0, 0] [1, 63, 378, 16] [1, 1, 1, 1] : memref<1x250x378x16xui8> to memref<1x63x378x16xui8, strided<[1512000, 6048, 16, 1]>> + test.copy(%alloc_3, %subview_7) : (memref<1x63x378x16xui8>, memref<1x63x378x16xui8, strided<[1512000, 6048, 16, 1]>>) + scf.yield + } + return %alloc : memref<1x250x378x16xui8> +} diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir index d5f834b..8db1ebb 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir @@ -381,15 +381,19 @@ func.func private @execute_region_test(%t1 : tensor<?xf32>) // ----- // CHECK-LABEL: func @no_inline_execute_region_not_canonicalized -func.func @no_inline_execute_region_not_canonicalized() { - %c = arith.constant 42 : i32 - // CHECK: scf.execute_region - // CHECK-SAME: no_inline - %v = scf.execute_region -> i32 no_inline { - scf.yield %c : i32 +module { + func.func private @foo()->() + func.func @no_inline_execute_region_not_canonicalized() { + %c = arith.constant 42 : i32 + // CHECK: scf.execute_region + // CHECK-SAME: no_inline + %v = scf.execute_region -> i32 no_inline { + func.call @foo():()->() + scf.yield %c : i32 + } + // CHECK: return + return } - // CHECK: return - return } // ----- diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index d270ee8..e703600 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -664,6 +664,36 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +// CHECK-LABEL @rocdl.tensor.load.to.lds +llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, + %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { + // CHECK: rocdl.tensor.load.to.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, 
vector<8xi32> + rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL @rocdl.tensor.store.from.lds +llvm.func @rocdl.tensor.store.from.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, + %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { + // CHECK: rocdl.tensor.store.from.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32> + rocdl.tensor.store.from.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL @rocdl.tensor.load.to.lds.d2 +llvm.func @rocdl.tensor.load.to.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) { + // CHECK: rocdl.tensor.load.to.lds.d2 %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32> + rocdl.tensor.load.to.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL @rocdl.tensor.store.from.lds.d2 +llvm.func @rocdl.tensor.store.from.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) { + // CHECK: rocdl.tensor.store.from.lds.d2 %{{.*}}, %{{.*}} cachepolicy 0 : vector<4xi32>, vector<8xi32> + rocdl.tensor.store.from.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr, %stride : i16, %numRecords : i64, diff --git a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir index f9b81df..d0aec68 100644 --- a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir +++ b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir @@ -77,6 +77,24 @@ func.func @memref_expand(%m: memref<?xf32>, %sz: index) -> (index, index) { // ----- +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2)> +// CHECK-LABEL: func @memref_collapse( +// CHECK-SAME: %[[sz0:.*]]: index +// CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[c12:.*]] = arith.constant 12 : index +// CHECK: %[[dim:.*]] = memref.dim %{{.*}}, %[[c2]] : memref<3x4x?x2xf32> +// CHECK: %[[mul:.*]] = affine.apply #[[$MAP]]()[%[[dim]]] +// CHECK: return %[[c12]], %[[mul]] +func.func @memref_collapse(%sz0: index) -> (index, index) { + %0 = memref.alloc(%sz0) : memref<3x4x?x2xf32> + %1 = memref.collapse_shape %0 [[0, 1], [2, 3]] : memref<3x4x?x2xf32> into memref<12x?xf32> + %2 = "test.reify_bound"(%1) {dim = 0} : (memref<12x?xf32>) -> (index) + %3 = "test.reify_bound"(%1) {dim = 1} : (memref<12x?xf32>) -> (index) + return %2, %3 : index, index +} + +// ----- + // CHECK-LABEL: func @memref_get_global( // CHECK: %[[c4:.*]] = arith.constant 4 : index // CHECK: return %[[c4]] diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 77d18da..042ee25 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -2243,3 +2243,76 @@ func.func @test_firstprivate_map(%arg0: memref<10xf32>) { // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: return + +// ----- + +func.func @test_kernel_environment(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + + // Create data clause operands for the kernel environment + %copyin = acc.copyin varPtr(%arg0 : memref<1024xf32>) -> memref<1024xf32> + %create = acc.create varPtr(%arg1 : memref<1024xf32>) -> memref<1024xf32> + + // Kernel environment wraps gpu.launch and captures data mapping + acc.kernel_environment 
dataOperands(%copyin, %create : memref<1024xf32>, memref<1024xf32>) { + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c1024, %block_y = %c1, %block_z = %c1) { + // Kernel body uses the mapped data + %val = memref.load %copyin[%tx] : memref<1024xf32> + %result = arith.mulf %val, %val : f32 + memref.store %result, %create[%tx] : memref<1024xf32> + gpu.terminator + } + } + + // Copy results back to host and deallocate device memory + acc.copyout accPtr(%create : memref<1024xf32>) to varPtr(%arg1 : memref<1024xf32>) + acc.delete accPtr(%copyin : memref<1024xf32>) + + return +} + +// CHECK-LABEL: func @test_kernel_environment +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%{{.*}} : memref<1024xf32>) -> memref<1024xf32> +// CHECK: %[[CREATE:.*]] = acc.create varPtr(%{{.*}} : memref<1024xf32>) -> memref<1024xf32> +// CHECK: acc.kernel_environment dataOperands(%[[COPYIN]], %[[CREATE]] : memref<1024xf32>, memref<1024xf32>) { +// CHECK: gpu.launch +// CHECK: memref.load %[[COPYIN]] +// CHECK: memref.store %{{.*}}, %[[CREATE]] +// CHECK: } +// CHECK: } +// CHECK: acc.copyout accPtr(%[[CREATE]] : memref<1024xf32>) to varPtr(%{{.*}} : memref<1024xf32>) +// CHECK: acc.delete accPtr(%[[COPYIN]] : memref<1024xf32>) + +// ----- + +func.func @test_kernel_environment_with_async(%arg0: memref<1024xf32>) { + %c1 = arith.constant 1 : index + %c1024 = arith.constant 1024 : index + %async_val = arith.constant 1 : i32 + + %create = acc.create varPtr(%arg0 : memref<1024xf32>) async(%async_val : i32) -> memref<1024xf32> + + // Kernel environment with async clause + acc.kernel_environment dataOperands(%create : memref<1024xf32>) async(%async_val : i32) { + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c1024, %block_y = %c1, %block_z = %c1) { + %f0 = arith.constant 0.0 : f32 + memref.store %f0, %create[%tx] : memref<1024xf32> + gpu.terminator + } + } + + acc.copyout accPtr(%create : memref<1024xf32>) async(%async_val : i32) to varPtr(%arg0 : memref<1024xf32>) + + return +} + +// CHECK-LABEL: func @test_kernel_environment_with_async +// CHECK: %[[ASYNC:.*]] = arith.constant 1 : i32 +// CHECK: %[[CREATE:.*]] = acc.create varPtr(%{{.*}} : memref<1024xf32>) async(%[[ASYNC]] : i32) -> memref<1024xf32> +// CHECK: acc.kernel_environment dataOperands(%[[CREATE]] : memref<1024xf32>) async(%[[ASYNC]] : i32) +// CHECK: gpu.launch +// CHECK: memref.store %{{.*}}, %[[CREATE]] +// CHECK: acc.copyout accPtr(%[[CREATE]] : memref<1024xf32>) async(%[[ASYNC]] : i32) to varPtr(%{{.*}} : memref<1024xf32>) diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir index 37fc86b..3f481ad 100644 --- a/mlir/test/Dialect/SCF/invalid.mlir +++ b/mlir/test/Dialect/SCF/invalid.mlir @@ -373,7 +373,7 @@ func.func @reduceReturn_not_inside_reduce(%arg0 : f32) { func.func @std_if_incorrect_yield(%arg0: i1, %arg1: f32) { - // expected-error@+1 {{region control flow edge from Region #0 to parent results: source has 1 operands, but target successor needs 2}} + // expected-error@+1 {{region control flow edge from Operation scf.yield to parent results: source has 1 operands, but target successor <to parent> needs 2}} %x, %y = scf.if %arg0 -> (f32, f32) { %0 = arith.addf %arg1, %arg1 : f32 scf.yield %0 : f32 @@ -544,7 +544,7 @@ func.func @while_invalid_terminator() { func.func @while_cross_region_type_mismatch() { %true = arith.constant true - // expected-error@+1 {{'scf.while' op 
region control flow edge from Region #0 to Region #1: source has 0 operands, but target successor needs 1}} + // expected-error@+1 {{region control flow edge from Operation scf.condition to Region #1: source has 0 operands, but target successor <to region #1 with 1 inputs> needs 1}} scf.while : () -> () { scf.condition(%true) } do { @@ -557,7 +557,7 @@ func.func @while_cross_region_type_mismatch() { func.func @while_cross_region_type_mismatch() { %true = arith.constant true - // expected-error@+1 {{'scf.while' op along control flow edge from Region #0 to Region #1: source type #0 'i1' should match input type #0 'i32'}} + // expected-error@+1 {{along control flow edge from Operation scf.condition to Region #1: source type #0 'i1' should match input type #0 'i32'}} %0 = scf.while : () -> (i1) { scf.condition(%true) %true : i1 } do { @@ -570,7 +570,7 @@ func.func @while_cross_region_type_mismatch() { func.func @while_result_type_mismatch() { %true = arith.constant true - // expected-error@+1 {{'scf.while' op region control flow edge from Region #0 to parent results: source has 1 operands, but target successor needs 0}} + // expected-error@+1 {{region control flow edge from Operation scf.condition to parent results: source has 1 operands, but target successor <to parent> needs 0}} scf.while : () -> () { scf.condition(%true) %true : i1 } do { diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir index 1bcef0a..ea587e9 100644 --- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir +++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir @@ -49,6 +49,11 @@ func.func @conj(%arg: complex<f32>) -> complex<f32> { func.return %conj : complex<f32> } +func.func @exp(%arg: complex<f32>) -> complex<f32> { + %exp = complex.exp %arg : complex<f32> + func.return %exp : complex<f32> +} + // %input contains pairs of lhs, rhs, i.e. [lhs_0, rhs_0, lhs_1, rhs_1,...] func.func @test_binary(%input: tensor<?xcomplex<f32>>, %func: (complex<f32>, complex<f32>) -> complex<f32>) { @@ -353,5 +358,32 @@ func.func @entry() { call @test_element_f64(%abs_test_cast, %abs_func) : (tensor<?xcomplex<f64>>, (complex<f64>) -> f64) -> () + // complex.exp test + %exp_test = arith.constant dense<[ + (1.0, 2.0), + // CHECK: -1.1312 + // CHECK-NEXT: 2.4717 + + // The first case to consider is overflow of exp(real_part). If computed + // directly, this yields inf * 0 = NaN, which is incorrect. + (500.0, 0.0), + // CHECK-NEXT: inf + // CHECK-NOT: nan + // CHECK-NEXT: 0 + + // In this case, the overflow of exp(real_part) is compensated when + // sin(imag_part) is close to zero, yielding a finite imaginary part. 
+ (90.0238094, 5.900613e-39) + // CHECK-NEXT: inf + // CHECK-NOT: inf + // CHECK-NEXT: 7.3746 + ]> : tensor<3xcomplex<f32>> + %exp_test_cast = tensor.cast %exp_test + : tensor<3xcomplex<f32>> to tensor<?xcomplex<f32>> + + %exp_func = func.constant @exp : (complex<f32>) -> complex<f32> + call @test_unary(%exp_test_cast, %exp_func) + : (tensor<?xcomplex<f32>>, (complex<f32>) -> complex<f32>) -> () + func.return } diff --git a/mlir/test/Target/LLVMIR/ptr.mlir b/mlir/test/Target/LLVMIR/ptr.mlir index 473ac05..94b6628 100644 --- a/mlir/test/Target/LLVMIR/ptr.mlir +++ b/mlir/test/Target/LLVMIR/ptr.mlir @@ -284,8 +284,8 @@ llvm.func @ptr_add_cst() -> !ptr.ptr<#llvm.address_space<0>> { // CHECK-LABEL: define i64 @ptr_diff_scalar // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret i64 %[[DIFF]] // CHECK-NEXT: } @@ -296,8 +296,8 @@ llvm.func @ptr_diff_scalar(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: !ptr. // CHECK-LABEL: define i32 @ptr_diff_scalar_i32 // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: %[[TRUNC:.*]] = trunc i64 %[[DIFF]] to i32 // CHECK-NEXT: ret i32 %[[TRUNC]] @@ -309,8 +309,8 @@ llvm.func @ptr_diff_scalar_i32(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: ! 
// CHECK-LABEL: define <4 x i64> @ptr_diff_vector // CHECK-SAME: (<4 x ptr> %[[PTRS1:.*]], <4 x ptr> %[[PTRS2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint <4 x ptr> %[[PTRS1]] to <4 x i64> -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint <4 x ptr> %[[PTRS2]] to <4 x i64> +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr <4 x ptr> %[[PTRS1]] to <4 x i64> +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr <4 x ptr> %[[PTRS2]] to <4 x i64> // CHECK-NEXT: %[[DIFF:.*]] = sub <4 x i64> %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret <4 x i64> %[[DIFF]] // CHECK-NEXT: } @@ -321,8 +321,8 @@ llvm.func @ptr_diff_vector(%ptrs1: vector<4x!ptr.ptr<#llvm.address_space<0>>>, % // CHECK-LABEL: define <8 x i32> @ptr_diff_vector_i32 // CHECK-SAME: (<8 x ptr> %[[PTRS1:.*]], <8 x ptr> %[[PTRS2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint <8 x ptr> %[[PTRS1]] to <8 x i64> -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint <8 x ptr> %[[PTRS2]] to <8 x i64> +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr <8 x ptr> %[[PTRS1]] to <8 x i64> +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr <8 x ptr> %[[PTRS2]] to <8 x i64> // CHECK-NEXT: %[[DIFF:.*]] = sub <8 x i64> %[[P1INT]], %[[P2INT]] // CHECK-NEXT: %[[TRUNC:.*]] = trunc <8 x i64> %[[DIFF]] to <8 x i32> // CHECK-NEXT: ret <8 x i32> %[[TRUNC]] @@ -344,8 +344,8 @@ llvm.func @ptr_diff_with_constants() -> i64 { // CHECK-LABEL: define i64 @ptr_diff_with_flags_nsw // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub nsw i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret i64 %[[DIFF]] // CHECK-NEXT: } @@ -356,8 +356,8 @@ llvm.func @ptr_diff_with_flags_nsw(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr // CHECK-LABEL: define i64 @ptr_diff_with_flags_nuw // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub nuw i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret i64 %[[DIFF]] // CHECK-NEXT: } @@ -368,8 +368,8 @@ llvm.func @ptr_diff_with_flags_nuw(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr // CHECK-LABEL: define i64 @ptr_diff_with_flags_nsw_nuw // CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { -// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 -// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoaddr ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoaddr ptr %[[PTR2]] to i64 // CHECK-NEXT: %[[DIFF:.*]] = sub nuw nsw i64 %[[P1INT]], %[[P2INT]] // CHECK-NEXT: ret i64 %[[DIFF]] // CHECK-NEXT: } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 30126f6..8a848221 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1040,6 +1040,36 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +// CHECK-LABEL: rocdl.tensor.load.to.lds +llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, + %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { + // CHECK: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
%{{.*}}, i32 0) + rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL: rocdl.tensor.store.from.lds +llvm.func @rocdl.tensor.store.from.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, + %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { + // CHECK: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 0) + rocdl.tensor.store.from.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL: rocdl.tensor.load.to.lds.d2 +llvm.func @rocdl.tensor.load.to.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) { + // CHECK: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i32 0) + rocdl.tensor.load.to.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + +// CHECK-LABEL: rocdl.tensor.store.from.lds.d2 +llvm.func @rocdl.tensor.store.from.lds.d2(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>) { + // CHECK: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i32 0) + rocdl.tensor.store.from.lds.d2 %dgroup0, %dgroup1 cachepolicy 0 : vector<4xi32>, vector<8xi32> + llvm.return +} + llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr, %stride : i16, %numRecords : i64, diff --git a/mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir b/mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir new file mode 100644 index 0000000..62d15de --- /dev/null +++ b/mlir/test/Target/SPIRV/decorations-intel-cache-controls.mlir @@ -0,0 +1,42 @@ +// RUN: mlir-translate --no-implicit-module --split-input-file --test-spirv-roundtrip --verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> { + spirv.func @cache_controls() "None" { + // CHECK: spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>, #spirv.cache_control_load_intel<cache_level = 1, load_cache_control = Cached>, #spirv.cache_control_load_intel<cache_level = 2, load_cache_control = InvalidateAfterR>]} : !spirv.ptr<f32, Function> + %0 = spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>, #spirv.cache_control_load_intel<cache_level = 1, load_cache_control = Cached>, #spirv.cache_control_load_intel<cache_level = 2, load_cache_control = InvalidateAfterR>]} : !spirv.ptr<f32, Function> + // CHECK: spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, #spirv.cache_control_store_intel<cache_level = 1, store_cache_control = WriteThrough>, #spirv.cache_control_store_intel<cache_level = 2, store_cache_control = WriteBack>]} : !spirv.ptr<f32, Function> + %1 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, #spirv.cache_control_store_intel<cache_level = 1, store_cache_control = WriteThrough>, #spirv.cache_control_store_intel<cache_level = 2, store_cache_control = WriteBack>]} : !spirv.ptr<f32, Function> + spirv.Return + } +} + +// ----- + +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], 
[SPV_INTEL_cache_controls]> { + spirv.func @cache_controls_invalid_type() "None" { + // expected-error@below {{expecting array attribute of CacheControlLoadINTEL for CacheControlLoadINTEL}} + %0 = spirv.Variable {cache_control_load_intel = #spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>} : !spirv.ptr<f32, Function> + spirv.Return + } +} + +// ----- + +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> { + spirv.func @cache_controls_invalid_type() "None" { + // expected-error@below {{expecting array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}} + %0 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, 0 : i32]} : !spirv.ptr<f32, Function> + spirv.Return + } +} + +// ----- + +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> { + spirv.func @cache_controls_invalid_type() "None" { + // expected-error@below {{expecting non-empty array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}} + %0 = spirv.Variable {cache_control_store_intel = []} : !spirv.ptr<f32, Function> + spirv.Return + } +} diff --git a/mlir/test/Target/SPIRV/decorations.mlir b/mlir/test/Target/SPIRV/decorations.mlir index 90ba690e..712fd17 100644 --- a/mlir/test/Target/SPIRV/decorations.mlir +++ b/mlir/test/Target/SPIRV/decorations.mlir @@ -1,27 +1,32 @@ -// RUN: mlir-translate -no-implicit-module -split-input-file -test-spirv-roundtrip -verify-diagnostics %s | FileCheck %s +// RUN: mlir-translate --no-implicit-module --split-input-file --test-spirv-roundtrip %s | FileCheck %s -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { +// RUN: %if spirv-tools %{ rm -rf %t %} +// RUN: %if spirv-tools %{ mkdir %t %} +// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %} +// RUN: %if spirv-tools %{ spirv-val %t %} + +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> { // CHECK: location = 0 : i32 spirv.GlobalVariable @var {location = 0 : i32} : !spirv.ptr<vector<4xf32>, Input> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> { // CHECK: no_perspective spirv.GlobalVariable @var {no_perspective} : !spirv.ptr<vector<4xf32>, Input> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> { // CHECK: flat spirv.GlobalVariable @var {flat} : !spirv.ptr<si32, Input> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], [SPV_KHR_variable_pointers]> { // CHECK: aliased // CHECK: aliased spirv.GlobalVariable @var1 bind(0, 0) {aliased} : !spirv.ptr<!spirv.struct<(!spirv.array<4xf32, stride=4>[0])>, StorageBuffer> @@ -30,28 +35,28 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], [SPV_KHR_variable_pointers]> { // CHECK: non_readable spirv.GlobalVariable @var bind(0, 0) {non_readable} : !spirv.ptr<!spirv.struct<(!spirv.array<4xf32, stride=4>[0])>, StorageBuffer> } 
// ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], [SPV_KHR_variable_pointers]> { // CHECK: non_writable spirv.GlobalVariable @var bind(0, 0) {non_writable} : !spirv.ptr<!spirv.struct<(!spirv.array<4xf32, stride=4>[0])>, StorageBuffer> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], [SPV_KHR_variable_pointers]> { // CHECK: restrict spirv.GlobalVariable @var bind(0, 0) {restrict} : !spirv.ptr<!spirv.struct<(!spirv.array<4xf32, stride=4>[0])>, StorageBuffer> } // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> { // CHECK: relaxed_precision spirv.GlobalVariable @var {location = 0 : i32, relaxed_precision} : !spirv.ptr<vector<4xf32>, Output> } @@ -84,7 +89,7 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> { // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Kernel], []> { +spirv.module Logical OpenCL requires #spirv.vce<v1.0, [Kernel, Linkage], [SPV_KHR_no_integer_wrap_decoration]> { spirv.func @iadd_decorations(%arg: i32) -> i32 "None" { // CHECK: spirv.IAdd %{{.*}}, %{{.*}} {no_signed_wrap, no_unsigned_wrap} %0 = spirv.IAdd %arg, %arg {no_signed_wrap, no_unsigned_wrap} : i32 @@ -94,7 +99,7 @@ spirv.func @iadd_decorations(%arg: i32) -> i32 "None" { // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Kernel], []> { +spirv.module Logical OpenCL requires #spirv.vce<v1.0, [Kernel, Linkage], []> { spirv.func @fadd_decorations(%arg: f32) -> f32 "None" { // CHECK: spirv.FAdd %{{.*}}, %{{.*}} {fp_fast_math_mode = #spirv.fastmath_mode<NotNaN|NotInf|NSZ>} %0 = spirv.FAdd %arg, %arg {fp_fast_math_mode = #spirv.fastmath_mode<NotNaN|NotInf|NSZ>} : f32 @@ -104,7 +109,7 @@ spirv.func @fadd_decorations(%arg: f32) -> f32 "None" { // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Kernel], []> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> { spirv.func @fmul_decorations(%arg: f32) -> f32 "None" { // CHECK: spirv.FMul %{{.*}}, %{{.*}} {no_contraction} %0 = spirv.FMul %arg, %arg {no_contraction} : f32 @@ -114,7 +119,7 @@ spirv.func @fmul_decorations(%arg: f32) -> f32 "None" { // ----- -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Kernel, Float16], []> { +spirv.module Logical OpenCL requires #spirv.vce<v1.0, [Kernel, Linkage, Float16], []> { spirv.func @fp_rounding_mode(%arg: f32) -> f16 "None" { // CHECK: spirv.FConvert %arg0 {fp_rounding_mode = #spirv.fp_rounding_mode<RTN>} : f32 to f16 %0 = spirv.FConvert %arg {fp_rounding_mode = #spirv.fp_rounding_mode<RTN>} : f32 to f16 @@ -124,51 +129,7 @@ spirv.func @fp_rounding_mode(%arg: f32) -> f16 "None" { // ----- -// CHECK-LABEL: spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> { - -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> { - spirv.func @cache_controls() "None" { - // CHECK: spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>, #spirv.cache_control_load_intel<cache_level = 1, load_cache_control = Cached>, #spirv.cache_control_load_intel<cache_level = 2, load_cache_control = InvalidateAfterR>]} : !spirv.ptr<f32, Function> - %0 = 
spirv.Variable {cache_control_load_intel = [#spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>, #spirv.cache_control_load_intel<cache_level = 1, load_cache_control = Cached>, #spirv.cache_control_load_intel<cache_level = 2, load_cache_control = InvalidateAfterR>]} : !spirv.ptr<f32, Function> - // CHECK: spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, #spirv.cache_control_store_intel<cache_level = 1, store_cache_control = WriteThrough>, #spirv.cache_control_store_intel<cache_level = 2, store_cache_control = WriteBack>]} : !spirv.ptr<f32, Function> - %1 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, #spirv.cache_control_store_intel<cache_level = 1, store_cache_control = WriteThrough>, #spirv.cache_control_store_intel<cache_level = 2, store_cache_control = WriteBack>]} : !spirv.ptr<f32, Function> - spirv.Return - } -} - -// ----- - -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> { - spirv.func @cache_controls_invalid_type() "None" { - // expected-error@below {{expecting array attribute of CacheControlLoadINTEL for CacheControlLoadINTEL}} - %0 = spirv.Variable {cache_control_load_intel = #spirv.cache_control_load_intel<cache_level = 0, load_cache_control = Uncached>} : !spirv.ptr<f32, Function> - spirv.Return - } -} - -// ----- - -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> { - spirv.func @cache_controls_invalid_type() "None" { - // expected-error@below {{expecting array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}} - %0 = spirv.Variable {cache_control_store_intel = [#spirv.cache_control_store_intel<cache_level = 0, store_cache_control = Uncached>, 0 : i32]} : !spirv.ptr<f32, Function> - spirv.Return - } -} - -// ----- - -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [CacheControlsINTEL], [SPV_INTEL_cache_controls]> { - spirv.func @cache_controls_invalid_type() "None" { - // expected-error@below {{expecting non-empty array attribute of CacheControlStoreINTEL for CacheControlStoreINTEL}} - %0 = spirv.Variable {cache_control_store_intel = []} : !spirv.ptr<f32, Function> - spirv.Return - } -} - -// ----- - -spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], []> { // CHECK: spirv.func @relaxed_precision_arg({{%.*}}: !spirv.ptr<f32, Function> {spirv.decoration = #spirv.decoration<RelaxedPrecision>}) "None" attributes {relaxed_precision} { spirv.func @relaxed_precision_arg(%arg0: !spirv.ptr<f32, Function> {spirv.decoration = #spirv.decoration<RelaxedPrecision>}) -> () "None" attributes {relaxed_precision} { spirv.Return diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp index eb0d980..7a7a583 100644 --- a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp +++ b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp @@ -66,7 +66,7 @@ public: void visitRegionBranchControlFlowTransfer(RegionBranchOpInterface branch, RegionBranchPoint regionFrom, - RegionBranchPoint regionTo, + RegionSuccessor regionTo, const NextAccess &after, NextAccess *before) override; @@ -240,7 +240,7 @@ void NextAccessAnalysis::visitCallControlFlowTransfer( void 
NextAccessAnalysis::visitRegionBranchControlFlowTransfer( RegionBranchOpInterface branch, RegionBranchPoint regionFrom, - RegionBranchPoint regionTo, const NextAccess &after, NextAccess *before) { + RegionSuccessor regionTo, const NextAccess &after, NextAccess *before) { LDBG() << "visitRegionBranchControlFlowTransfer: " << OpWithFlags(branch.getOperation(), OpPrintingFlags().skipRegions()); LDBG() << " regionFrom: " << (regionFrom.isParent() ? "parent" : "region"); diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp index b211e24..4d4ec02 100644 --- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp +++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp @@ -633,8 +633,9 @@ ParseResult RegionIfOp::parse(OpAsmParser &parser, OperationState &result) { parser.getCurrentLocation(), result.operands); } -OperandRange RegionIfOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(llvm::is_contained({&getThenRegion(), &getElseRegion()}, point) && +OperandRange RegionIfOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(llvm::is_contained({&getThenRegion(), &getElseRegion()}, + successor.getSuccessor()) && "invalid region index"); return getOperands(); } @@ -643,10 +644,11 @@ void RegionIfOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) { // We always branch to the join region. if (!point.isParent()) { - if (point != getJoinRegion()) + if (point.getTerminatorPredecessorOrNull()->getParentRegion() != + &getJoinRegion()) regions.push_back(RegionSuccessor(&getJoinRegion(), getJoinArgs())); else - regions.push_back(RegionSuccessor(getResults())); + regions.push_back(RegionSuccessor(getOperation(), getResults())); return; } @@ -673,7 +675,7 @@ void AnyCondOp::getSuccessorRegions(RegionBranchPoint point, if (point.isParent()) regions.emplace_back(&getRegion()); else - regions.emplace_back(getResults()); + regions.emplace_back(getOperation(), getResults()); } void AnyCondOp::getRegionInvocationBounds( @@ -1107,11 +1109,11 @@ void LoopBlockOp::getSuccessorRegions( if (point.isParent()) return; - regions.emplace_back((*this)->getResults()); + regions.emplace_back(getOperation(), getOperation()->getResults()); } -OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert(point == getBody()); +OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionSuccessor successor) { + assert(successor.getSuccessor() == &getBody()); return MutableOperandRange(getInitMutable()); } @@ -1120,8 +1122,8 @@ OperandRange LoopBlockOp::getEntrySuccessorOperands(RegionBranchPoint point) { //===----------------------------------------------------------------------===// MutableOperandRange -LoopBlockTerminatorOp::getMutableSuccessorOperands(RegionBranchPoint point) { - if (point.isParent()) +LoopBlockTerminatorOp::getMutableSuccessorOperands(RegionSuccessor successor) { + if (successor.isParent()) return getExitArgMutable(); return getNextIterArgMutable(); } @@ -1213,7 +1215,7 @@ void TestStoreWithARegion::getSuccessorRegions( if (point.isParent()) regions.emplace_back(&getBody(), getBody().front().getArguments()); else - regions.emplace_back(); + regions.emplace_back(getOperation(), getOperation()->getResults()); } //===----------------------------------------------------------------------===// @@ -1227,7 +1229,7 @@ void TestStoreWithALoopRegion::getSuccessorRegions( // enter the body. 
regions.emplace_back( RegionSuccessor(&getBody(), getBody().front().getArguments())); - regions.emplace_back(); + regions.emplace_back(getOperation(), getOperation()->getResults()); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 05a33cf..a3430ba 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -2581,7 +2581,7 @@ def LoopBlockTerminatorOp : TEST_Op<"loop_block_term", def TestNoTerminatorOp : TEST_Op<"switch_with_no_break", [ NoTerminator, - DeclareOpInterfaceMethods<RegionBranchOpInterface, ["getSuccessorRegions"]> + DeclareOpInterfaceMethods<RegionBranchOpInterface> ]> { let arguments = (ins Index:$arg, DenseI64ArrayAttr:$cases); let regions = (region VariadicRegion<SizedRegion<1>>:$caseRegions); diff --git a/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp b/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp index f1aae15..2e6950f 100644 --- a/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp +++ b/mlir/unittests/Interfaces/ControlFlowInterfacesTest.cpp @@ -13,17 +13,24 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Parser/Parser.h" +#include "llvm/Support/DebugLog.h" #include <gtest/gtest.h> using namespace mlir; /// A dummy op that is also a terminator. -struct DummyOp : public Op<DummyOp, OpTrait::IsTerminator> { +struct DummyOp : public Op<DummyOp, OpTrait::IsTerminator, OpTrait::ZeroResults, + OpTrait::ZeroSuccessors, + RegionBranchTerminatorOpInterface::Trait> { using Op::Op; static ArrayRef<StringRef> getAttributeNames() { return {}; } static StringRef getOperationName() { return "cftest.dummy_op"; } + + MutableOperandRange getMutableSuccessorOperands(RegionSuccessor point) { + return MutableOperandRange(getOperation(), 0, 0); + } }; /// All regions of this op are mutually exclusive. @@ -39,6 +46,8 @@ struct MutuallyExclusiveRegionsOp // Regions have no successors. void getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {} + using RegionBranchOpInterface::Trait< + MutuallyExclusiveRegionsOp>::getSuccessorRegions; }; /// All regions of this op call each other in a large circle. @@ -53,13 +62,18 @@ struct LoopRegionsOp void getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) { - if (Region *region = point.getRegionOrNull()) { - if (point == (*this)->getRegion(1)) + if (point.getTerminatorPredecessorOrNull()) { + Region *region = + point.getTerminatorPredecessorOrNull()->getParentRegion(); + if (region == &(*this)->getRegion(1)) // This region also branches back to the parent. - regions.push_back(RegionSuccessor()); + regions.push_back( + RegionSuccessor(getOperation()->getParentOp(), + getOperation()->getParentOp()->getResults())); regions.push_back(RegionSuccessor(region)); } } + using RegionBranchOpInterface::Trait<LoopRegionsOp>::getSuccessorRegions; }; /// Each region branches back it itself or the parent. 
@@ -75,11 +89,17 @@ struct DoubleLoopRegionsOp void getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) { - if (Region *region = point.getRegionOrNull()) { - regions.push_back(RegionSuccessor()); + if (point.getTerminatorPredecessorOrNull()) { + Region *region = + point.getTerminatorPredecessorOrNull()->getParentRegion(); + regions.push_back( + RegionSuccessor(getOperation()->getParentOp(), + getOperation()->getParentOp()->getResults())); regions.push_back(RegionSuccessor(region)); } } + using RegionBranchOpInterface::Trait< + DoubleLoopRegionsOp>::getSuccessorRegions; }; /// Regions are executed sequentially. @@ -93,11 +113,15 @@ struct SequentialRegionsOp // Region 0 has Region 1 as a successor. void getSuccessorRegions(RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) { - if (point == (*this)->getRegion(0)) { + if (point.getTerminatorPredecessorOrNull() && + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &(*this)->getRegion(0)) { Operation *thisOp = this->getOperation(); regions.push_back(RegionSuccessor(&thisOp->getRegion(1))); } } + using RegionBranchOpInterface::Trait< + SequentialRegionsOp>::getSuccessorRegions; }; /// A dialect putting all the above together. diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index d528dae..e8561cc 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -12086,6 +12086,7 @@ cc_library( srcs = glob(["lib/Dialect/Transform/TuneExtension/*.cpp"]), hdrs = glob(["include/mlir/Dialect/Transform/TuneExtension/*.h"]), deps = [ + ":ControlFlowInterfaces", ":IR", ":TransformDialect", ":TransformDialectInterfaces", |
