[𝘀𝗽𝗿] changes introduced through rebaseusers/vitalybuka/spr/main.nfcubsan_minimal-clang-format-a-file

Created using spr 1.3.4 [skip ci]
author: Vitaly Buka <vitalybuka@google.com> 2025-05-08 00:08:20 -0700
committer: Vitaly Buka <vitalybuka@google.com> 2025-05-08 00:08:20 -0700
commit: 9ffd039f04cfa9bc5d3371b9c8cf86376a74c21e (patch)
tree: b16242994b810591bb9121e7a3fc272f6fb7a9c6
parent: d3d32f47328770ca72997086d44a59397d1ea015 (diff)
parent: c1f0e68cec4218c9d51a4ad0a6f6d878ed573dfe (diff)
download: llvm-users/vitalybuka/spr/main.nfcubsan_minimal-clang-format-a-file.zip
llvm-users/vitalybuka/spr/main.nfcubsan_minimal-clang-format-a-file.tar.gz
llvm-users/vitalybuka/spr/main.nfcubsan_minimal-clang-format-a-file.tar.bz2
630 files changed, 16832 insertions, 5814 deletions
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 79a9186..c4ee75e 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -197,10 +197,6 @@ private:
 
   BoltAddressTranslation *BAT{nullptr};
 
-  /// Whether pre-aggregated profile needs to convert branch profile into call
-  /// to continuation fallthrough profile.
-  bool NeedsConvertRetProfileToCallCont{false};
-
   /// Update function execution profile with a recorded trace.
   /// A trace is region of code executed between two LBR entries supplied in
   /// execution order.
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index a8a1879..80f4ea0 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -720,23 +720,6 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
                : isReturn(Func.disassembleInstructionAtOffset(Offset));
   };
 
-  // Returns whether \p Offset in \p Func may be a call continuation excluding
-  // entry points and landing pads.
-  auto checkCallCont = [&](const BinaryFunction &Func, const uint64_t Offset) {
-    // No call continuation at a function start.
-    if (!Offset)
-      return false;
-
-    // FIXME: support BAT case where the function might be in empty state
-    // (split fragments declared non-simple).
-    if (!Func.hasCFG())
-      return false;
-
-    // The offset should not be an entry point or a landing pad.
-    const BinaryBasicBlock *ContBB = Func.getBasicBlockAtOffset(Offset);
-    return ContBB && !ContBB->isEntryPoint() && !ContBB->isLandingPad();
-  };
-
   // Mutates \p Addr to an offset into the containing function, performing BAT
   // offset translation and parent lookup.
   //
@@ -749,8 +732,7 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
 
     Addr -= Func->getAddress();
 
-    bool IsRetOrCallCont =
-        IsFrom ? checkReturn(*Func, Addr) : checkCallCont(*Func, Addr);
+    bool IsRet = IsFrom && checkReturn(*Func, Addr);
 
     if (BAT)
       Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
@@ -761,24 +743,16 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
       NumColdSamples += Count;
 
     if (!ParentFunc)
-      return std::pair{Func, IsRetOrCallCont};
+      return std::pair{Func, IsRet};
 
-    return std::pair{ParentFunc, IsRetOrCallCont};
+    return std::pair{ParentFunc, IsRet};
   };
 
-  uint64_t ToOrig = To;
   auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true);
-  auto [ToFunc, IsCallCont] = handleAddress(To, /*IsFrom*/ false);
+  auto [ToFunc, _] = handleAddress(To, /*IsFrom*/ false);
   if (!FromFunc && !ToFunc)
     return false;
 
-  // Record call to continuation trace.
-  if (NeedsConvertRetProfileToCallCont && FromFunc != ToFunc &&
-      (IsReturn || IsCallCont)) {
-    LBREntry First{ToOrig - 1, ToOrig - 1, false};
-    LBREntry Second{ToOrig, ToOrig, false};
-    return doTrace(First, Second, Count);
-  }
   // Ignore returns.
   if (IsReturn)
     return true;
@@ -1235,21 +1209,14 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
   ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator);
   if (std::error_code EC = TypeOrErr.getError())
     return EC;
-  // Pre-aggregated profile with branches and fallthroughs needs to convert
-  // return profile into call to continuation fall-through.
-  auto Type = AggregatedLBREntry::BRANCH;
-  if (TypeOrErr.get() == "B") {
-    NeedsConvertRetProfileToCallCont = true;
+  auto Type = AggregatedLBREntry::TRACE;
+  if (LLVM_LIKELY(TypeOrErr.get() == "T")) {
+  } else if (TypeOrErr.get() == "B") {
     Type = AggregatedLBREntry::BRANCH;
   } else if (TypeOrErr.get() == "F") {
-    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::FT;
   } else if (TypeOrErr.get() == "f") {
-    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
-  } else if (TypeOrErr.get() == "T") {
-    // Trace is expanded into B and [Ff]
-    Type = AggregatedLBREntry::TRACE;
   } else {
     reportError("expected T, B, F or f");
     return make_error_code(llvm::errc::io_error);
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index ee72d8f..44e3bf2 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -4,31 +4,12 @@
 # RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
 ## Link against a DSO to ensure PLT entries.
 # RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
-# RUN: link_fdata %s %t %t.pa1 PREAGG1
-# RUN: link_fdata %s %t %t.pa2 PREAGG2
-# RUN: link_fdata %s %t %t.pa3 PREAGG3
 # RUN: link_fdata %s %t %t.pat PREAGGT1
 # RUN: link_fdata %s %t %t.pat2 PREAGGT2
 # RUN: link_fdata %s %t %t.patplt PREAGGPLT
 
-## Check normal case: fallthrough is not LP or secondary entry.
 # RUN: llvm-strip --strip-unneeded %t -o %t.strip
 # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
-# RUN: llvm-bolt %t.strip --pa -p %t.pa1 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s
-
-## Check that getFallthroughsInTrace correctly handles a trace starting at plt
-## call continuation
-# RUN: llvm-bolt %t.strip --pa -p %t.pa2 -o %t.out2 \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2
-
-## Check that we don't treat secondary entry points as call continuation sites.
-# RUN: llvm-bolt %t --pa -p %t.pa3 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
-
-## Check fallthrough to a landing pad case.
-# RUN: llvm-bolt %t.strip --pa -p %t.pa3 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
 
 ## Check pre-aggregated traces attach call continuation fallthrough count
 # RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
@@ -77,7 +58,6 @@ Ltmp0_br:
 ## Check PLT traces are accepted
 # PREAGGPLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
 ## Target is an external-origin call continuation
-# PREAGG1: B X:0 #Ltmp1# 2 0
 # PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
 # CHECK:      callq puts@PLT
 # CHECK-NEXT: count: 2
@@ -97,18 +77,15 @@ Ltmp4_br:
 	movl	$0xa, -0x18(%rbp)
 	callq	foo
 ## Target is a binary-local call continuation
-# PREAGG1: B #Lfoo_ret# #Ltmp3# 1 0
 # PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
 # CHECK:      callq foo
 # CHECK-NEXT: count: 1
 
 ## PLT call continuation fallthrough spanning the call
-# PREAGG2: F #Ltmp1# #Ltmp3_br# 3
 # CHECK2:      callq foo
 # CHECK2-NEXT: count: 3
 
 ## Target is a secondary entry point (unstripped) or a landing pad (stripped)
-# PREAGG3: B X:0 #Ltmp3# 2 0
 # PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
 # CHECK3:      callq foo
 # CHECK3-NEXT: count: 0
diff --git a/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp b/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp
index 850df7d..3e367ab 100644
--- a/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp
+++ b/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp
@@ -113,7 +113,7 @@ static SourceLocation getStartOfNextLine(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
   std::unique_ptr<Lexer> Lex = getLexerStartingFromLoc(Loc, SM, LangOpts);
-  if (!Lex.get())
+  if (!Lex)
     return SourceLocation();
   llvm::SmallVector<char, 16> Line;
   // FIXME: this is a bit hacky to get ReadToEndOfLine work.
@@ -647,9 +647,8 @@ static SourceLocation getLocAfterNamespaceLBrace(const NamespaceDecl *NsDecl,
                                                  const LangOptions &LangOpts) {
   std::unique_ptr<Lexer> Lex =
       getLexerStartingFromLoc(NsDecl->getBeginLoc(), SM, LangOpts);
-  assert(Lex.get() &&
-         "Failed to create lexer from the beginning of namespace.");
-  if (!Lex.get())
+  assert(Lex && "Failed to create lexer from the beginning of namespace.");
+  if (!Lex)
     return SourceLocation();
   Token Tok;
   while (!Lex->LexFromRawLexer(Tok) && Tok.isNot(tok::TokenKind::l_brace)) {
diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h
index b0b34fc..9e4484c 100644
--- a/clang-tools-extra/clang-doc/Representation.h
+++ b/clang-tools-extra/clang-doc/Representation.h
@@ -60,6 +60,7 @@ struct CommentInfo {
   // the vector.
   bool operator<(const CommentInfo &Other) const;
 
+  // TODO: The Kind field should be an enum, so we can switch on it easily.
   SmallString<16>
       Kind; // Kind of comment (FullComment, ParagraphComment, TextComment,
             // InlineCommandComment, HTMLStartTagComment, HTMLEndTagComment,
@@ -415,7 +416,13 @@ struct TypedefInfo : public SymbolInfo {
 
   TypeInfo Underlying;
 
-  // Inidicates if this is a new C++ "using"-style typedef:
+  // Underlying type declaration
+  SmallString<16> TypeDeclaration;
+
+  /// Comment description for the typedef.
+  std::vector<CommentInfo> Description;
+
+  // Indicates if this is a new C++ "using"-style typedef:
   //   using MyVector = std::vector<int>
   // False means it's a C-style typedef:
   //   typedef std::vector<int> MyVector;
@@ -458,7 +465,8 @@ struct EnumValueInfo {
   // constant. This will be empty for implicit enumeration values.
   SmallString<16> ValueExpr;
 
-  std::vector<CommentInfo> Description; /// Comment description of this field.
+  /// Comment description of this field.
+  std::vector<CommentInfo> Description;
 };
 
 // TODO: Expand to allow for documenting templating.
@@ -527,6 +535,7 @@ struct ClangDocContext {
   std::vector<std::string> UserStylesheets;
   // JavaScript files that will be imported in all HTML files.
   std::vector<std::string> JsScripts;
+  // Base directory for remote repositories.
   StringRef Base;
   Index Idx;
 };
diff --git a/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css b/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css
new file mode 100644
index 0000000..a885a36
--- /dev/null
+++ b/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css
@@ -0,0 +1,471 @@
+/* css for clang-doc mustache backend */
+@import "https://fonts.googleapis.com/css2?family=Inter:ital,opsz,wght@0,14..32,100..900;1,14..32,100..900&display=swap";
+
+*,*::before *::after {
+    box-sizing:border-box
+}
+* {
+    margin:0;
+    padding:0
+}
+ol,
+ul {
+    list-style:none
+}
+img,
+picture,
+svg,
+video {
+    display:block;
+    max-width:100%
+}
+
+* {
+    --brand-light:#ce6300;
+    --text1-light:#000000;
+    --text2-light:#333333;
+    --surface1-light:#ffffff;
+    --surface2-light:#f5f5f5;
+    --brand-dark:#de9853;
+    --text1-dark:#ffffff;
+    --text2-dark:#cccccc;
+    --surface1-dark:#161212;
+    --surface2-dark:#272424
+}
+
+:root {
+    color-scheme:light;
+    --brand:var(--brand-light);
+    --text1:var(--text1-light);
+    --text2:var(--text2-light);
+    --text1-inverse:var(--text1-dark);
+    --text2-inverse:var(--text2-dark);
+    --surface1:var(--surface1-light);
+    --surface2:var(--surface2-light)
+}
+
+@media(prefers-color-scheme:dark) {
+    :root {
+        color-scheme:dark;
+        --brand:var(--brand-dark);
+        --text1:var(--text1-dark);
+        --text2:var(--text2-dark);
+        --text1-inverse:var(--text1-light);
+        --text2-inverse:var(--text2-light);
+        --surface1:var(--surface1-dark);
+        --surface2:var(--surface2-dark)
+    }
+}
+
+[color-scheme=light] {
+    color-scheme:light;
+    --brand:var(--brand-light);
+    --text1:var(--text1-light);
+    --text2:var(--text2-light);
+    --text1-inverse:var(--text1-dark);
+    --text2-inverse:var(--text2-dark);
+    --surface1:var(--surface1-light);
+    --surface2:var(--surface2-light)
+}
+
+[color-scheme=dark] {
+    color-scheme:dark;
+    --brand:var(--brand-dark);
+    --text1:var(--text1-dark);
+    --text2:var(--text2-dark);
+    --text1-inverse:var(--text1-light);
+    --text2-inverse:var(--text2-light);
+    --surface1:var(--surface1-dark);
+    --surface2:var(--surface2-dark)
+}
+
+html {
+    background-color:var(--surface1)
+}
+
+html, body {
+    min-height: 100vh;
+    margin: 0;
+    padding: 0;
+    width: 100%;
+}
+
+.container {
+    display: flex;
+    margin-top: 60px;
+    height: calc(100% - 60px);
+    box-sizing: border-box;
+}
+
+body, html {
+    font-family:Inter,sans-serif;
+    margin: 0;
+    padding: 0;
+    height: 100%;
+}
+
+/* Navbar Styles */
+.navbar {
+    background-color: var(--surface2);
+    border-bottom: 1px solid var(--text2);
+    position: fixed;
+    width: 100%;
+    top: 0;
+    left: 0;
+    height: 60px; /* Adjust as needed */
+    color: white;
+    display: flex;
+    align-items: center;
+    padding: 0 20px;
+    box-sizing: border-box;
+    z-index: 1000;
+}
+
+
+.navbar__container {
+    display:flex;
+    justify-content:space-between;
+    align-items:center;
+    padding:1rem;
+    color:var(--text1);
+    max-width:2048px;
+    margin:auto
+}
+.navbar__logo {
+    display:flex;
+    align-items:center;
+    height:40px
+}
+.navbar__logo a {
+    display:flex;
+    align-items:center;
+    text-decoration:none;
+    height:100%
+}
+.navbar__logo img {
+    height:100%;
+    width:auto
+}
+.navbar__toggle {
+    background:0 0;
+    color:var(--text2);
+    border:none;
+    cursor:pointer;
+    font-size:1.5rem;
+    width:2.5rem;
+    height:2.5rem;
+    margin-left:auto
+}
+.navbar__toggle:hover {
+    color:var(--text1)
+}
+@media(min-width:769px) {
+    .navbar__toggle {
+        display:none
+    }
+}
+.navbar__menu {
+    display:flex;
+    justify-content:space-between;
+    align-items:center;
+    list-style:none;
+    margin:0;
+    padding:0;
+    gap:.25rem;
+    margin-left:auto
+}
+
+@media(max-width:768px) {
+    .navbar__menu {
+        flex-direction:column;
+        justify-content:flex-start;
+        width:100%;
+        background-color:var(--surface2);
+        position:fixed;
+        top:0;
+        left:0;
+        right:0;
+        bottom:0;
+        padding:1.5rem;
+        transform:translateX(100%);
+        transition:transform .5s ease-in-out
+    }
+}
+@media(max-width:768px) {
+    .navbar__menu.active {
+        transform:translateX(0)
+    }
+}
+.navbar__close {
+    background:0 0;
+    border:none;
+    cursor:pointer;
+    font-size:1.5rem;
+    color:var(--text2);
+    margin-left:auto
+}
+.navbar__close:hover {
+    color:var(--text1)
+}
+
+@media(min-width:769px) {
+    .navbar__close {
+        display:none
+    }
+}
+.navbar__links {
+    display:flex;
+    gap:1rem;
+    align-items:center;
+    margin:0;
+    padding:0
+}
+
+@media(max-width:768px) {
+    .navbar__links {
+        flex-direction:column
+    }
+}
+
+.navbar__item {
+    list-style-type:none
+}
+
+.navbar__link {
+    color:var(--text2);
+    text-decoration:none;
+    padding:.5rem
+}
+
+.navbar__link:hover {
+    color:var(--text1)
+}
+
+.navbar__theme-toggle-button {
+    background:0 0;
+    color:var(--text2);
+    border:none;
+    cursor:pointer;
+    font-size:1.5rem;
+    width:2.5rem;
+    height:2.5rem
+}
+
+.navbar__theme-toggle-button:hover {
+    color:var(--text1)
+}
+
+.hero__container {
+    margin-top:1rem;
+    display:flex;
+    justify-content:center;
+    align-items:center;
+    gap:2rem
+}
+
+.hero__title {
+    font-size:2.5rem;
+    margin-bottom:.5rem
+}
+
+.hero__title-large {
+    font-size:3rem
+}
+
+@media(max-width:768px) {
+    .hero__title-large {
+        font-size:2.5rem
+    }
+}
+
+@media(max-width:480px) {
+    .hero__title-large {
+        font-size:2rem
+    }
+}
+
+@media(max-width:768px) {
+    .hero__title {
+        font-size:2rem
+    }
+}
+
+@media(max-width:480px) {
+    .hero__title {
+        font-size:1.75rem
+    }
+}
+
+.hero__subtitle {
+    font-size:1.25rem;
+    font-weight:500
+}
+
+@media(max-width:768px) {
+    .hero__subtitle {
+        font-size:1rem
+    }
+}
+
+@media(max-width:480px) {
+    .hero__subtitle {
+        font-size:.875rem
+    }
+}
+
+.section-container {
+    max-width: 2048px;
+    margin-left:auto;
+    margin-right:auto;
+    margin-top:0;
+    margin-bottom: 1rem;
+    padding:1rem 2rem
+}
+
+@media(max-width:768px) {
+    .section-container {
+        padding:1rem
+    }
+}
+
+.section-container h2 {
+    font-size:1.5rem;
+    margin-bottom:1rem;
+    color:var(--brand);
+    border-bottom: 1px solid var(--text2);
+}
+
+@media(max-width:768px) {
+    .section-container h2 {
+        font-size:1.25rem
+    }
+}
+
+.section-container p {
+    font-size:1rem;
+    line-height:1.5
+}
+
+@media(max-width:768px) {
+    .section-container p {
+        font-size:.875rem
+    }
+}
+
+.home__row {
+    display:grid;
+    grid-template-columns:repeat(auto-fit,minmax(300px,1fr));
+    gap:2rem
+}
+
+.table-wrapper {
+    display:flex;
+    flex-direction:column;
+    padding:1rem;
+    border-collapse: collapse; /* Ensures there are no gaps between cells */
+}
+
+.table-wrapper th, .table-wrapper td {
+    padding: 0.5rem 1rem; /* Adds padding inside the cells */
+    border:1px solid var(--text1);
+    text-align: left;
+}
+
+.block-command-command {
+    font-weight: bold;
+}
+
+.code-clang-doc {
+    font-size: 1.1rem;
+}
+
+.delimiter-container {
+    padding: 0.5rem 1rem;
+    margin-bottom:1rem; 
+}
+
+.resizer {
+    width: 5px;
+    cursor: col-resize;
+    background-color: var(--text2);
+}
+
+.resizer:hover {
+    background-color: var(--text2-inverse);
+}
+
+.sidebar {
+    width: 250px;
+    top: 0;
+    left: 0;
+    height: 100%;
+    position: fixed;
+    background-color: var(--surface1);
+    display: flex;
+    border-left: 1px solid var(--text2);
+    flex-direction: column;
+    overflow-y: auto;
+    scrollbar-width: thin;
+}
+
+.sidebar h2 {
+    margin-top: 0;
+    margin-bottom: 20px;
+    padding: 10px;
+}
+
+.sidebar ul {
+    width: 100%;
+    padding: 0;
+    list-style-type: none;
+}
+
+.sidebar ul li {
+    padding-right: 1rem;
+    padding-left: 2rem;
+    padding-top: 0.25rem;
+    padding-bottom: 0.25rem;
+}
+
+.sidebar-section {
+    font-size:1.5rem;
+    font-weight: bold;
+    margin-bottom: 1rem;
+    padding: 3rem;
+}
+.sidebar-section a {
+    color: var(--brand)
+}
+
+/* Content */
+.content {
+    background-color: var(--text1-inverse);
+    padding: 20px;
+    left: 250px;
+    position: relative;
+    width: calc(100% - 250px);
+    height: 100vh;
+}
+
+.sidebar-item {
+    color: var(--text1);
+}
+
+.sidebar-item-container:hover {
+    width: 100%;
+    background-color: grey;
+}
+
+.sidebar-item-container:hover a {
+    width: 100%;
+    color: var(--text1-inverse);
+}
+
+.class-container {
+    padding: 0.5rem 1rem;
+}
+
+a, a:visited, a:hover, a:active {
+    text-decoration: none;
+    color: inherit;
+}
diff --git a/clang-tools-extra/clang-doc/assets/class-template.mustache b/clang-tools-extra/clang-doc/assets/class-template.mustache
new file mode 100644
index 0000000..f9e78f5
--- /dev/null
+++ b/clang-tools-extra/clang-doc/assets/class-template.mustache
@@ -0,0 +1,227 @@
+{{! 
+    Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+    See https://llvm.org/LICENSE.txt for license information.
+    SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+    
+    This file defines the template for classes/struct
+}}
+<!DOCTYPE html>
+<html lang="en-US">
+<head>
+    <meta charset="utf-8"/>
+    <title>{{Name}}</title>
+    {{#Stylesheets}}
+        <link rel="stylesheet" type="text/css" href="{{.}}"/>
+    {{/Stylesheets}}
+    {{#Scripts}}
+        <script src="{{.}}"></script>
+    {{/Scripts}}
+    {{! Highlight.js dependency for syntax highlighting }}
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css">
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/cpp.min.js"></script>
+</head>
+<body>
+<nav class="navbar">
+    <div class="navbar__container">
+        {{#ProjectName}}
+            <div class="navbar__logo">
+                {{ProjectName}}
+            </div>
+        {{/ProjectName}}
+        <div class="navbar__menu">
+            <ul class="navbar__links">
+                <li class="navbar__item">
+                    <a href="/" class="navbar__link">Namespace</a>
+                </li>
+                <li class="navbar__item">
+                    <a href="/" class="navbar__link">Class</a>
+                </li>
+            </ul>
+        </div>
+    </div>
+</nav>
+<main>
+    <div class="container">
+        <div class="sidebar">
+            <h2>{{RecordType}} {{Name}}</h2>
+            <ul>
+                {{#PublicMembers}}
+                <li class="sidebar-section">
+                    <a class="sidebar-item" href="#PublicMethods">Public Members</a>
+                </li>
+                <ul>
+                    {{#Obj}}
+                    <li class="sidebar-item-container">
+                        <a class="sidebar-item" href="#{{Name}}">{{Name}}</a>
+                    </li>
+                    {{/Obj}}
+                </ul>
+                {{/PublicMembers}}
+                {{#ProtectedMembers}}
+                    <li class="sidebar-section">
+                        <a class="sidebar-item" href="#PublicMethods">Protected Members</a>
+                    </li>
+                    <ul>
+                        {{#Obj}}
+                            <li class="sidebar-item-container">
+                                <a class="sidebar-item" href="#{{Name}}">{{Name}}</a>
+                            </li>
+                        {{/Obj}}
+                    </ul>
+                {{/ProtectedMembers}}
+                {{#PublicFunction}}
+                <li class="sidebar-section">
+                    <a class="sidebar-item" href="#PublicMethods">Public Method</a>
+                </li>
+                <ul>
+                    {{#Obj}}
+                    <li class="sidebar-item-container">
+                        <a class="sidebar-item" href="#{{ID}}">{{Name}}</a>
+                    </li>
+                    {{/Obj}}
+                </ul>
+                {{/PublicFunction}}
+                {{#ProtectedFunction}}
+                <li class="sidebar-section">
+                    <a class="sidebar-item" href="#ProtectedFunction">Protected Method</a>
+                </li>
+                <ul>
+                    {{#Obj}}
+                    <li class="sidebar-item-container">
+                        <a class="sidebar-item" href="#{{ID}}">{{Name}}</a>
+                    </li>
+                    {{/Obj}}
+                </ul>
+                {{/ProtectedFunction}}
+                {{#Enums}}
+                <li class="sidebar-section">
+                    <a class="sidebar-item" href="#Enums">Enums</a>
+                </li>
+                <ul>
+                    {{#Obj}}
+                    <li class="sidebar-item-container">
+                        <a class="sidebar-item" href="#{{ID}}">{{EnumName}}</a>
+                    </li>
+                    {{/Obj}}
+                </ul>
+                {{/Enums}}
+                {{#Typedef}}
+                <li class="sidebar-section">Typedef</li>
+                {{/Typedef}}
+                {{#Record}}
+                <li class="sidebar-section">
+                    <a class="sidebar-item" href="#Classes">Inner Classes</a>
+                </li>
+                <ul>
+                    {{#Links}}
+                    <li class="sidebar-item-container">
+                        <a class="sidebar-item" href="#{{ID}}">{{Name}}</a>
+                    </li>
+                    {{/Links}}
+                </ul>
+                {{/Record}}
+            </ul>
+        </div>
+        <div class="resizer" id="resizer"></div>
+        <div class="content">
+            <section class="hero section-container">
+                <div class="hero__title">
+                    <h1 class="hero__title-large">{{RecordType}} {{Name}}</h1>
+                    {{#RecordComments}}
+                    <div class="hero__subtitle">
+                        {{>Comments}}
+                    </div>
+                    {{/RecordComments}}
+                </div>
+            </section>
+            {{#PublicMembers}}
+            <section id="PublicMembers" class="section-container">
+                <h2>Public Members</h2>
+                <div>
+                    {{#Obj}}
+                    <div id="{{Name}}" class="delimiter-container">
+                        <pre>
+<code class="language-cpp code-clang-doc" >{{Type}} {{Name}}</code>
+                        </pre>
+                        {{#MemberComments}}
+                        <div>
+                            {{>Comments}}
+                        </div>
+                        {{/MemberComments}}
+                    </div>
+                    {{/Obj}}
+                </div>
+            </section>    
+            {{/PublicMembers}}
+            {{#ProtectedMembers}}
+            <section id="ProtectedMembers" class="section-container">
+                <h2>Protected Members</h2>
+                <div>
+                    {{#Obj}}
+                    <div id="{{Name}}" class="delimiter-container">
+                        <pre>
+<code class="language-cpp code-clang-doc" >{{Type}} {{Name}}</code>
+                        </pre>
+                        {{#MemberComments}}
+                        <div>
+                            {{>Comments}}
+                        </div>
+                        {{/MemberComments}}
+                    </div>
+                    {{/Obj}}
+                </div>
+            </section>
+            {{/ProtectedMembers}}
+            {{#PublicFunction}}
+            <section id="PublicMethods" class="section-container">
+                <h2>Public Methods</h2>
+                <div>
+                    {{#Obj}}
+{{>FunctionPartial}}
+                    {{/Obj}}
+                </div>
+            </section>
+            {{/PublicFunction}}
+            {{#ProtectedFunction}}
+            <section id="ProtectedFunction" class="section-container">
+                <h2>Protected Methods</h2>
+                <div>
+                    {{#Obj}}
+{{>FunctionPartial}}
+                    {{/Obj}}
+                </div>
+            </section>
+            {{/ProtectedFunction}}
+            {{#Enums}}
+            <section id="Enums" class="section-container">
+                <h2>Enumerations</h2>
+                <div>
+                    {{#Obj}}
+{{>EnumPartial}}
+                    {{/Obj}}
+                </div>
+            </section>
+            {{/Enums}}
+            {{#Record}}
+            <section id="Classes" class="section-container">
+                <h2>Inner Classes</h2>
+                <ul class="class-container">
+                    {{#Links}}
+                    <li id="{{ID}}" style="max-height: 40px;">
+<a href="{{Link}}"><pre><code class="language-cpp code-clang-doc" >class {{Name}}</code></pre></a>
+                    </li>
+                    {{/Links}}
+                </ul>
+            </section>
+            {{/Record}}
+            {{#Typedef}}
+            <section class="section-container">
+                <h2 id="Enums">Enums</h2>
+            </section>
+            {{/Typedef}}
+        </div>
+    </div>
+</main>
+</body>
+</html>
diff --git a/clang-tools-extra/clang-doc/assets/comments-template.mustache b/clang-tools-extra/clang-doc/assets/comments-template.mustache
new file mode 100644
index 0000000..723ace7
--- /dev/null
+++ b/clang-tools-extra/clang-doc/assets/comments-template.mustache
@@ -0,0 +1,34 @@
+{{!
+    Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+    See https://llvm.org/LICENSE.txt for license information.
+    SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+    
+    This file defines templates for generating comments
+}}
+{{#FullComment}}
+    {{#Children}}
+    {{>Comments}}
+    {{/Children}}
+{{/FullComment}}
+{{#ParagraphComment}}
+    {{#Children}}
+    {{>Comments}}
+    {{/Children}}
+{{/ParagraphComment}}
+{{#BlockCommandComment}}
+    <div class="block-command-comment__command">
+        <div class="block-command-command">
+            {{Command}}
+        </div>
+        <div>
+            {{#Children}}
+                {{>Comments}}
+            {{/Children}}
+        </div>
+    </div>
+{{/BlockCommandComment}}
+{{#TextComment}}
+    <div>
+        <p>{{TextComment}}</p>
+    </div>
+{{/TextComment}}
diff --git a/clang-tools-extra/clang-doc/assets/enum-template.mustache b/clang-tools-extra/clang-doc/assets/enum-template.mustache
new file mode 100644
index 0000000..c459884
--- /dev/null
+++ b/clang-tools-extra/clang-doc/assets/enum-template.mustache
@@ -0,0 +1,47 @@
+{{! 
+    Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+    See https://llvm.org/LICENSE.txt for license information.
+    SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+    
+    This file defines the template for enums
+}}
+<div id="{{ID}}" class="delimiter-container">
+    <div>
+        <pre>
+            <code class="language-cpp code-clang-doc">
+{{EnumName}}
+            </code>
+        </pre>
+    </div>
+    {{! Enum Values }}
+    <table class="table-wrapper">
+        <tbody>
+        <tr>
+            <th>Name</th>
+            <th>Value</th>
+            {{#HasComment}}
+                <th>Comment</th>
+            {{/HasComment}}
+        </tr>
+        {{#EnumValues}}
+            <tr>
+                <td>{{Name}}</td>
+                <td>{{Value}}</td>
+                {{#EnumValueComments}}
+                    <td>{{>Comments}}</td>
+                {{/EnumValueComments}}
+            </tr>
+        {{/EnumValues}}
+        </tbody>
+    </table>
+    {{#EnumComments}}
+    <div>
+        {{>Comments}}
+    </div>
+    {{/EnumComments}}
+    {{#Location}}
+    <div>
+        Defined at line {{LineNumber}} of file {{Filename}}
+    </div>
+    {{/Location}}
+</div>
diff --git a/clang-tools-extra/clang-doc/assets/function-template.mustache b/clang-tools-extra/clang-doc/assets/function-template.mustache
new file mode 100644
index 0000000..86e934a
--- /dev/null
+++ b/clang-tools-extra/clang-doc/assets/function-template.mustache
@@ -0,0 +1,23 @@
+{{! 
+    Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+    See https://llvm.org/LICENSE.txt for license information.
+    SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+    This file defines the template for functions/methods
+}}
+<div class="delimiter-container">
+    <div id="{{ID}}">
+        {{! Function Prototype }}
+        <pre>
+            <code class="language-cpp code-clang-doc">
+{{ReturnType.Name}} {{Name}} ({{#Params}}{{^End}}{{Type}} {{Name}}, {{/End}}{{#End}}{{Type}} {{Name}}{{/End}}{{/Params}})
+            </code>
+        </pre>
+        {{! Function Comments }}
+        {{#FunctionComments}}
+        <div>
+            {{>Comments}}
+        </div>
+        {{/FunctionComments}}
+    </div>
+</div>
diff --git a/clang-tools-extra/clang-doc/assets/mustache-index.js b/clang-tools-extra/clang-doc/assets/mustache-index.js
new file mode 100644
index 0000000..0f05eb7
--- /dev/null
+++ b/clang-tools-extra/clang-doc/assets/mustache-index.js
@@ -0,0 +1,30 @@
+document.addEventListener("DOMContentLoaded", function() {
+  const resizer = document.getElementById('resizer');
+  const sidebar = document.querySelector('.sidebar');
+
+  let isResizing = false;
+  resizer.addEventListener('mousedown', (e) => { isResizing = true; });
+
+  document.addEventListener('mousemove', (e) => {
+    if (!isResizing)
+      return;
+    const newWidth = e.clientX;
+    if (newWidth > 100 && newWidth < window.innerWidth - 100) {
+      sidebar.style.width = `${newWidth}px`;
+    }
+  });
+
+  document.addEventListener('mouseup', () => { isResizing = false; });
+
+  document.querySelectorAll('pre code').forEach((el) => {
+    hljs.highlightElement(el);
+    el.classList.remove("hljs");
+  });
+
+  document.querySelectorAll('.sidebar-item-container').forEach(item => {
+    item.addEventListener('click', function() {
+      const anchor = item.getElementsByTagName("a");
+      window.location.hash = anchor[0].getAttribute('href');
+    });
+  });
+})
diff --git a/clang-tools-extra/clang-doc/assets/namespace-template.mustache b/clang-tools-extra/clang-doc/assets/namespace-template.mustache
new file mode 100644
index 0000000..12dc930
--- /dev/null
+++ b/clang-tools-extra/clang-doc/assets/namespace-template.mustache
@@ -0,0 +1,47 @@
+{{! 
+    Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+    See https://llvm.org/LICENSE.txt for license information.
+    SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+    
+    This file defines the template for generating namespaces
+}}
+<!DOCTYPE html>
+<html lang="en-US">
+    <head>
+        <meta charset="utf-8"/>
+        <title>{{NamespaceTitle}}</title>
+        {{#Stylesheets}}
+        <link rel="stylesheet" type="text/css" href="{{.}}"/>
+        {{/Stylesheets}}
+        {{#Scripts}}
+        <script src="{{.}}"></script>
+        {{/Scripts}}
+        {{! Highlight.js dependency for syntax highlighting }}
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css">
+        <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
+        <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/cpp.min.js"></script>
+    </head>
+    <body>
+        <nav class="navbar">
+            Navbar
+        </nav>
+        <main>
+            <div class="container">
+                <div class="sidebar">
+                        Lorem ipsum dolor sit amet, consectetur adipiscing elit, 
+                        sed do eiusmod tempor incididunt ut labore et dolore magna 
+                        aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco 
+                        laboris nisi ut aliquip ex ea commodo consequat. 
+                        Duis aute irure dolor in reprehenderit in voluptate velit esse 
+                        cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat 
+                        cupidatat non proident, sunt in culpa qui officia deserunt mollit 
+                        anim id est laborum
+                </div>
+                <div class="resizer" id="resizer"></div>
+                <div class="content">
+                    Content
+                </div>
+            </div>
+        </main>
+    </body>
+</html>
diff --git a/clang-tools-extra/clang-doc/assets/template.mustache b/clang-tools-extra/clang-doc/assets/template.mustache
new file mode 100644
index 0000000..18059e6
--- /dev/null
+++ b/clang-tools-extra/clang-doc/assets/template.mustache
@@ -0,0 +1,52 @@
+{{! 
+    Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+    See https://llvm.org/LICENSE.txt for license information.
+    SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+    
+    This file defines the template for generating Namespaces
+}}
+<!DOCTYPE html>
+<html lang="en-US">
+    <head>
+        <meta charset="utf-8"/>
+        <title>{{NamespaceTitle}}</title>
+    </head>
+    <h1>{{NamespaceTitle}}</h1>
+    {{#NamespaceComments}}
+    <p>Namespace Comment</p>
+    {{/NamespaceComments}}
+    {{#Namespace}}
+    <h2 id="Namespace">Namespace</h2>
+    <ul>
+        {{#Links}}
+        <li>
+            <a href="{{Link}}">{{Name}}</a>
+        </li>
+        {{/Links}}
+    </ul>
+    {{/Namespace}}
+    {{#Record}}
+    <h2 id="Class">Class</h2>
+    <ul>
+        {{#Links}}
+        <li>
+            <a href="{{Link}}">{{Name}}</a>
+        </li>
+        {{/Links}}
+    </ul>
+    {{/Record}}
+    {{#Function}}
+    <h2 id="Function">Function</h2>
+    <div>
+        {{#Obj}}
+        {{/Obj}}
+    </div>
+    {{/Function}}
+    {{#Enums}}
+    <h2 id="Enums">Enums</h2>
+    <div>
+        {{#Obj}}
+        {{/Obj}}
+    </div>
+    {{/Enums}}
+</html>
diff --git a/clang-tools-extra/clang-doc/tool/CMakeLists.txt b/clang-tools-extra/clang-doc/tool/CMakeLists.txt
index 601a046..e359beb 100644
--- a/clang-tools-extra/clang-doc/tool/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/tool/CMakeLists.txt
@@ -21,7 +21,15 @@ target_link_libraries(clang-doc
 
 set(assets
   index.js
+  mustache-index.js
   clang-doc-default-stylesheet.css
+  clang-doc-mustache.css
+  class-template.mustache
+  comments-template.mustache
+  enum-template.mustache
+  function-template.mustache
+  namespace-template.mustache
+  template.mustache
 )
 
 set(asset_dir "${CMAKE_CURRENT_SOURCE_DIR}/../assets")
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp
index bbb3522..cf29960 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp
@@ -92,19 +92,15 @@ void MissingStdForwardCheck::registerMatchers(MatchFinder *Finder) {
                                   declRefExpr(to(equalsBoundNode("param"))))));
   auto RefToParm = capturesVar(
       varDecl(anyOf(hasSameNameAsBoundNode("param"), RefToParmImplicit)));
-  auto HasRefToParm = hasAnyCapture(RefToParm);
 
   auto CaptureInRef =
       allOf(hasCaptureDefaultKind(LambdaCaptureDefault::LCD_ByRef),
             unless(hasAnyCapture(
                 capturesVar(varDecl(hasSameNameAsBoundNode("param"))))));
-  auto CaptureInCopy = allOf(
-      hasCaptureDefaultKind(LambdaCaptureDefault::LCD_ByCopy), HasRefToParm);
   auto CaptureByRefExplicit = hasAnyCapture(
       allOf(hasCaptureKind(LambdaCaptureKind::LCK_ByRef), RefToParm));
 
-  auto CapturedInBody =
-      lambdaExpr(anyOf(CaptureInRef, CaptureInCopy, CaptureByRefExplicit));
+  auto CapturedInBody = lambdaExpr(anyOf(CaptureInRef, CaptureByRefExplicit));
   auto CapturedInCaptureList = hasAnyCapture(capturesVar(
       varDecl(hasInitializer(ignoringParenImpCasts(equalsBoundNode("call"))))));
 
diff --git a/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp
index 27f4c81..03e6576 100644
--- a/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp
+++ b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp
@@ -545,8 +545,8 @@ void func() {
   EXPECT_TRUE(Preamble);
   EXPECT_TRUE(Preamble->RequiredModules);
 
-  auto Result = signatureHelp(getFullPath("Use.cpp"), Test.point(),
-                              *Preamble.get(), Use, MarkupKind::PlainText);
+  auto Result = signatureHelp(getFullPath("Use.cpp"), Test.point(), *Preamble,
+                              Use, MarkupKind::PlainText);
   EXPECT_FALSE(Result.signatures.empty());
   EXPECT_EQ(Result.signatures[0].label, "printA(int a) -> void");
   EXPECT_EQ(Result.signatures[0].parameters[0].labelString, "int a");
diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp
index ef57b0f..576e863 100644
--- a/clang-tools-extra/modularize/ModularizeUtilities.cpp
+++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp
@@ -290,7 +290,7 @@ std::error_code ModularizeUtilities::loadModuleMap(
     Target.get(), *HeaderInfo));
 
   // Parse module.modulemap file into module map.
-  if (ModMap->loadModuleMapFile(ModuleMapEntry, false, Dir)) {
+  if (ModMap->parseAndLoadModuleMapFile(ModuleMapEntry, false, Dir)) {
     return std::error_code(1, std::generic_category());
   }
 
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index c3f30e2..f12712f 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -183,18 +183,17 @@ check_include_file(sys/resource.h CLANG_HAVE_RLIMITS)
 # This check requires _GNU_SOURCE on linux
 check_include_file(dlfcn.h CLANG_HAVE_DLFCN_H)
 if( CLANG_HAVE_DLFCN_H )
+  include(CMakePushCheckState)
   include(CheckLibraryExists)
   include(CheckSymbolExists)
   check_library_exists(dl dlopen "" HAVE_LIBDL)
+  cmake_push_check_state()
   if( HAVE_LIBDL )
     list(APPEND CMAKE_REQUIRED_LIBRARIES dl)
   endif()
   list(APPEND CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)
   check_symbol_exists(dladdr dlfcn.h CLANG_HAVE_DLADDR)
-  list(REMOVE_ITEM CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)
-  if( HAVE_LIBDL )
-    list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES dl)
-  endif()
+  cmake_pop_check_state()
 endif()
 
 set(CLANG_RESOURCE_DIR "" CACHE STRING
diff --git a/clang/Maintainers.rst b/clang/Maintainers.rst
index dfe86a0..8dfa169 100644
--- a/clang/Maintainers.rst
+++ b/clang/Maintainers.rst
@@ -51,6 +51,14 @@ Clang LLVM IR generation
 | Anton Korobeynikov
 | anton\@korobeynikov.info (email), asl (Phabricator), asl (GitHub)
 
+Clang MLIR generation
+~~~~~~~~~~~~~~~~~~~~~
+| Andy Kaylor
+| akaylor\@nvidia.com (email), AndyKaylor (Discord), AndyKaylor (GitHub)
+
+| Bruno Cardoso Lopes
+| bruno.cardoso\@gmail.com (email), sonicsprawl (Discord), bcardosolopes (GitHub)
+
 
 Analysis & CFG
 ~~~~~~~~~~~~~~
diff --git a/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake b/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake
new file mode 100644
index 0000000..ecd478a
--- /dev/null
+++ b/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake
@@ -0,0 +1,45 @@
+# This file sets up a CMakeCache for the second stage of a Fuchsia toolchain build.
+
+include(${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2.cmake)
+
+if(NOT APPLE)
+  set(BOOTSTRAP_LLVM_ENABLE_LLD ON CACHE BOOL "")
+endif()
+
+set(CLANG_BOOTSTRAP_TARGETS
+  check-all
+  check-clang
+  check-lld
+  check-llvm
+  check-polly
+  clang
+  clang-test-depends
+  toolchain-distribution
+  install-toolchain-distribution
+  install-toolchain-distribution-stripped
+  install-toolchain-distribution-toolchain
+  lld-test-depends
+  llvm-config
+  llvm-test-depends
+  test-depends
+  test-suite CACHE STRING "")
+
+get_cmake_property(variableNames VARIABLES)
+foreach(variableName ${variableNames})
+  if(variableName MATCHES "^STAGE2_")
+    string(REPLACE "STAGE2_" "" new_name ${variableName})
+    list(APPEND EXTRA_ARGS "-D${new_name}=${${variableName}}")
+  endif()
+endforeach()
+
+set(CLANG_PGO_TRAINING_DEPS
+  builtins
+  runtimes
+  CACHE STRING "")
+
+# Setup the bootstrap build.
+set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "")
+set(CLANG_BOOTSTRAP_CMAKE_ARGS
+  ${EXTRA_ARGS}
+  -C ${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2.cmake
+  CACHE STRING "")
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 8333658..ee1d681 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -126,6 +126,16 @@ else()
   set(LIBCXX_ENABLE_STATIC_ABI_LIBRARY ON CACHE BOOL "")
   set(LIBCXX_HARDENING_MODE "none" CACHE STRING "")
   set(LIBCXX_USE_COMPILER_RT ON CACHE BOOL "")
+  set(COMPILER_RT_BUILD_LIBFUZZER OFF CACHE BOOL "")
+  set(COMPILER_RT_BUILD_PROFILE ON CACHE BOOL "")
+  set(COMPILER_RT_BUILD_SANITIZERS OFF CACHE BOOL "")
+  set(COMPILER_RT_BUILD_XRAY OFF CACHE BOOL "")
+  set(COMPILER_RT_USE_BUILTINS_LIBRARY ON CACHE BOOL "")
+  set(COMPILER_RT_DEFAULT_TARGET_ONLY ON CACHE BOOL "")
+  set(SANITIZER_CXX_ABI "libc++" CACHE STRING "")
+  set(SANITIZER_CXX_ABI_INTREE ON CACHE BOOL "")
+  set(SANITIZER_TEST_CXX "libc++" CACHE STRING "")
+  set(SANITIZER_TEST_CXX_INTREE ON CACHE BOOL "")
   set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "")
   set(RUNTIMES_CMAKE_ARGS "-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13;-DCMAKE_OSX_ARCHITECTURES=arm64|x86_64" CACHE STRING "")
 endif()
@@ -165,33 +175,59 @@ endif()
 set(BOOTSTRAP_LLVM_ENABLE_LLD ON CACHE BOOL "")
 set(BOOTSTRAP_LLVM_ENABLE_LTO ON CACHE BOOL "")
 
-set(_FUCHSIA_BOOTSTRAP_TARGETS
-  check-all
-  check-clang
-  check-lld
-  check-llvm
-  check-polly
-  llvm-config
-  clang-test-depends
-  lld-test-depends
-  llvm-test-depends
-  test-suite
-  test-depends
-  toolchain-distribution
-  install-toolchain-distribution
-  install-toolchain-distribution-stripped
-  install-toolchain-distribution-toolchain
-  clang)
-
-if(FUCHSIA_ENABLE_LLDB)
-  list(APPEND _FUCHSIA_ENABLE_PROJECTS lldb)
-  list(APPEND _FUCHSIA_BOOTSTRAP_TARGETS
-    check-lldb
-    lldb-test-depends
-    debugger-distribution
-    install-debugger-distribution
-    install-debugger-distribution-stripped
-    install-debugger-distribution-toolchain)
+if(FUCHSIA_ENABLE_PGO)
+  set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED ON CACHE BOOL "")
+
+  set(_FUCHSIA_BOOTSTRAP_TARGETS
+    generate-profdata
+    stage2
+    stage2-toolchain-distribution
+    stage2-install-toolchain-distribution
+    stage2-install-toolchain-distribution-stripped
+    stage2-install-toolchain-distribution-toolchain
+    stage2-check-all
+    stage2-check-clang
+    stage2-check-lld
+    stage2-check-llvm
+    stage2-check-polly
+    stage2-test-suite)
+  if(FUCHSIA_ENABLE_LLDB)
+    list(APPEND _FUCHSIA_ENABLE_PROJECTS lldb)
+    list(APPEND _FUCHSIA_BOOTSTRAP_TARGETS
+      stage2-check-lldb
+      stage2-debugger-distribution
+      stage2-install-debugger-distribution
+      stage2-install-debugger-distribution-stripped
+      stage2-install-debugger-distribution-toolchain)
+  endif()
+else()
+ set(_FUCHSIA_BOOTSTRAP_TARGETS
+   check-all
+   check-clang
+   check-lld
+   check-llvm
+   check-polly
+   llvm-config
+   clang
+   clang-test-depends
+   lld-test-depends
+   llvm-test-depends
+   test-suite
+   test-depends
+   toolchain-distribution
+   install-toolchain-distribution
+   install-toolchain-distribution-stripped
+   install-toolchain-distribution-toolchain)
+ if(FUCHSIA_ENABLE_LLDB)
+   list(APPEND _FUCHSIA_ENABLE_PROJECTS lldb)
+   list(APPEND _FUCHSIA_BOOTSTRAP_TARGETS
+     check-lldb
+     lldb-test-depends
+     debugger-distribution
+     install-debugger-distribution
+     install-debugger-distribution-stripped
+     install-debugger-distribution-toolchain)
+ endif()
 endif()
 
 set(LLVM_ENABLE_PROJECTS ${_FUCHSIA_ENABLE_PROJECTS} CACHE STRING "")
@@ -200,6 +236,7 @@ set(CLANG_BOOTSTRAP_TARGETS ${_FUCHSIA_BOOTSTRAP_TARGETS} CACHE STRING "")
 get_cmake_property(variableNames VARIABLES)
 foreach(variableName ${variableNames})
   if(variableName MATCHES "^STAGE2_")
+    list(APPEND EXTRA_ARGS "-D${variableName}=${${variableName}}")
     string(REPLACE "STAGE2_" "" new_name ${variableName})
     string(REPLACE ";" "|" value "${${variableName}}")
     list(APPEND EXTRA_ARGS "-D${new_name}=${value}")
@@ -209,13 +246,23 @@ endforeach()
 # TODO: This is a temporary workaround until we figure out the right solution.
 set(BOOTSTRAP_LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "")
 
+set(LLVM_BUILTIN_TARGETS "default" CACHE STRING "")
+set(LLVM_RUNTIME_TARGETS "default" CACHE STRING "")
+
 # Setup the bootstrap build.
 set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "")
 set(CLANG_BOOTSTRAP_EXTRA_DEPS
   builtins
   runtimes
   CACHE STRING "")
-set(CLANG_BOOTSTRAP_CMAKE_ARGS
-  ${EXTRA_ARGS}
-  -C ${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2.cmake
-  CACHE STRING "")
+if(FUCHSIA_ENABLE_PGO)
+  set(CLANG_BOOTSTRAP_CMAKE_ARGS
+    ${EXTRA_ARGS}
+    -C ${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2-instrumented.cmake
+    CACHE STRING "")
+else()
+  set(CLANG_BOOTSTRAP_CMAKE_ARGS
+    ${EXTRA_ARGS}
+    -C ${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2.cmake
+    CACHE STRING "")
+endif()
diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index b2ac538..dfc2708 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -518,7 +518,7 @@ Predefined Macros
    * - ``__HIPSTDPAR__``
      - Defined when Clang is compiling code in algorithm offload mode, enabled
        with the ``--hipstdpar`` compiler option.
-   * - ``__HIPSTDPAR_INTERPOSE_ALLOC__``
+   * - ``__HIPSTDPAR_INTERPOSE_ALLOC__`` / ``__HIPSTDPAR_INTERPOSE_ALLOC_V1__``
      - Defined only when compiling in algorithm offload mode, when the user
        enables interposition mode with the ``--hipstdpar-interpose-alloc``
        compiler option, indicating that all dynamic memory allocation /
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index ebcad44..f56f2a6 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1859,12 +1859,18 @@ The following type trait primitives are supported by Clang. Those traits marked
 * ``__is_trivially_constructible`` (C++, GNU, Microsoft)
 * ``__is_trivially_copyable`` (C++, GNU, Microsoft)
 * ``__is_trivially_destructible`` (C++, MSVC 2013)
-* ``__is_trivially_relocatable`` (Clang): Returns true if moving an object
+* ``__is_trivially_relocatable`` (Clang) (Deprecated,
+  use ``__builtin_is_cpp_trivially_relocatable`` instead).
+  Returns true if moving an object
   of the given type, and then destroying the source object, is known to be
   functionally equivalent to copying the underlying bytes and then dropping the
   source object on the floor. This is true of trivial types,
   C++26 relocatable types, and types which
   were made trivially relocatable via the ``clang::trivial_abi`` attribute.
+  This trait is deprecated and should be replaced by
+  ``__builtin_is_cpp_trivially_relocatable``. Note however that it is generally
+  unsafe to relocate a C++-relocatable type with ``memcpy`` or ``memmove``;
+  use ``__builtin_trivially_relocate``.
 * ``__builtin_is_cpp_trivially_relocatable`` (C++): Returns true if an object
   is trivially relocatable, as defined by the C++26 standard [meta.unary.prop].
   Note that when relocating the caller code should ensure that if the object is polymorphic,
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f40ebe9..4c25d6d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -434,9 +434,9 @@ Improvements to Clang's diagnostics
 - The ``-Wsign-compare`` warning now treats expressions with bitwise not(~) and minus(-) as signed integers
   except for the case where the operand is an unsigned integer
   and throws warning if they are compared with unsigned integers (##18878).
-- The ``-Wunnecessary-virtual-specifier`` warning has been added to warn about
-  methods which are marked as virtual inside a ``final`` class, and hence can
-  never be overridden.
+- The ``-Wunnecessary-virtual-specifier`` warning (included in ``-Wextra``) has
+  been added to warn about methods which are marked as virtual inside a
+  ``final`` class, and hence can never be overridden.
 
 - Improve the diagnostics for chained comparisons to report actual expressions and operators (#GH129069).
 
@@ -505,6 +505,9 @@ Improvements to Clang's diagnostics
 - ``-Wreserved-identifier`` now fires on reserved parameter names in a function
   declaration which is not a definition.
 
+- Several compatibility diagnostics that were incorrectly being grouped under
+  ``-Wpre-c++20-compat`` are now part of ``-Wc++20-compat``. (#GH138775)
+
 Improvements to Clang's time-trace
 ----------------------------------
 
@@ -574,6 +577,15 @@ Bug Fixes to Compiler Builtins
 - ``__has_unique_object_representations(Incomplete[])`` is no longer accepted, per
   `LWG4113 <https://cplusplus.github.io/LWG/issue4113>`_.
 
+- ``__builtin_is_cpp_trivially_relocatable``, ``__builtin_is_replaceable`` and
+  ``__builtin_trivially_relocate`` have been added to support standard C++26 relocation.
+
+- ``__is_trivially_relocatable`` has been deprecated, and uses should be replaced by
+  ``__builtin_is_cpp_trivially_relocatable``.
+  Note that, it is generally unsafe to ``memcpy`` non-trivially copyable types that
+  are ``__builtin_is_cpp_trivially_relocatable``. It is recommended to use
+  ``__builtin_trivially_relocate`` instead.
+
 Bug Fixes to Attribute Support
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  - Fixed crash when a parameter to the ``clang::annotate`` attribute evaluates to ``void``. See #GH119125
@@ -658,6 +670,10 @@ Bug Fixes to C++ Support
   referred to a reference to an incomplete type. (#GH129397)
 - Fixed a crash when a cast involved a parenthesized aggregate initialization in dependent context. (#GH72880)
 - Fixed a crash when forming an invalid function type in a dependent context. (#GH138657) (#GH115725) (#GH68852)
+- No longer crashes when instantiating invalid variable template specialization
+  whose type depends on itself. (#GH51347), (#GH55872)
+- Improved parser recovery of invalid requirement expressions. In turn, this
+  fixes crashes from follow-on processing of the invalid requirement. (#GH138820)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/docs/SanitizerCoverage.rst b/clang/docs/SanitizerCoverage.rst
index f952198..23720e5 100644
--- a/clang/docs/SanitizerCoverage.rst
+++ b/clang/docs/SanitizerCoverage.rst
@@ -385,6 +385,50 @@ Users need to implement a single function to capture the CF table at startup:
     // the collected control flow.
   }
 
+Tracing Stack Depth
+===================
+
+With ``-fsanitize-coverage=stack-depth`` the compiler will track how much
+stack space has been used for a function call chain. Leaf functions are
+not included in this tracing.
+
+The maximum depth of a function call graph is stored in the thread-local
+``__sancov_lowest_stack`` variable. Instrumentation is inserted in every
+non-leaf function to check the frame pointer against this variable,
+and if it is lower, store the current frame pointer. This effectively
+inserts the following:
+
+.. code-block:: c++
+
+  extern thread_local uintptr_t __sancov_lowest_stack;
+
+  uintptr_t stack = (uintptr_t)__builtin_frame_address(0);
+  if (stack < __sancov_lowest_stack)
+    __sancov_lowest_stack = stack;
+
+If ``-fsanitize-coverage-stack-depth-callback-min=N`` (where
+``N > 0``) is also used, the tracking is delegated to a callback,
+``__sanitizer_cov_stack_depth``, instead of adding instrumentation to
+update ``__sancov_lowest_stack``. The ``N`` of the argument is used
+to determine which functions to instrument. Only functions estimated
+to be using ``N`` bytes or more of stack space will be instrumented to
+call the tracing callback. In the case of a dynamically sized stack,
+the callback is unconditionally added.
+
+The callback takes no arguments and is responsible for determining
+the stack usage and doing any needed comparisons and storage. A roughly
+equivalent implementation of ``__sancov_lowest_stack`` using the callback
+would look like this:
+
+.. code-block:: c++
+
+  void __sanitizer_cov_stack_depth(void) {
+    uintptr_t stack = (uintptr_t)__builtin_frame_address(0);
+
+    if (stack < __sancov_lowest_stack)
+      __sancov_lowest_stack = stack;
+  }
+
 Gated Trace Callbacks
 =====================
 
diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h
index 9c01978..0f2e496 100644
--- a/clang/include/clang/APINotes/Types.h
+++ b/clang/include/clang/APINotes/Types.h
@@ -737,6 +737,7 @@ public:
   std::optional<std::string> SwiftImportAs;
   std::optional<std::string> SwiftRetainOp;
   std::optional<std::string> SwiftReleaseOp;
+  std::optional<std::string> SwiftDefaultOwnership;
 
   /// The Swift protocol that this type should be automatically conformed to.
   std::optional<std::string> SwiftConformance;
@@ -786,6 +787,8 @@ public:
       SwiftRetainOp = RHS.SwiftRetainOp;
     if (!SwiftReleaseOp)
       SwiftReleaseOp = RHS.SwiftReleaseOp;
+    if (!SwiftDefaultOwnership)
+      SwiftDefaultOwnership = RHS.SwiftDefaultOwnership;
 
     if (!SwiftConformance)
       SwiftConformance = RHS.SwiftConformance;
@@ -815,6 +818,7 @@ inline bool operator==(const TagInfo &LHS, const TagInfo &RHS) {
          LHS.SwiftImportAs == RHS.SwiftImportAs &&
          LHS.SwiftRetainOp == RHS.SwiftRetainOp &&
          LHS.SwiftReleaseOp == RHS.SwiftReleaseOp &&
+         LHS.SwiftDefaultOwnership == RHS.SwiftDefaultOwnership &&
          LHS.SwiftConformance == RHS.SwiftConformance &&
          LHS.isFlagEnum() == RHS.isFlagEnum() &&
          LHS.isSwiftCopyable() == RHS.isSwiftCopyable() &&
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 8bc2406..e107db4 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2970,6 +2970,11 @@ public:
   TemplateTemplateParmDecl *insertCanonicalTemplateTemplateParmDeclInternal(
       TemplateTemplateParmDecl *CanonTTP) const;
 
+  /// Determine whether the given template arguments \p Arg1 and \p Arg2 are
+  /// equivalent.
+  bool isSameTemplateArgument(const TemplateArgument &Arg1,
+                              const TemplateArgument &Arg2) const;
+
   /// Type Query functions.  If the type is an instance of the specified class,
   /// return the Type pointer for the underlying maximally pretty type.  This
   /// is a member of ASTContext because this may need to do some amount of
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 9279720..452b1e3 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -305,6 +305,7 @@ CODEGENOPT(SanitizeCoveragePCTable, 1, 0) ///< Create a PC Table.
 CODEGENOPT(SanitizeCoverageControlFlow, 1, 0) ///< Collect control flow
 CODEGENOPT(SanitizeCoverageNoPrune, 1, 0) ///< Disable coverage pruning.
 CODEGENOPT(SanitizeCoverageStackDepth, 1, 0) ///< Enable max stack depth tracing
+VALUE_CODEGENOPT(SanitizeCoverageStackDepthCallbackMin , 32, 0) ///< Enable stack depth tracing callbacks.
 CODEGENOPT(SanitizeCoverageTraceLoads, 1, 0) ///< Enable tracing of loads.
 CODEGENOPT(SanitizeCoverageTraceStores, 1, 0) ///< Enable tracing of stores.
 CODEGENOPT(SanitizeBinaryMetadataCovered, 1, 0) ///< Emit PCs for covered functions.
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index e39a73b..e3fa6a5 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -399,6 +399,10 @@ public:
   /// (0.0 [default] to skip none, 1.0 to skip all).
   SanitizerMaskCutoffs SanitizeSkipHotCutoffs;
 
+  /// Set of sanitizer checks, for which the instrumentation will be annotated
+  /// with extra debug info.
+  SanitizerSet SanitizeAnnotateDebugInfo;
+
   /// List of backend command-line options for -fembed-bitcode.
   std::vector<uint8_t> CmdArgs;
 
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index f26c906..e4d94fe 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -130,9 +130,11 @@ def err_attribute_not_type_attr : Error<
   "%0%select{ attribute|}1 cannot be applied to types">;
 def err_enum_template : Error<"enumeration cannot be a template">;
 
-def warn_cxx20_compat_consteval : Warning<
-  "'consteval' specifier is incompatible with C++ standards before C++20">,
-  InGroup<CXX20Compat>, DefaultIgnore;
+def warn_cxx20_compat_consteval
+    : Warning<"'consteval' specifier is incompatible with C++ standards before "
+              "C++20">,
+      InGroup<CXXPre20Compat>,
+      DefaultIgnore;
 def warn_missing_type_specifier : Warning<
   "type specifier missing, defaults to 'int'">,
   InGroup<ImplicitInt>, DefaultIgnore;
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 1faf850..7b0dcde 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -421,13 +421,12 @@ def CXX11WarnSuggestOverride : DiagGroup<"suggest-override">;
 def WarnUnnecessaryVirtualSpecifier : DiagGroup<"unnecessary-virtual-specifier"> {
   code Documentation = [{
 Warns when a ``final`` class contains a virtual method (including virtual
-destructors). Since ``final`` classes cannot be subclassed, their methods
-cannot be overridden, and hence the ``virtual`` specifier is useless.
+destructors) that does not override anything. Since ``final`` classes cannot
+be subclassed, their methods cannot be overridden, so there is no point to
+introducing new ``virtual`` methods.
 
 The warning also detects virtual methods in classes whose destructor is
 ``final``, for the same reason.
-
-The warning does not fire on virtual methods which are also marked ``override``.
   }];
 }
 
@@ -621,6 +620,7 @@ def ModuleImport : DiagGroup<"module-import">;
 def ModuleConflict : DiagGroup<"module-conflict">;
 def ModuleFileExtension : DiagGroup<"module-file-extension">;
 def ModuleIncludeDirectiveTranslation : DiagGroup<"module-include-translation">;
+def ModuleMap : DiagGroup<"module-map">;
 def RoundTripCC1Args : DiagGroup<"round-trip-cc1-args">;
 def NewlineEOF : DiagGroup<"newline-eof">;
 def Nullability : DiagGroup<"nullability">;
@@ -1163,6 +1163,7 @@ def Extra : DiagGroup<"extra", [
     FUseLdPath,
     CastFunctionTypeMismatch,
     InitStringTooLongMissingNonString,
+    WarnUnnecessaryVirtualSpecifier,
   ]>;
 
 def Most : DiagGroup<"most", [
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index a3c3210..723f5d4 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -841,6 +841,12 @@ def warn_pp_date_time : Warning<
   ShowInSystemHeader, DefaultIgnore, InGroup<DiagGroup<"date-time">>;
 
 // Module map parsing
+def remark_mmap_parse : Remark<
+  "parsing modulemap '%0'">, ShowInSystemHeader, InGroup<ModuleMap>;
+def remark_mmap_load : Remark<
+  "loading modulemap '%0'">, ShowInSystemHeader, InGroup<ModuleMap>;
+def remark_mmap_load_module : Remark<
+  "loading parsed module '%0'">, ShowInSystemHeader, InGroup<ModuleMap>;
 def err_mmap_unknown_token : Error<"skipping stray token">;
 def err_mmap_expected_module : Error<"expected module declaration">;
 def err_mmap_expected_module_name : Error<"expected module name">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index be76863..e1b9ed0 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -51,6 +51,8 @@ defm adl_only_template_id : CXX20Compat<
   "with explicit template arguments is">;
 defm ctad_for_alias_templates
     : CXX20Compat<"class template argument deduction for alias templates is">;
+defm implicit_typename
+    : CXX20Compat<"missing 'typename' prior to dependent type name %0 is">;
 
 // C++23 compatibility with C++20 and earlier.
 defm constexpr_static_var : CXX23Compat<
@@ -2526,9 +2528,14 @@ def note_implicit_deduction_guide : Note<"implicit deduction guide declared as '
 def warn_cxx98_compat_auto_type_specifier : Warning<
   "'auto' type specifier is incompatible with C++98">,
   InGroup<CXX98Compat>, DefaultIgnore;
-def err_auto_variable_cannot_appear_in_own_initializer : Error<
-  "variable %0 declared with deduced type %1 "
-  "cannot appear in its own initializer">;
+def err_auto_variable_cannot_appear_in_own_initializer
+    : Error<
+          "%enum_select<ParsingInitFor>{%Var{variable}|"
+          "%VarTemplate{variable template}|"
+          "%VarTemplatePartialSpec{variable template partial specialization}|"
+          "%VarTemplateExplicitSpec{variable template explicit "
+          "specialization}}0 %1 "
+          "declared with deduced type %2 cannot appear in its own initializer">;
 def err_binding_cannot_appear_in_own_initializer : Error<
   "binding %0 cannot appear in the initializer of its own "
   "decomposition declaration">;
@@ -5302,6 +5309,9 @@ def err_template_member_noparams : Error<
   "extraneous 'template<>' in declaration of member %0">;
 def err_template_tag_noparams : Error<
   "extraneous 'template<>' in declaration of %0 %1">;
+def err_var_template_spec_type_depends_on_self : Error<
+  "the type of variable template specialization %0 declared with deduced type "
+  "%1 depends on itself">;
 
 def warn_unqualified_call_to_std_cast_function : Warning<
   "unqualified call to '%0'">, InGroup<DiagGroup<"unqualified-std-cast-call">>;
@@ -5859,16 +5869,8 @@ def ext_typename_missing
 def err_typename_refers_to_using_value_decl : Error<
   "typename specifier refers to a dependent using declaration for a value "
   "%0 in %1">;
-def note_using_value_decl_missing_typename : Note<
-  "add 'typename' to treat this using declaration as a type">;
-def warn_cxx17_compat_implicit_typename : Warning<"use of implicit 'typename' is "
-  "incompatible with C++ standards before C++20">, InGroup<CXX20Compat>,
-  DefaultIgnore;
-def ext_implicit_typename
-    : ExtWarn<"missing 'typename' prior to dependent "
-              "type name %0; implicit 'typename' is a C++20 extension">,
-      InGroup<CXX20>;
-
+def note_using_value_decl_missing_typename
+    : Note<"add 'typename' to treat this using declaration as a type">;
 def err_template_kw_refers_to_non_template : Error<
   "%0%select{| following the 'template' keyword}1 "
   "does not refer to a template">;
@@ -9564,9 +9566,11 @@ def err_incomplete_type_used_in_type_trait_expr : Error<
   "incomplete type %0 used in type trait expression">, NoSFINAE;
 
 // C++20 constinit and require_constant_initialization attribute
-def warn_cxx20_compat_constinit : Warning<
-  "'constinit' specifier is incompatible with C++ standards before C++20">,
-  InGroup<CXX20Compat>, DefaultIgnore;
+def warn_cxx20_compat_constinit
+    : Warning<"'constinit' specifier is incompatible with C++ standards before "
+              "C++20">,
+      InGroup<CXXPre20Compat>,
+      DefaultIgnore;
 def err_constinit_local_variable : Error<
   "local variable cannot be declared 'constinit'">;
 def err_require_constant_init_failed : Error<
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 9bc6368..94e72fe 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -544,7 +544,6 @@ TYPE_TRAIT_2(__is_pointer_interconvertible_base_of, IsPointerInterconvertibleBas
 #include "clang/Basic/TransformTypeTraits.def"
 
 // Clang-only C++ Type Traits
-TYPE_TRAIT_1(__is_trivially_relocatable, IsTriviallyRelocatable, KEYCXX)
 TYPE_TRAIT_1(__is_trivially_equality_comparable, IsTriviallyEqualityComparable, KEYCXX)
 TYPE_TRAIT_1(__is_bounded_array, IsBoundedArray, KEYCXX)
 TYPE_TRAIT_1(__is_unbounded_array, IsUnboundedArray, KEYCXX)
@@ -556,8 +555,11 @@ TYPE_TRAIT_2(__reference_converts_from_temporary, ReferenceConvertsFromTemporary
 // IsDeducible is only used internally by clang for CTAD implementation and
 // is not exposed to users.
 TYPE_TRAIT_2(/*EmptySpellingName*/, IsDeducible, KEYCXX)
-TYPE_TRAIT_1(__is_bitwise_cloneable, IsBitwiseCloneable, KEYALL)
+
+// __is_trivially_relocatable is deprecated
 TYPE_TRAIT_1(__builtin_is_cpp_trivially_relocatable, IsCppTriviallyRelocatable, KEYCXX)
+TYPE_TRAIT_1(__is_trivially_relocatable, IsTriviallyRelocatable, KEYCXX)
+TYPE_TRAIT_1(__is_bitwise_cloneable, IsBitwiseCloneable, KEYALL)
 TYPE_TRAIT_1(__builtin_is_replaceable, IsReplaceable, KEYCXX)
 TYPE_TRAIT_1(__builtin_structured_binding_size, StructuredBindingSize, KEYCXX)
 
diff --git a/clang/include/clang/CIR/CIRToCIRPasses.h b/clang/include/clang/CIR/CIRToCIRPasses.h
index 361ebb9..4a23790 100644
--- a/clang/include/clang/CIR/CIRToCIRPasses.h
+++ b/clang/include/clang/CIR/CIRToCIRPasses.h
@@ -32,7 +32,8 @@ namespace cir {
 mlir::LogicalResult runCIRToCIRPasses(mlir::ModuleOp theModule,
                                       mlir::MLIRContext &mlirCtx,
                                       clang::ASTContext &astCtx,
-                                      bool enableVerifier);
+                                      bool enableVerifier,
+                                      bool enableCIRSimplify);
 
 } // namespace cir
 
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index d96a139..a63bf4f 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -111,7 +111,7 @@ public:
   cir::BoolType getBoolTy() { return cir::BoolType::get(getContext()); }
 
   cir::PointerType getPointerTo(mlir::Type ty) {
-    return cir::PointerType::get(getContext(), ty);
+    return cir::PointerType::get(ty);
   }
 
   cir::PointerType getVoidPtrTy() {
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
index 73759cf..52e32ee 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
@@ -27,6 +27,13 @@ def CIR_Dialect : Dialect {
   let useDefaultAttributePrinterParser = 0;
   let useDefaultTypePrinterParser = 0;
 
+  // Enable constant materialization for the CIR dialect. This generates a
+  // declaration for the cir::CIRDialect::materializeConstant function. This
+  // hook is necessary for canonicalization to properly handle attributes
+  // returned by fold methods, allowing them to be materialized as constant
+  // operations in the IR.
+  let hasConstantMaterializer = 1;
+
   let extraClassDeclaration = [{
     static llvm::StringRef getTripleAttrName() { return "cir.triple"; }
 
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 422c89c..7ffa104 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -384,8 +384,6 @@ def AllocaOp : CIR_Op<"alloca", [
     `]`
     ($annotations^)? attr-dict
   }];
-
-  let hasVerifier = 0;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1464,6 +1462,8 @@ def SelectOp : CIR_Op<"select", [Pure,
       qualified(type($false_value))
     `)` `->` qualified(type($result)) attr-dict
   }];
+
+  let hasFolder = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1512,9 +1512,6 @@ def TernaryOp : CIR_Op<"ternary",
       >
   ];
 
-  // All constraints already verified elsewhere.
-  let hasVerifier = 0;
-
   let assemblyFormat = [{
     `(` $cond `,`
       `true` $trueRegion `,`
@@ -1652,9 +1649,6 @@ def GetGlobalOp : CIR_Op<"get_global",
   let assemblyFormat = [{
     $name `:` qualified(type($addr)) attr-dict
   }];
-
-  // `GetGlobalOp` is fully verified by its traits.
-  let hasVerifier = 0;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1837,7 +1831,6 @@ class CIR_CallOpBase<string mnemonic, list<Trait> extra_traits = []>
 
   let hasCustomAssemblyFormat = 1;
   let skipDefaultBuilders = 1;
-  let hasVerifier = 0;
 
   // TODO(cir): for now cir.call is just a tiny shell of what it will become.
   // More attributes, arguments, and properties will be added in the future as
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
index 10e5d15..00f67e2 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
@@ -141,4 +141,37 @@ def CIR_AnyIntOrFloatType : AnyTypeOf<[CIR_AnyFloatType, CIR_AnyIntType],
     let cppFunctionName = "isAnyIntegerOrFloatingPointType";
 }
 
+//===----------------------------------------------------------------------===//
+// Pointer Type predicates
+//===----------------------------------------------------------------------===//
+
+def CIR_AnyPtrType : CIR_TypeBase<"::cir::PointerType", "pointer type">;
+
+// Pointer to type constraint bases
+class CIR_IsPtrToPred<code type> : CPred<"$_self.isPtrTo<" # type # ">()">;
+
+class CIR_PtrTo<code type, string summary>
+    : CIR_ConfinedType<CIR_AnyPtrType, [CIR_IsPtrToPred<type>],
+        "pointer to " # summary>;
+
+// Pointer to pointer constraint bases
+class CIR_IsPtrToPtrToPred<code type>
+    : CPred<"$_self.isPtrToPtrTo<" # type # ">()">;
+
+class CIR_PtrToPtrTo<code type, string summary>
+    : CIR_ConfinedType<CIR_AnyPtrType, [CIR_IsPtrToPtrToPred<type>],
+        "pointer to pointer to " # summary>;
+
+// Void pointer type constraints
+def CIR_VoidPtrType
+    : CIR_PtrTo<"::cir::VoidType", "void type">,
+      BuildableType<"$_builder.getType<" # cppType # ">("
+        "cir::VoidType::get($_builder.getContext()))">;
+
+def CIR_PtrToVoidPtrType
+    : CIR_PtrToPtrTo<"::cir::VoidType", "void type">,
+      BuildableType<"$_builder.getType<" # cppType # ">("
+        "$_builder.getType<" # cppType # ">("
+        "cir::VoidType::get($_builder.getContext())))">;
+
 #endif // CLANG_CIR_DIALECT_IR_CIRTYPECONSTRAINTS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
index 959e2cd..26f1122 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
@@ -197,8 +197,30 @@ def CIR_PointerType : CIR_Type<"Pointer", "ptr",
   let skipDefaultBuilders = 1;
 
   let extraClassDeclaration = [{
+    template <typename ...Types>
+    bool isPtrTo() const {
+      return mlir::isa< Types... >(getPointee());
+    }
+
     bool isVoidPtr() const {
-      return mlir::isa<cir::VoidType>(getPointee());
+      return isPtrTo<cir::VoidType>();
+    }
+
+    template <typename ...Types>
+    bool isPtrToPtrTo() const {
+      if (auto ptrType = mlir::dyn_cast<cir::PointerType>(getPointee()))
+        return ptrType.isPtrTo<Types...>();
+      return false;
+    }
+
+    bool isPtrTo(mlir::Type type) const {
+      return getPointee() == type;
+    }
+
+    bool isPtrToPtrTo(mlir::Type type) const {
+      if (auto ptrType = mlir::dyn_cast<cir::PointerType>(getPointee()))
+        return ptrType.isPtrTo(type);
+      return false;
     }
   }];
 }
@@ -368,20 +390,6 @@ def CIR_VoidType : CIR_Type<"Void", "void"> {
   }];
 }
 
-// Constraints
-
-// Pointer to void
-def VoidPtr : Type<
-    And<[
-      CPred<"::mlir::isa<::cir::PointerType>($_self)">,
-      CPred<"::mlir::isa<::cir::VoidType>("
-            "::mlir::cast<::cir::PointerType>($_self).getPointee())">,
-    ]>, "void*">,
-    BuildableType<
-      "cir::PointerType::get($_builder.getContext(),"
-      "cir::VoidType::get($_builder.getContext()))"> {
-}
-
 //===----------------------------------------------------------------------===//
 // RecordType
 //
diff --git a/clang/include/clang/CIR/Dialect/Passes.h b/clang/include/clang/CIR/Dialect/Passes.h
index 133eb46..dbecf81 100644
--- a/clang/include/clang/CIR/Dialect/Passes.h
+++ b/clang/include/clang/CIR/Dialect/Passes.h
@@ -22,6 +22,7 @@ namespace mlir {
 
 std::unique_ptr<Pass> createCIRCanonicalizePass();
 std::unique_ptr<Pass> createCIRFlattenCFGPass();
+std::unique_ptr<Pass> createCIRSimplifyPass();
 std::unique_ptr<Pass> createHoistAllocasPass();
 
 void populateCIRPreLoweringPasses(mlir::OpPassManager &pm);
diff --git a/clang/include/clang/CIR/Dialect/Passes.td b/clang/include/clang/CIR/Dialect/Passes.td
index 74c2558..de775e69 100644
--- a/clang/include/clang/CIR/Dialect/Passes.td
+++ b/clang/include/clang/CIR/Dialect/Passes.td
@@ -29,6 +29,25 @@ def CIRCanonicalize : Pass<"cir-canonicalize"> {
   let dependentDialects = ["cir::CIRDialect"];
 }
 
+def CIRSimplify : Pass<"cir-simplify"> {
+  let summary = "Performs CIR simplification and code optimization";
+  let description = [{
+    The pass performs semantics-preserving code simplifications and optimizations
+    on CIR while maintaining strict program correctness. 
+    
+    Unlike the `cir-canonicalize` pass, these transformations may reduce the IR's
+    structural similarity to the original source code as a trade-off for improved
+    code quality. This can affect debugging fidelity by altering intermediate
+    representations of folded expressions, hoisted operations, and other 
+    optimized constructs.
+    
+    Example transformations include ternary expression folding and code hoisting
+    while preserving program semantics.
+  }];
+  let constructor = "mlir::createCIRSimplifyPass()";
+  let dependentDialects = ["cir::CIRDialect"];
+}
+
 def HoistAllocas : Pass<"cir-hoist-allocas"> {
   let summary = "Hoist allocas to the entry of the function";
   let description = [{
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index eb75a07..fb4d8b1 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -111,6 +111,10 @@ struct MissingFeatures {
   // Unary operator handling
   static bool opUnaryPromotionType() { return false; }
 
+  // SwitchOp handling
+  static bool foldCascadingCases() { return false; }
+  static bool foldRangeCase() { return false; }
+
   // Clang early optimizations or things defered to LLVM lowering.
   static bool mayHaveIntegerOverflow() { return false; }
   static bool shouldReverseUnaryCondOnBoolExpr() { return false; }
@@ -176,7 +180,6 @@ struct MissingFeatures {
   static bool targetSpecificCXXABI() { return false; }
   static bool moduleNameHash() { return false; }
   static bool setDSOLocal() { return false; }
-  static bool foldCaseStmt() { return false; }
   static bool constantFoldSwitchStatement() { return false; }
   static bool cudaSupport() { return false; }
   static bool maybeHandleStaticInExternC() { return false; }
@@ -206,7 +209,6 @@ struct MissingFeatures {
   static bool labelOp() { return false; }
   static bool ptrDiffOp() { return false; }
   static bool ptrStrideOp() { return false; }
-  static bool selectOp() { return false; }
   static bool switchOp() { return false; }
   static bool ternaryOp() { return false; }
   static bool tryOp() { return false; }
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1167762..351e1ad 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2365,6 +2365,13 @@ def fsanitize_coverage_ignorelist : Joined<["-"], "fsanitize-coverage-ignorelist
     HelpText<"Disable sanitizer coverage instrumentation for modules and functions "
              "that match the provided special case list, even the allowed ones">,
     MarshallingInfoStringVector<CodeGenOpts<"SanitizeCoverageIgnorelistFiles">>;
+def fsanitize_coverage_stack_depth_callback_min_EQ
+    : Joined<["-"], "fsanitize-coverage-stack-depth-callback-min=">,
+      Group<f_clang_Group>,
+      MetaVarName<"<M>">,
+      HelpText<"Use callback for max stack depth tracing with minimum stack "
+               "depth M">,
+      MarshallingInfoInt<CodeGenOpts<"SanitizeCoverageStackDepthCallbackMin">>;
 def fexperimental_sanitize_metadata_EQ : CommaJoined<["-"], "fexperimental-sanitize-metadata=">,
   Group<f_Group>,
   HelpText<"Specify the type of metadata to emit for binary analysis sanitizers">;
@@ -2533,6 +2540,31 @@ def fno_sanitize_merge_handlers : Flag<["-"], "fno-sanitize-merge">, Group<f_cla
                         Alias<fno_sanitize_merge_handlers_EQ>, AliasArgs<["all"]>,
                         Visibility<[ClangOption, CLOption]>,
                         HelpText<"Do not allow compiler to merge handlers for any sanitizers">;
+def fsanitize_annotate_debug_info_EQ
+    : CommaJoined<["-"], "fsanitize-annotate-debug-info=">,
+      Group<f_clang_Group>,
+      HelpText<"Annotate sanitizer instrumentation with extra debug info for "
+               "the specified sanitizers, if supported">;
+def fno_sanitize_annotate_debug_info_EQ
+    : CommaJoined<["-"], "fno-sanitize-annotate-debug-info=">,
+      Group<f_clang_Group>,
+      HelpText<"Do not allow compiler to annotate sanitizer instrumentation "
+               "with extra debug info for the specified sanitizers">;
+def fsanitize_annotate_debug_info
+    : Flag<["-"], "fsanitize-annotate-debug-info">,
+      Group<f_clang_Group>,
+      Alias<fsanitize_annotate_debug_info_EQ>,
+      AliasArgs<["all"]>,
+      HelpText<"Allow compiler to annotate sanitizer instrumentation with "
+               "extra debug info for all sanitizers, where supported">;
+def fno_sanitize_annotate_debug_info
+    : Flag<["-"], "fno-sanitize-annotate-debug-info">,
+      Group<f_clang_Group>,
+      Alias<fno_sanitize_annotate_debug_info_EQ>,
+      AliasArgs<["all"]>,
+      Visibility<[ClangOption, CLOption]>,
+      HelpText<"Do not allow compiler to annotate sanitizer instrumentation "
+               "with extra debug info for any sanitizers">;
 def fsanitize_undefined_trap_on_error
     : Flag<["-"], "fsanitize-undefined-trap-on-error">, Group<f_clang_Group>,
       Alias<fsanitize_trap_EQ>, AliasArgs<["undefined"]>;
diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 528e3b40..2b72268 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -27,6 +27,7 @@ class SanitizerArgs {
   SanitizerSet TrapSanitizers;
   SanitizerSet MergeHandlers;
   SanitizerMaskCutoffs SkipHotCutoffs;
+  SanitizerSet AnnotateDebugInfo;
 
   std::vector<std::string> UserIgnorelistFiles;
   std::vector<std::string> SystemIgnorelistFiles;
@@ -34,6 +35,7 @@ class SanitizerArgs {
   std::vector<std::string> CoverageIgnorelistFiles;
   std::vector<std::string> BinaryMetadataIgnorelistFiles;
   int CoverageFeatures = 0;
+  int CoverageStackDepthCallbackMin = 0;
   int BinaryMetadataFeatures = 0;
   int OverflowPatternExclusions = 0;
   int MsanTrackOrigins = 0;
diff --git a/clang/include/clang/Lex/HeaderSearch.h b/clang/include/clang/Lex/HeaderSearch.h
index bccec4d..2e0c8be 100644
--- a/clang/include/clang/Lex/HeaderSearch.h
+++ b/clang/include/clang/Lex/HeaderSearch.h
@@ -332,13 +332,27 @@ class HeaderSearch {
   /// The mapping between modules and headers.
   mutable ModuleMap ModMap;
 
+  struct ModuleMapDirectoryState {
+    OptionalFileEntryRef ModuleMapFile;
+    enum {
+      Parsed,
+      Loaded,
+      Invalid,
+    } Status;
+  };
+
   /// Describes whether a given directory has a module map in it.
-  llvm::DenseMap<const DirectoryEntry *, bool> DirectoryHasModuleMap;
+  llvm::DenseMap<const DirectoryEntry *, ModuleMapDirectoryState>
+      DirectoryModuleMap;
 
   /// Set of module map files we've already loaded, and a flag indicating
   /// whether they were valid or not.
   llvm::DenseMap<const FileEntry *, bool> LoadedModuleMaps;
 
+  /// Set of module map files we've already parsed, and a flag indicating
+  /// whether they were valid or not.
+  llvm::DenseMap<const FileEntry *, bool> ParsedModuleMaps;
+
   // A map of discovered headers with their associated include file name.
   llvm::DenseMap<const FileEntry *, llvm::SmallString<64>> IncludeNames;
 
@@ -433,11 +447,6 @@ public:
   /// Retrieve the path to the module cache.
   StringRef getModuleCachePath() const { return ModuleCachePath; }
 
-  /// Consider modules when including files from this directory.
-  void setDirectoryHasModuleMap(const DirectoryEntry* Dir) {
-    DirectoryHasModuleMap[Dir] = true;
-  }
-
   /// Forget everything we know about headers so far.
   void ClearFileInfo() {
     FileInfo.clear();
@@ -713,9 +722,10 @@ public:
   ///        used to resolve paths within the module (this is required when
   ///        building the module from preprocessed source).
   /// \returns true if an error occurred, false otherwise.
-  bool loadModuleMapFile(FileEntryRef File, bool IsSystem, FileID ID = FileID(),
-                         unsigned *Offset = nullptr,
-                         StringRef OriginalModuleMapFile = StringRef());
+  bool parseAndLoadModuleMapFile(FileEntryRef File, bool IsSystem,
+                                 FileID ID = FileID(),
+                                 unsigned *Offset = nullptr,
+                                 StringRef OriginalModuleMapFile = StringRef());
 
   /// Collect the set of all known, top-level modules.
   ///
@@ -915,26 +925,31 @@ public:
   size_t getTotalMemory() const;
 
 private:
-  /// Describes what happened when we tried to load a module map file.
-  enum LoadModuleMapResult {
-    /// The module map file had already been loaded.
-    LMM_AlreadyLoaded,
+  /// Describes what happened when we tried to load or parse a module map file.
+  enum ModuleMapResult {
+    /// The module map file had already been processed.
+    MMR_AlreadyProcessed,
 
-    /// The module map file was loaded by this invocation.
-    LMM_NewlyLoaded,
+    /// The module map file was processed by this invocation.
+    MMR_NewlyProcessed,
 
     /// There is was directory with the given name.
-    LMM_NoDirectory,
+    MMR_NoDirectory,
 
     /// There was either no module map file or the module map file was
     /// invalid.
-    LMM_InvalidModuleMap
+    MMR_InvalidModuleMap
   };
 
-  LoadModuleMapResult loadModuleMapFileImpl(FileEntryRef File, bool IsSystem,
-                                            DirectoryEntryRef Dir,
-                                            FileID ID = FileID(),
-                                            unsigned *Offset = nullptr);
+  ModuleMapResult parseAndLoadModuleMapFileImpl(FileEntryRef File,
+                                                bool IsSystem,
+                                                DirectoryEntryRef Dir,
+                                                FileID ID = FileID(),
+                                                unsigned *Offset = nullptr);
+
+  ModuleMapResult parseModuleMapFileImpl(FileEntryRef File, bool IsSystem,
+                                         DirectoryEntryRef Dir,
+                                         FileID ID = FileID());
 
   /// Try to load the module map file in the given directory.
   ///
@@ -945,8 +960,8 @@ private:
   ///
   /// \returns The result of attempting to load the module map file from the
   /// named directory.
-  LoadModuleMapResult loadModuleMapFile(StringRef DirName, bool IsSystem,
-                                        bool IsFramework);
+  ModuleMapResult parseAndLoadModuleMapFile(StringRef DirName, bool IsSystem,
+                                            bool IsFramework);
 
   /// Try to load the module map file in the given directory.
   ///
@@ -956,8 +971,13 @@ private:
   ///
   /// \returns The result of attempting to load the module map file from the
   /// named directory.
-  LoadModuleMapResult loadModuleMapFile(DirectoryEntryRef Dir, bool IsSystem,
-                                        bool IsFramework);
+  ModuleMapResult parseAndLoadModuleMapFile(DirectoryEntryRef Dir,
+                                            bool IsSystem, bool IsFramework);
+
+  ModuleMapResult parseModuleMapFile(StringRef DirName, bool IsSystem,
+                                     bool IsFramework);
+  ModuleMapResult parseModuleMapFile(DirectoryEntryRef Dir, bool IsSystem,
+                                     bool IsFramework);
 };
 
 /// Apply the header search options to get given HeaderSearch object.
diff --git a/clang/include/clang/Lex/ModuleMap.h b/clang/include/clang/Lex/ModuleMap.h
index 43c3890..d6a1e14 100644
--- a/clang/include/clang/Lex/ModuleMap.h
+++ b/clang/include/clang/Lex/ModuleMap.h
@@ -18,6 +18,7 @@
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/SourceLocation.h"
+#include "clang/Lex/ModuleMapFile.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
@@ -262,6 +263,18 @@ private:
   /// Describes whether we haved loaded a particular file as a module
   /// map.
   llvm::DenseMap<const FileEntry *, bool> LoadedModuleMap;
+  llvm::DenseMap<const FileEntry *, const modulemap::ModuleMapFile *>
+      ParsedModuleMap;
+
+  std::vector<std::unique_ptr<modulemap::ModuleMapFile>> ParsedModuleMaps;
+
+  /// Map from top level module name to a list of ModuleDecls in the order they
+  /// were discovered. This allows handling shadowing correctly and diagnosing
+  /// redefinitions.
+  llvm::StringMap<SmallVector<std::pair<const modulemap::ModuleMapFile *,
+                                        const modulemap::ModuleDecl *>,
+                              1>>
+      ParsedModules;
 
   /// Resolve the given export declaration into an actual export
   /// declaration.
@@ -478,6 +491,8 @@ public:
   /// \returns The named module, if known; otherwise, returns null.
   Module *findModule(StringRef Name) const;
 
+  Module *findOrLoadModule(StringRef Name);
+
   Module *findOrInferSubmodule(Module *Parent, StringRef Name);
 
   /// Retrieve a module with the given name using lexical name lookup,
@@ -693,6 +708,11 @@ public:
   void addHeader(Module *Mod, Module::Header Header,
                  ModuleHeaderRole Role, bool Imported = false);
 
+  /// Parse a module map without creating `clang::Module` instances.
+  bool parseModuleMapFile(FileEntryRef File, bool IsSystem,
+                          DirectoryEntryRef Dir, FileID ID = FileID(),
+                          SourceLocation ExternModuleLoc = SourceLocation());
+
   /// Load the given module map file, and record any modules we
   /// encounter.
   ///
@@ -713,10 +733,11 @@ public:
   ///        that caused us to load this module map file, if any.
   ///
   /// \returns true if an error occurred, false otherwise.
-  bool loadModuleMapFile(FileEntryRef File, bool IsSystem,
-                         DirectoryEntryRef HomeDir, FileID ID = FileID(),
-                         unsigned *Offset = nullptr,
-                         SourceLocation ExternModuleLoc = SourceLocation());
+  bool
+  parseAndLoadModuleMapFile(FileEntryRef File, bool IsSystem,
+                            DirectoryEntryRef HomeDir, FileID ID = FileID(),
+                            unsigned *Offset = nullptr,
+                            SourceLocation ExternModuleLoc = SourceLocation());
 
   /// Dump the contents of the module map, for debugging purposes.
   void dump();
diff --git a/clang/include/clang/Lex/ModuleMapFile.h b/clang/include/clang/Lex/ModuleMapFile.h
index 1219cc2..7d0e36e 100644
--- a/clang/include/clang/Lex/ModuleMapFile.h
+++ b/clang/include/clang/Lex/ModuleMapFile.h
@@ -133,8 +133,17 @@ using TopLevelDecl = std::variant<ModuleDecl, ExternModuleDecl>;
 /// This holds many reference types (StringRef, SourceLocation, etc.) whose
 /// lifetimes are bound by the SourceManager and FileManager used.
 struct ModuleMapFile {
+  /// The FileID used to parse this module map. This is always a local ID.
+  FileID ID;
+
+  /// The directory in which the module map was discovered. Declarations in
+  /// the module map are relative to this directory.
+  OptionalDirectoryEntryRef Dir;
+
   /// Beginning of the file, used for moduleMapFileRead callback.
   SourceLocation Start;
+
+  bool IsSystem;
   std::vector<TopLevelDecl> Decls;
 
   void dump(llvm::raw_ostream &out) const;
diff --git a/clang/include/clang/Serialization/ModuleCache.h b/clang/include/clang/Serialization/ModuleCache.h
index a7ba26b..3117d95 100644
--- a/clang/include/clang/Serialization/ModuleCache.h
+++ b/clang/include/clang/Serialization/ModuleCache.h
@@ -12,6 +12,8 @@
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 
+#include <ctime>
+
 namespace llvm {
 class AdvisoryLock;
 } // namespace llvm
@@ -31,11 +33,23 @@ public:
   virtual std::unique_ptr<llvm::AdvisoryLock>
   getLock(StringRef ModuleFilename) = 0;
 
+  // TODO: Abstract away timestamps with isUpToDate() and markUpToDate().
+  // TODO: Consider exposing a "validation lock" API to prevent multiple clients
+  // concurrently noticing an out-of-date module file and validating its inputs.
+
+  /// Returns the timestamp denoting the last time inputs of the module file
+  /// were validated.
+  virtual std::time_t getModuleTimestamp(StringRef ModuleFilename) = 0;
+
+  /// Updates the timestamp denoting the last time inputs of the module file
+  /// were validated.
+  virtual void updateModuleTimestamp(StringRef ModuleFilename) = 0;
+
   /// Returns this process's view of the module cache.
   virtual InMemoryModuleCache &getInMemoryModuleCache() = 0;
   virtual const InMemoryModuleCache &getInMemoryModuleCache() const = 0;
 
-  // TODO: Virtualize writing/reading PCM files, timestamping, pruning, etc.
+  // TODO: Virtualize writing/reading PCM files, pruning, etc.
 
   virtual ~ModuleCache() = default;
 };
diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
index 5e8b37e..4e97c7b 100644
--- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
+++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
@@ -12,6 +12,7 @@
 #include "clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h"
 #include "clang/Tooling/DependencyScanning/InProcessModuleCache.h"
 #include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/Support/Chrono.h"
 
 namespace clang {
 namespace tooling {
@@ -84,7 +85,9 @@ public:
   DependencyScanningService(
       ScanningMode Mode, ScanningOutputFormat Format,
       ScanningOptimizations OptimizeArgs = ScanningOptimizations::Default,
-      bool EagerLoadModules = false, bool TraceVFS = false);
+      bool EagerLoadModules = false, bool TraceVFS = false,
+      std::time_t BuildSessionTimestamp =
+          llvm::sys::toTimeT(std::chrono::system_clock::now()));
 
   ScanningMode getMode() const { return Mode; }
 
@@ -100,7 +103,9 @@ public:
     return SharedCache;
   }
 
-  ModuleCacheMutexes &getModuleCacheMutexes() { return ModCacheMutexes; }
+  ModuleCacheEntries &getModuleCacheEntries() { return ModCacheEntries; }
+
+  std::time_t getBuildSessionTimestamp() const { return BuildSessionTimestamp; }
 
 private:
   const ScanningMode Mode;
@@ -113,8 +118,10 @@ private:
   const bool TraceVFS;
   /// The global file system cache.
   DependencyScanningFilesystemSharedCache SharedCache;
-  /// The global module cache mutexes.
-  ModuleCacheMutexes ModCacheMutexes;
+  /// The global module cache entries.
+  ModuleCacheEntries ModCacheEntries;
+  /// The build session timestamp.
+  std::time_t BuildSessionTimestamp;
 };
 
 } // end namespace dependencies
diff --git a/clang/include/clang/Tooling/DependencyScanning/InProcessModuleCache.h b/clang/include/clang/Tooling/DependencyScanning/InProcessModuleCache.h
index ba045438..213e60b 100644
--- a/clang/include/clang/Tooling/DependencyScanning/InProcessModuleCache.h
+++ b/clang/include/clang/Tooling/DependencyScanning/InProcessModuleCache.h
@@ -18,13 +18,18 @@
 namespace clang {
 namespace tooling {
 namespace dependencies {
-struct ModuleCacheMutexes {
+struct ModuleCacheEntry {
+  std::shared_mutex CompilationMutex;
+  std::atomic<std::time_t> Timestamp = 0;
+};
+
+struct ModuleCacheEntries {
   std::mutex Mutex;
-  llvm::StringMap<std::unique_ptr<std::shared_mutex>> Map;
+  llvm::StringMap<std::unique_ptr<ModuleCacheEntry>> Map;
 };
 
 IntrusiveRefCntPtr<ModuleCache>
-makeInProcessModuleCache(ModuleCacheMutexes &Mutexes);
+makeInProcessModuleCache(ModuleCacheEntries &Entries);
 } // namespace dependencies
 } // namespace tooling
 } // namespace clang
diff --git a/clang/lib/APINotes/APINotesFormat.h b/clang/lib/APINotes/APINotesFormat.h
index 9392351..bb0c276 100644
--- a/clang/lib/APINotes/APINotesFormat.h
+++ b/clang/lib/APINotes/APINotesFormat.h
@@ -24,7 +24,7 @@ const uint16_t VERSION_MAJOR = 0;
 /// API notes file minor version number.
 ///
 /// When the format changes IN ANY WAY, this number should be incremented.
-const uint16_t VERSION_MINOR = 34; // SwiftReturnOwnership
+const uint16_t VERSION_MINOR = 35; // SwiftDefaultOwnership
 
 const uint8_t kSwiftConforms = 1;
 const uint8_t kSwiftDoesNotConform = 2;
diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp
index 646eabd..2ba30ca 100644
--- a/clang/lib/APINotes/APINotesReader.cpp
+++ b/clang/lib/APINotes/APINotesReader.cpp
@@ -624,6 +624,13 @@ public:
                                         ReleaseOpLength - 1);
       Data += ReleaseOpLength - 1;
     }
+    unsigned DefaultOwnershipLength =
+        endian::readNext<uint16_t, llvm::endianness::little>(Data);
+    if (DefaultOwnershipLength > 0) {
+      Info.SwiftDefaultOwnership = std::string(
+          reinterpret_cast<const char *>(Data), DefaultOwnershipLength - 1);
+      Data += DefaultOwnershipLength - 1;
+    }
     if (unsigned ConformanceLength =
             endian::readNext<uint16_t, llvm::endianness::little>(Data)) {
       Info.SwiftConformance = std::string(reinterpret_cast<const char *>(Data),
diff --git a/clang/lib/APINotes/APINotesWriter.cpp b/clang/lib/APINotes/APINotesWriter.cpp
index 1aae07b..7578bc3 100644
--- a/clang/lib/APINotes/APINotesWriter.cpp
+++ b/clang/lib/APINotes/APINotesWriter.cpp
@@ -1274,6 +1274,7 @@ public:
     return 2 + (TI.SwiftImportAs ? TI.SwiftImportAs->size() : 0) +
            2 + (TI.SwiftRetainOp ? TI.SwiftRetainOp->size() : 0) +
            2 + (TI.SwiftReleaseOp ? TI.SwiftReleaseOp->size() : 0) +
+           2 + (TI.SwiftDefaultOwnership ? TI.SwiftDefaultOwnership->size() : 0) +
            2 + (TI.SwiftConformance ? TI.SwiftConformance->size() : 0) +
            3 + getCommonTypeInfoSize(TI);
     // clang-format on
@@ -1322,6 +1323,12 @@ public:
     } else {
       writer.write<uint16_t>(0);
     }
+    if (auto DefaultOwnership = TI.SwiftDefaultOwnership) {
+      writer.write<uint16_t>(DefaultOwnership->size() + 1);
+      OS.write(DefaultOwnership->c_str(), DefaultOwnership->size());
+    } else {
+      writer.write<uint16_t>(0);
+    }
     if (auto Conformance = TI.SwiftConformance) {
       writer.write<uint16_t>(Conformance->size() + 1);
       OS.write(Conformance->c_str(), Conformance->size());
diff --git a/clang/lib/APINotes/APINotesYAMLCompiler.cpp b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
index 414a59a..803410c 100644
--- a/clang/lib/APINotes/APINotesYAMLCompiler.cpp
+++ b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
@@ -460,6 +460,7 @@ struct Tag {
   std::optional<std::string> SwiftImportAs;
   std::optional<std::string> SwiftRetainOp;
   std::optional<std::string> SwiftReleaseOp;
+  std::optional<std::string> SwiftDefaultOwnership;
   std::optional<std::string> SwiftConformance;
   std::optional<EnumExtensibilityKind> EnumExtensibility;
   std::optional<bool> FlagEnum;
@@ -500,6 +501,7 @@ template <> struct MappingTraits<Tag> {
     IO.mapOptional("SwiftImportAs", T.SwiftImportAs);
     IO.mapOptional("SwiftReleaseOp", T.SwiftReleaseOp);
     IO.mapOptional("SwiftRetainOp", T.SwiftRetainOp);
+    IO.mapOptional("SwiftDefaultOwnership", T.SwiftDefaultOwnership);
     IO.mapOptional("SwiftConformsTo", T.SwiftConformance);
     IO.mapOptional("EnumExtensibility", T.EnumExtensibility);
     IO.mapOptional("FlagEnum", T.FlagEnum);
@@ -990,6 +992,8 @@ public:
       TI.SwiftReleaseOp = T.SwiftReleaseOp;
     if (T.SwiftConformance)
       TI.SwiftConformance = T.SwiftConformance;
+    if (T.SwiftDefaultOwnership)
+      TI.SwiftDefaultOwnership = T.SwiftDefaultOwnership;
 
     if (T.SwiftCopyable)
       TI.setSwiftCopyable(T.SwiftCopyable);
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index a4c89d6..1ed1674 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -7660,6 +7660,55 @@ ASTContext::getCanonicalTemplateArgument(const TemplateArgument &Arg) const {
   llvm_unreachable("Unhandled template argument kind");
 }
 
+bool ASTContext::isSameTemplateArgument(const TemplateArgument &Arg1,
+                                        const TemplateArgument &Arg2) const {
+  if (Arg1.getKind() != Arg2.getKind())
+    return false;
+
+  switch (Arg1.getKind()) {
+  case TemplateArgument::Null:
+    llvm_unreachable("Comparing NULL template argument");
+
+  case TemplateArgument::Type:
+    return hasSameType(Arg1.getAsType(), Arg2.getAsType());
+
+  case TemplateArgument::Declaration:
+    return Arg1.getAsDecl()->getUnderlyingDecl()->getCanonicalDecl() ==
+           Arg2.getAsDecl()->getUnderlyingDecl()->getCanonicalDecl();
+
+  case TemplateArgument::NullPtr:
+    return hasSameType(Arg1.getNullPtrType(), Arg2.getNullPtrType());
+
+  case TemplateArgument::Template:
+  case TemplateArgument::TemplateExpansion:
+    return getCanonicalTemplateName(Arg1.getAsTemplateOrTemplatePattern()) ==
+           getCanonicalTemplateName(Arg2.getAsTemplateOrTemplatePattern());
+
+  case TemplateArgument::Integral:
+    return llvm::APSInt::isSameValue(Arg1.getAsIntegral(),
+                                     Arg2.getAsIntegral());
+
+  case TemplateArgument::StructuralValue:
+    return Arg1.structurallyEquals(Arg2);
+
+  case TemplateArgument::Expression: {
+    llvm::FoldingSetNodeID ID1, ID2;
+    Arg1.getAsExpr()->Profile(ID1, *this, /*Canonical=*/true);
+    Arg2.getAsExpr()->Profile(ID2, *this, /*Canonical=*/true);
+    return ID1 == ID2;
+  }
+
+  case TemplateArgument::Pack:
+    return llvm::equal(
+        Arg1.getPackAsArray(), Arg2.getPackAsArray(),
+        [&](const TemplateArgument &Arg1, const TemplateArgument &Arg2) {
+          return isSameTemplateArgument(Arg1, Arg2);
+        });
+  }
+
+  llvm_unreachable("Unhandled template argument kind");
+}
+
 NestedNameSpecifier *
 ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const {
   if (!NNS)
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 52b922e..392a95d0 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -4889,8 +4889,8 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const {
   QualType type = getCanonicalTypeInternal();
 
   switch (type->getTypeClass()) {
-    // We'll only see canonical types here.
 #define NON_CANONICAL_TYPE(Class, Parent)                                      \
+  /* We'll only see canonical types here. */                                   \
   case Type::Class:                                                            \
     llvm_unreachable("non-canonical type");
 #define TYPE(Class, Parent)
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 94a6c03..711a652 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -42,7 +42,7 @@ Address CIRGenFunction::emitAddrOfFieldStorage(Address base,
   mlir::Location loc = getLoc(field->getLocation());
 
   mlir::Type fieldType = convertType(field->getType());
-  auto fieldPtr = cir::PointerType::get(builder.getContext(), fieldType);
+  auto fieldPtr = cir::PointerType::get(fieldType);
   // For most cases fieldName is the same as field->getName() but for lambdas,
   // which do not currently carry the name, so it can be passed down from the
   // CaptureStmt.
@@ -322,9 +322,12 @@ LValue CIRGenFunction::emitLValueForField(LValue base, const FieldDecl *field) {
   assert(!cir::MissingFeatures::opTBAA());
 
   Address addr = base.getAddress();
-  if (isa<CXXRecordDecl>(rec)) {
-    cgm.errorNYI(field->getSourceRange(), "emitLValueForField: C++ class");
-    return LValue();
+  if (auto *classDecl = dyn_cast<CXXRecordDecl>(rec)) {
+    if (cgm.getCodeGenOpts().StrictVTablePointers &&
+        classDecl->isDynamicClass()) {
+      cgm.errorNYI(field->getSourceRange(),
+                   "emitLValueForField: strict vtable for dynamic class");
+    }
   }
 
   unsigned recordCVR = base.getVRQualifiers();
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index 2d8550f..9085ee2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -14,6 +14,7 @@
 #include "CIRGenConstantEmitter.h"
 #include "CIRGenFunction.h"
 #include "CIRGenModule.h"
+#include "CIRGenRecordLayout.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/BuiltinAttributes.h"
@@ -365,12 +366,33 @@ mlir::Attribute ConstantEmitter::tryEmitPrivateForVarInit(const VarDecl &d) {
   // initialization of memory to all NULLs.
   if (!d.hasLocalStorage()) {
     QualType ty = cgm.getASTContext().getBaseElementType(d.getType());
-    if (ty->isRecordType())
-      if (d.getInit() && isa<CXXConstructExpr>(d.getInit())) {
-        cgm.errorNYI(d.getInit()->getBeginLoc(),
-                     "tryEmitPrivateForVarInit CXXConstructExpr");
-        return {};
+    if (ty->isRecordType()) {
+      if (const auto *e = dyn_cast_or_null<CXXConstructExpr>(d.getInit())) {
+        const CXXConstructorDecl *cd = e->getConstructor();
+        // FIXME: we should probably model this more closely to C++ than
+        // just emitting a global with zero init (mimic what we do for trivial
+        // assignments and whatnots). Since this is for globals shouldn't
+        // be a problem for the near future.
+        if (cd->isTrivial() && cd->isDefaultConstructor()) {
+          const auto *cxxrd =
+              cast<CXXRecordDecl>(ty->getAs<RecordType>()->getDecl());
+          if (cxxrd->getNumBases() != 0) {
+            // There may not be anything additional to do here, but this will
+            // force us to pause and test this path when it is supported.
+            cgm.errorNYI("tryEmitPrivateForVarInit: cxx record with bases");
+            return {};
+          }
+          if (!cgm.getTypes().isZeroInitializable(cxxrd)) {
+            // To handle this case, we really need to go through
+            // emitNullConstant, but we need an attribute, not a value
+            cgm.errorNYI(
+                "tryEmitPrivateForVarInit: non-zero-initializable cxx record");
+            return {};
+          }
+          return cir::ZeroAttr::get(cgm.convertType(d.getType()));
+        }
       }
+    }
   }
   inConstantContext = d.hasConstantInitialization();
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index c3aada8..9066107 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -581,6 +581,10 @@ public:
   mlir::LogicalResult emitDeclStmt(const clang::DeclStmt &s);
   LValue emitDeclRefLValue(const clang::DeclRefExpr *e);
 
+  mlir::LogicalResult emitDefaultStmt(const clang::DefaultStmt &s,
+                                      mlir::Type condType,
+                                      bool buildingTopLevelCase);
+
   /// Emit an `if` on a boolean condition to the specified blocks.
   /// FIXME: Based on the condition, this might try to simplify the codegen of
   /// the conditional based on the branch.
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
index fa4ce5e..ef4f64a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
@@ -392,6 +392,38 @@ public:
       return clauseNotImplemented(clause);
     }
   }
+
+  void VisitWorkerClause(const OpenACCWorkerClause &clause) {
+    if constexpr (isOneOfTypes<OpTy, mlir::acc::LoopOp>) {
+      if (clause.hasIntExpr())
+        operation.addWorkerNumOperand(builder.getContext(),
+                                      createIntExpr(clause.getIntExpr()),
+                                      lastDeviceTypeValues);
+      else
+        operation.addEmptyWorker(builder.getContext(), lastDeviceTypeValues);
+
+    } else {
+      // TODO: When we've implemented this for everything, switch this to an
+      // unreachable. Combined constructs remain.
+      return clauseNotImplemented(clause);
+    }
+  }
+
+  void VisitVectorClause(const OpenACCVectorClause &clause) {
+    if constexpr (isOneOfTypes<OpTy, mlir::acc::LoopOp>) {
+      if (clause.hasIntExpr())
+        operation.addVectorOperand(builder.getContext(),
+                                   createIntExpr(clause.getIntExpr()),
+                                   lastDeviceTypeValues);
+      else
+        operation.addEmptyVector(builder.getContext(), lastDeviceTypeValues);
+
+    } else {
+      // TODO: When we've implemented this for everything, switch this to an
+      // unreachable. Combined constructs remain.
+      return clauseNotImplemented(clause);
+    }
+  }
 };
 
 template <typename OpTy>
diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h
index 11768b0..2ece85b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h
+++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h
@@ -33,9 +33,23 @@ private:
   /// field no. This info is populated by the record builder.
   llvm::DenseMap<const clang::FieldDecl *, unsigned> fieldIdxMap;
 
+  /// False if any direct or indirect subobject of this class, when considered
+  /// as a complete object, requires a non-zero bitpattern when
+  /// zero-initialized.
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned zeroInitializable : 1;
+
+  /// False if any direct or indirect subobject of this class, when considered
+  /// as a base subobject, requires a non-zero bitpattern when zero-initialized.
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned zeroInitializableAsBase : 1;
+
 public:
-  CIRGenRecordLayout(cir::RecordType completeObjectType)
-      : completeObjectType(completeObjectType) {}
+  CIRGenRecordLayout(cir::RecordType completeObjectType, bool zeroInitializable,
+                     bool zeroInitializableAsBase)
+      : completeObjectType(completeObjectType),
+        zeroInitializable(zeroInitializable),
+        zeroInitializableAsBase(zeroInitializableAsBase) {}
 
   /// Return the "complete object" LLVM type associated with
   /// this record.
@@ -47,6 +61,14 @@ public:
     assert(fieldIdxMap.count(fd) && "Invalid field for record!");
     return fieldIdxMap.lookup(fd);
   }
+
+  /// Check whether this struct can be C++ zero-initialized
+  /// with a zeroinitializer.
+  bool isZeroInitializable() const { return zeroInitializable; }
+
+  /// Check whether this struct can be C++ zero-initialized
+  /// with a zeroinitializer when considered as a base subobject.
+  bool isZeroInitializableAsBase() const { return zeroInitializableAsBase; }
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp
index 5bcd408..53aa0ae 100644
--- a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp
@@ -77,6 +77,8 @@ struct CIRRecordLowering final {
     return astContext.toCharUnitsFromBits(bitOffset);
   }
 
+  void calculateZeroInit();
+
   CharUnits getSize(mlir::Type Ty) {
     return CharUnits::fromQuantity(dataLayout.layout.getTypeSize(Ty));
   }
@@ -177,18 +179,26 @@ void CIRRecordLowering::lower() {
     return;
   }
 
-  if (isa<CXXRecordDecl>(recordDecl)) {
-    cirGenTypes.getCGModule().errorNYI(recordDecl->getSourceRange(),
-                                       "lower: class");
-    return;
-  }
-
   assert(!cir::MissingFeatures::cxxSupport());
 
   CharUnits size = astRecordLayout.getSize();
 
   accumulateFields();
 
+  if (const auto *cxxRecordDecl = dyn_cast<CXXRecordDecl>(recordDecl)) {
+    if (cxxRecordDecl->getNumBases() > 0) {
+      CIRGenModule &cgm = cirGenTypes.getCGModule();
+      cgm.errorNYI(recordDecl->getSourceRange(),
+                   "CIRRecordLowering::lower: derived CXXRecordDecl");
+      return;
+    }
+    if (members.empty()) {
+      appendPaddingBytes(size);
+      assert(!cir::MissingFeatures::bitfields());
+      return;
+    }
+  }
+
   llvm::stable_sort(members);
   // TODO: implement clipTailPadding once bitfields are implemented
   assert(!cir::MissingFeatures::bitfields());
@@ -199,6 +209,7 @@ void CIRRecordLowering::lower() {
   insertPadding();
   members.pop_back();
 
+  calculateZeroInit();
   fillOutputFields();
 }
 
@@ -236,6 +247,19 @@ void CIRRecordLowering::accumulateFields() {
   }
 }
 
+void CIRRecordLowering::calculateZeroInit() {
+  for (const MemberInfo &member : members) {
+    if (member.kind == MemberInfo::InfoKind::Field) {
+      if (!member.fieldDecl || isZeroInitializable(member.fieldDecl))
+        continue;
+      zeroInitializable = zeroInitializableAsBase = false;
+      return;
+    }
+    // TODO(cir): handle base types
+    assert(!cir::MissingFeatures::cxxSupport());
+  }
+}
+
 void CIRRecordLowering::determinePacked() {
   if (packed)
     return;
@@ -295,7 +319,10 @@ CIRGenTypes::computeRecordLayout(const RecordDecl *rd, cir::RecordType *ty) {
   // If we're in C++, compute the base subobject type.
   if (llvm::isa<CXXRecordDecl>(rd) && !rd->isUnion() &&
       !rd->hasAttr<FinalAttr>()) {
-    cgm.errorNYI(rd->getSourceRange(), "computeRecordLayout: CXXRecordDecl");
+    if (lowering.astRecordLayout.getNonVirtualSize() !=
+        lowering.astRecordLayout.getSize()) {
+      cgm.errorNYI(rd->getSourceRange(), "computeRecordLayout: CXXRecordDecl");
+    }
   }
 
   // Fill in the record *after* computing the base type.  Filling in the body
@@ -304,7 +331,9 @@ CIRGenTypes::computeRecordLayout(const RecordDecl *rd, cir::RecordType *ty) {
   assert(!cir::MissingFeatures::astRecordDeclAttr());
   ty->complete(lowering.fieldTypes, lowering.packed, lowering.padded);
 
-  auto rl = std::make_unique<CIRGenRecordLayout>(ty ? *ty : cir::RecordType());
+  auto rl = std::make_unique<CIRGenRecordLayout>(
+      ty ? *ty : cir::RecordType(), (bool)lowering.zeroInitializable,
+      (bool)lowering.zeroInitializableAsBase);
 
   assert(!cir::MissingFeatures::recordZeroInit());
   assert(!cir::MissingFeatures::cxxSupport());
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index ee4dcc8..cc96e65 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -254,6 +254,7 @@ mlir::LogicalResult CIRGenFunction::emitSimpleStmt(const Stmt *s,
   case Stmt::NullStmtClass:
     break;
   case Stmt::CaseStmtClass:
+  case Stmt::DefaultStmtClass:
     // If we reached here, we must not handling a switch case in the top level.
     return emitSwitchCase(cast<SwitchCase>(*s),
                           /*buildingTopLevelCase=*/false);
@@ -458,7 +459,7 @@ CIRGenFunction::emitCaseDefaultCascade(const T *stmt, mlir::Type condType,
     if (isa<DefaultStmt>(sub) && isa<CaseStmt>(stmt)) {
       subStmtKind = SubStmtKind::Default;
       builder.createYield(loc);
-    } else if (isa<CaseStmt>(sub) && isa<DefaultStmt>(stmt)) {
+    } else if (isa<CaseStmt>(sub) && isa<DefaultStmt, CaseStmt>(stmt)) {
       subStmtKind = SubStmtKind::Case;
       builder.createYield(loc);
     } else {
@@ -503,8 +504,8 @@ CIRGenFunction::emitCaseDefaultCascade(const T *stmt, mlir::Type condType,
   if (subStmtKind == SubStmtKind::Case) {
     result = emitCaseStmt(*cast<CaseStmt>(sub), condType, buildingTopLevelCase);
   } else if (subStmtKind == SubStmtKind::Default) {
-    getCIRGenModule().errorNYI(sub->getSourceRange(), "Default case");
-    return mlir::failure();
+    result = emitDefaultStmt(*cast<DefaultStmt>(sub), condType,
+                             buildingTopLevelCase);
   } else if (buildingTopLevelCase) {
     // If we're building a top level case, try to restore the insert point to
     // the case we're building, then we can attach more random stmts to the
@@ -518,19 +519,40 @@ CIRGenFunction::emitCaseDefaultCascade(const T *stmt, mlir::Type condType,
 mlir::LogicalResult CIRGenFunction::emitCaseStmt(const CaseStmt &s,
                                                  mlir::Type condType,
                                                  bool buildingTopLevelCase) {
+  cir::CaseOpKind kind;
+  mlir::ArrayAttr value;
   llvm::APSInt intVal = s.getLHS()->EvaluateKnownConstInt(getContext());
-  SmallVector<mlir::Attribute, 1> caseEltValueListAttr;
-  caseEltValueListAttr.push_back(cir::IntAttr::get(condType, intVal));
-  mlir::ArrayAttr value = builder.getArrayAttr(caseEltValueListAttr);
-  if (s.getRHS()) {
-    getCIRGenModule().errorNYI(s.getSourceRange(), "SwitchOp range kind");
-    return mlir::failure();
+
+  // If the case statement has an RHS value, it is representing a GNU
+  // case range statement, where LHS is the beginning of the range
+  // and RHS is the end of the range.
+  if (const Expr *rhs = s.getRHS()) {
+    llvm::APSInt endVal = rhs->EvaluateKnownConstInt(getContext());
+    value = builder.getArrayAttr({cir::IntAttr::get(condType, intVal),
+                                  cir::IntAttr::get(condType, endVal)});
+    kind = cir::CaseOpKind::Range;
+
+    // We don't currently fold case range statements with other case statements.
+    // TODO(cir): Add this capability. Folding these cases is going to be
+    // implemented in CIRSimplify when it is upstreamed.
+    assert(!cir::MissingFeatures::foldRangeCase());
+    assert(!cir::MissingFeatures::foldCascadingCases());
+  } else {
+    value = builder.getArrayAttr({cir::IntAttr::get(condType, intVal)});
+    kind = cir::CaseOpKind::Equal;
   }
-  assert(!cir::MissingFeatures::foldCaseStmt());
-  return emitCaseDefaultCascade(&s, condType, value, cir::CaseOpKind::Equal,
+
+  return emitCaseDefaultCascade(&s, condType, value, kind,
                                 buildingTopLevelCase);
 }
 
+mlir::LogicalResult CIRGenFunction::emitDefaultStmt(const clang::DefaultStmt &s,
+                                                    mlir::Type condType,
+                                                    bool buildingTopLevelCase) {
+  return emitCaseDefaultCascade(&s, condType, builder.getArrayAttr({}),
+                                cir::CaseOpKind::Default, buildingTopLevelCase);
+}
+
 mlir::LogicalResult CIRGenFunction::emitSwitchCase(const SwitchCase &s,
                                                    bool buildingTopLevelCase) {
   assert(!condTypeStack.empty() &&
@@ -540,10 +562,9 @@ mlir::LogicalResult CIRGenFunction::emitSwitchCase(const SwitchCase &s,
     return emitCaseStmt(cast<CaseStmt>(s), condTypeStack.back(),
                         buildingTopLevelCase);
 
-  if (s.getStmtClass() == Stmt::DefaultStmtClass) {
-    getCIRGenModule().errorNYI(s.getSourceRange(), "Default case");
-    return mlir::failure();
-  }
+  if (s.getStmtClass() == Stmt::DefaultStmtClass)
+    return emitDefaultStmt(cast<DefaultStmt>(s), condTypeStack.back(),
+                           buildingTopLevelCase);
 
   llvm_unreachable("expect case or default stmt");
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index e85f2f4..89dc5ee 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -237,8 +237,11 @@ mlir::Type CIRGenTypes::convertRecordDeclType(const clang::RecordDecl *rd) {
   assert(insertResult && "isSafeToCovert() should have caught this.");
 
   // Force conversion of non-virtual base classes recursively.
-  if (isa<CXXRecordDecl>(rd)) {
-    cgm.errorNYI(rd->getSourceRange(), "CXXRecordDecl");
+  if (const auto *cxxRecordDecl = dyn_cast<CXXRecordDecl>(rd)) {
+    if (cxxRecordDecl->getNumBases() > 0) {
+      cgm.errorNYI(rd->getSourceRange(),
+                   "convertRecordDeclType: derived CXXRecordDecl");
+    }
   }
 
   // Layout fields.
@@ -497,9 +500,9 @@ bool CIRGenTypes::isZeroInitializable(clang::QualType t) {
         return true;
   }
 
-  if (t->getAs<RecordType>()) {
-    cgm.errorNYI(SourceLocation(), "isZeroInitializable for RecordType", t);
-    return false;
+  if (const RecordType *rt = t->getAs<RecordType>()) {
+    const RecordDecl *rd = rt->getDecl();
+    return isZeroInitializable(rd);
   }
 
   if (t->getAs<MemberPointerType>()) {
@@ -511,6 +514,10 @@ bool CIRGenTypes::isZeroInitializable(clang::QualType t) {
   return true;
 }
 
+bool CIRGenTypes::isZeroInitializable(const RecordDecl *rd) {
+  return getCIRGenRecordLayout(rd).isZeroInitializable();
+}
+
 const CIRGenFunctionInfo &CIRGenTypes::arrangeCIRFunctionInfo(
     CanQualType returnType, llvm::ArrayRef<clang::CanQualType> argTypes) {
   assert(llvm::all_of(argTypes,
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.h b/clang/lib/CIR/CodeGen/CIRGenTypes.h
index 38f4b38..cf94375 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.h
@@ -120,6 +120,7 @@ public:
   /// Return whether a type can be zero-initialized (in the C++ sense) with an
   /// LLVM zeroinitializer.
   bool isZeroInitializable(clang::QualType ty);
+  bool isZeroInitializable(const RecordDecl *rd);
 
   const CIRGenFunctionInfo &arrangeFreeFunctionCall(const CallArgList &args,
                                                     const FunctionType *fnType);
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 6b14414..b131eda 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -79,6 +79,14 @@ void cir::CIRDialect::initialize() {
   addInterfaces<CIROpAsmDialectInterface>();
 }
 
+Operation *cir::CIRDialect::materializeConstant(mlir::OpBuilder &builder,
+                                                mlir::Attribute value,
+                                                mlir::Type type,
+                                                mlir::Location loc) {
+  return builder.create<cir::ConstantOp>(loc, type,
+                                         mlir::cast<mlir::TypedAttr>(value));
+}
+
 //===----------------------------------------------------------------------===//
 // Helpers
 //===----------------------------------------------------------------------===//
@@ -1262,6 +1270,28 @@ void cir::TernaryOp::build(
 }
 
 //===----------------------------------------------------------------------===//
+// SelectOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult cir::SelectOp::fold(FoldAdaptor adaptor) {
+  mlir::Attribute condition = adaptor.getCondition();
+  if (condition) {
+    bool conditionValue = mlir::cast<cir::BoolAttr>(condition).getValue();
+    return conditionValue ? getTrueValue() : getFalseValue();
+  }
+
+  // cir.select if %0 then x else x -> x
+  mlir::Attribute trueValue = adaptor.getTrueValue();
+  mlir::Attribute falseValue = adaptor.getFalseValue();
+  if (trueValue == falseValue)
+    return trueValue;
+  if (getTrueValue() == getFalseValue())
+    return getTrueValue();
+
+  return {};
+}
+
+//===----------------------------------------------------------------------===//
 // ShiftOp
 //===----------------------------------------------------------------------===//
 LogicalResult cir::ShiftOp::verify() {
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
index cdac69e..3b4c7bc 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
@@ -121,14 +121,13 @@ void CIRCanonicalizePass::runOnOperation() {
   getOperation()->walk([&](Operation *op) {
     assert(!cir::MissingFeatures::switchOp());
     assert(!cir::MissingFeatures::tryOp());
-    assert(!cir::MissingFeatures::selectOp());
     assert(!cir::MissingFeatures::complexCreateOp());
     assert(!cir::MissingFeatures::complexRealOp());
     assert(!cir::MissingFeatures::complexImagOp());
     assert(!cir::MissingFeatures::callOp());
     // CastOp and UnaryOp are here to perform a manual `fold` in
     // applyOpPatternsGreedily.
-    if (isa<BrOp, BrCondOp, ScopeOp, CastOp, UnaryOp>(op))
+    if (isa<BrOp, BrCondOp, CastOp, ScopeOp, SelectOp, UnaryOp>(op))
       ops.push_back(op);
   });
 
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
new file mode 100644
index 0000000..b969569
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
@@ -0,0 +1,202 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Region.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/Dialect/Passes.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace mlir;
+using namespace cir;
+
+//===----------------------------------------------------------------------===//
+// Rewrite patterns
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Simplify suitable ternary operations into select operations.
+///
+/// For now we only simplify those ternary operations whose true and false
+/// branches directly yield a value or a constant. That is, both of the true and
+/// the false branch must either contain a cir.yield operation as the only
+/// operation in the branch, or contain a cir.const operation followed by a
+/// cir.yield operation that yields the constant value.
+///
+/// For example, we will simplify the following ternary operation:
+///
+///   %0 = ...
+///   %1 = cir.ternary (%condition, true {
+///     %2 = cir.const ...
+///     cir.yield %2
+///   } false {
+///     cir.yield %0
+///
+/// into the following sequence of operations:
+///
+///   %1 = cir.const ...
+///   %0 = cir.select if %condition then %1 else %2
+struct SimplifyTernary final : public OpRewritePattern<TernaryOp> {
+  using OpRewritePattern<TernaryOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TernaryOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op->getNumResults() != 1)
+      return mlir::failure();
+
+    if (!isSimpleTernaryBranch(op.getTrueRegion()) ||
+        !isSimpleTernaryBranch(op.getFalseRegion()))
+      return mlir::failure();
+
+    cir::YieldOp trueBranchYieldOp =
+        mlir::cast<cir::YieldOp>(op.getTrueRegion().front().getTerminator());
+    cir::YieldOp falseBranchYieldOp =
+        mlir::cast<cir::YieldOp>(op.getFalseRegion().front().getTerminator());
+    mlir::Value trueValue = trueBranchYieldOp.getArgs()[0];
+    mlir::Value falseValue = falseBranchYieldOp.getArgs()[0];
+
+    rewriter.inlineBlockBefore(&op.getTrueRegion().front(), op);
+    rewriter.inlineBlockBefore(&op.getFalseRegion().front(), op);
+    rewriter.eraseOp(trueBranchYieldOp);
+    rewriter.eraseOp(falseBranchYieldOp);
+    rewriter.replaceOpWithNewOp<cir::SelectOp>(op, op.getCond(), trueValue,
+                                               falseValue);
+
+    return mlir::success();
+  }
+
+private:
+  bool isSimpleTernaryBranch(mlir::Region &region) const {
+    if (!region.hasOneBlock())
+      return false;
+
+    mlir::Block &onlyBlock = region.front();
+    mlir::Block::OpListType &ops = onlyBlock.getOperations();
+
+    // The region/block could only contain at most 2 operations.
+    if (ops.size() > 2)
+      return false;
+
+    if (ops.size() == 1) {
+      // The region/block only contain a cir.yield operation.
+      return true;
+    }
+
+    // Check whether the region/block contains a cir.const followed by a
+    // cir.yield that yields the value.
+    auto yieldOp = mlir::cast<cir::YieldOp>(onlyBlock.getTerminator());
+    auto yieldValueDefOp = mlir::dyn_cast_if_present<cir::ConstantOp>(
+        yieldOp.getArgs()[0].getDefiningOp());
+    return yieldValueDefOp && yieldValueDefOp->getBlock() == &onlyBlock;
+  }
+};
+
+/// Simplify select operations with boolean constants into simpler forms.
+///
+/// This pattern simplifies select operations where both true and false values
+/// are boolean constants. Two specific cases are handled:
+///
+/// 1. When selecting between true and false based on a condition,
+///    the operation simplifies to just the condition itself:
+///
+///    %0 = cir.select if %condition then true else false
+///    ->
+///    (replaced with %condition directly)
+///
+/// 2. When selecting between false and true based on a condition,
+///    the operation simplifies to the logical negation of the condition:
+///
+///    %0 = cir.select if %condition then false else true
+///    ->
+///    %0 = cir.unary not %condition
+struct SimplifySelect : public OpRewritePattern<SelectOp> {
+  using OpRewritePattern<SelectOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(SelectOp op,
+                                PatternRewriter &rewriter) const final {
+    mlir::Operation *trueValueOp = op.getTrueValue().getDefiningOp();
+    mlir::Operation *falseValueOp = op.getFalseValue().getDefiningOp();
+    auto trueValueConstOp =
+        mlir::dyn_cast_if_present<cir::ConstantOp>(trueValueOp);
+    auto falseValueConstOp =
+        mlir::dyn_cast_if_present<cir::ConstantOp>(falseValueOp);
+    if (!trueValueConstOp || !falseValueConstOp)
+      return mlir::failure();
+
+    auto trueValue = mlir::dyn_cast<cir::BoolAttr>(trueValueConstOp.getValue());
+    auto falseValue =
+        mlir::dyn_cast<cir::BoolAttr>(falseValueConstOp.getValue());
+    if (!trueValue || !falseValue)
+      return mlir::failure();
+
+    // cir.select if %0 then #true else #false -> %0
+    if (trueValue.getValue() && !falseValue.getValue()) {
+      rewriter.replaceAllUsesWith(op, op.getCondition());
+      rewriter.eraseOp(op);
+      return mlir::success();
+    }
+
+    // cir.select if %0 then #false else #true -> cir.unary not %0
+    if (!trueValue.getValue() && falseValue.getValue()) {
+      rewriter.replaceOpWithNewOp<cir::UnaryOp>(op, cir::UnaryOpKind::Not,
+                                                op.getCondition());
+      return mlir::success();
+    }
+
+    return mlir::failure();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// CIRSimplifyPass
+//===----------------------------------------------------------------------===//
+
+struct CIRSimplifyPass : public CIRSimplifyBase<CIRSimplifyPass> {
+  using CIRSimplifyBase::CIRSimplifyBase;
+
+  void runOnOperation() override;
+};
+
+void populateMergeCleanupPatterns(RewritePatternSet &patterns) {
+  // clang-format off
+  patterns.add<
+    SimplifyTernary,
+    SimplifySelect
+  >(patterns.getContext());
+  // clang-format on
+}
+
+void CIRSimplifyPass::runOnOperation() {
+  // Collect rewrite patterns.
+  RewritePatternSet patterns(&getContext());
+  populateMergeCleanupPatterns(patterns);
+
+  // Collect operations to apply patterns.
+  llvm::SmallVector<Operation *, 16> ops;
+  getOperation()->walk([&](Operation *op) {
+    if (isa<TernaryOp, SelectOp>(op))
+      ops.push_back(op);
+  });
+
+  // Apply patterns.
+  if (applyOpPatternsGreedily(ops, std::move(patterns)).failed())
+    signalPassFailure();
+}
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::createCIRSimplifyPass() {
+  return std::make_unique<CIRSimplifyPass>();
+}
diff --git a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
index 4678435..4dece5b 100644
--- a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
+++ b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_clang_library(MLIRCIRTransforms
   CIRCanonicalize.cpp
+  CIRSimplify.cpp
   FlattenCFG.cpp
   HoistAllocas.cpp
 
diff --git a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp
index a32e6a7..cc65c93 100644
--- a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp
+++ b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp
@@ -62,15 +62,16 @@ class CIRGenConsumer : public clang::ASTConsumer {
   IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS;
   std::unique_ptr<CIRGenerator> Gen;
   const FrontendOptions &FEOptions;
+  CodeGenOptions &CGO;
 
 public:
   CIRGenConsumer(CIRGenAction::OutputType Action, CompilerInstance &CI,
-                 std::unique_ptr<raw_pwrite_stream> OS)
+                 CodeGenOptions &CGO, std::unique_ptr<raw_pwrite_stream> OS)
       : Action(Action), CI(CI), OutputStream(std::move(OS)),
         FS(&CI.getVirtualFileSystem()),
         Gen(std::make_unique<CIRGenerator>(CI.getDiagnostics(), std::move(FS),
                                            CI.getCodeGenOpts())),
-        FEOptions(CI.getFrontendOpts()) {}
+        FEOptions(CI.getFrontendOpts()), CGO(CGO) {}
 
   void Initialize(ASTContext &Ctx) override {
     assert(!Context && "initialized multiple times");
@@ -102,7 +103,8 @@ public:
     if (!FEOptions.ClangIRDisablePasses) {
       // Setup and run CIR pipeline.
       if (runCIRToCIRPasses(MlirModule, MlirCtx, C,
-                            !FEOptions.ClangIRDisableCIRVerifier)
+                            !FEOptions.ClangIRDisableCIRVerifier,
+                            CGO.OptimizationLevel > 0)
               .failed()) {
         CI.getDiagnostics().Report(diag::err_cir_to_cir_transform_failed);
         return;
@@ -168,8 +170,8 @@ CIRGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
   if (!Out)
     Out = getOutputStream(CI, InFile, Action);
 
-  auto Result =
-      std::make_unique<cir::CIRGenConsumer>(Action, CI, std::move(Out));
+  auto Result = std::make_unique<cir::CIRGenConsumer>(
+      Action, CI, CI.getCodeGenOpts(), std::move(Out));
 
   return Result;
 }
diff --git a/clang/lib/CIR/Lowering/CIRPasses.cpp b/clang/lib/CIR/Lowering/CIRPasses.cpp
index a37a048..7a58193 100644
--- a/clang/lib/CIR/Lowering/CIRPasses.cpp
+++ b/clang/lib/CIR/Lowering/CIRPasses.cpp
@@ -20,13 +20,17 @@ namespace cir {
 mlir::LogicalResult runCIRToCIRPasses(mlir::ModuleOp theModule,
                                       mlir::MLIRContext &mlirContext,
                                       clang::ASTContext &astContext,
-                                      bool enableVerifier) {
+                                      bool enableVerifier,
+                                      bool enableCIRSimplify) {
 
   llvm::TimeTraceScope scope("CIR To CIR Passes");
 
   mlir::PassManager pm(&mlirContext);
   pm.addPass(mlir::createCIRCanonicalizePass());
 
+  if (enableCIRSimplify)
+    pm.addPass(mlir::createCIRSimplifyPass());
+
   pm.enableVerifier(enableVerifier);
   (void)mlir::applyPassManagerCLOptions(pm);
   return pm.run(theModule);
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index c9ceb49..42c5937 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -255,6 +255,7 @@ getSancovOptsFromCGOpts(const CodeGenOptions &CGOpts) {
   Opts.InlineBoolFlag = CGOpts.SanitizeCoverageInlineBoolFlag;
   Opts.PCTable = CGOpts.SanitizeCoveragePCTable;
   Opts.StackDepth = CGOpts.SanitizeCoverageStackDepth;
+  Opts.StackDepthCallbackMin = CGOpts.SanitizeCoverageStackDepthCallbackMin;
   Opts.TraceLoads = CGOpts.SanitizeCoverageTraceLoads;
   Opts.TraceStores = CGOpts.SanitizeCoverageTraceStores;
   Opts.CollectControlFlow = CGOpts.SanitizeCoverageControlFlow;
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 82a24f7..2f1c769 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -672,10 +672,10 @@ arrangeFreeFunctionLikeCall(CodeGenTypes &CGT, CodeGenModule &CGM,
       addExtParameterInfosForCall(paramInfos, proto, numExtraRequiredArgs,
                                   args.size());
 
-    // If we don't have a prototype at all, but we're supposed to
-    // explicitly use the variadic convention for unprototyped calls,
-    // treat all of the arguments as required but preserve the nominal
-    // possibility of variadics.
+  // If we don't have a prototype at all, but we're supposed to
+  // explicitly use the variadic convention for unprototyped calls,
+  // treat all of the arguments as required but preserve the nominal
+  // possibility of variadics.
   } else if (CGM.getTargetCodeGenInfo().isNoProtoCallVariadic(
                  args, cast<FunctionNoProtoType>(fnType))) {
     required = RequiredArgs(args.size());
@@ -4061,7 +4061,7 @@ void CodeGenFunction::EmitFunctionEpilog(const CGFunctionInfo &FI,
     if (results.size() == 1) {
       RV = results[0];
 
-      // Otherwise, we need to make a first-class aggregate.
+    // Otherwise, we need to make a first-class aggregate.
     } else {
       // Construct a return type that lacks padding elements.
       llvm::Type *returnType = RetAI.getUnpaddedCoerceAndExpandType();
@@ -4200,11 +4200,11 @@ void CodeGenFunction::EmitDelegateCallArg(CallArgList &args,
   if (type->isReferenceType()) {
     args.add(RValue::get(Builder.CreateLoad(local)), type);
 
-    // In ARC, move out of consumed arguments so that the release cleanup
-    // entered by StartFunction doesn't cause an over-release.  This isn't
-    // optimal -O0 code generation, but it should get cleaned up when
-    // optimization is enabled.  This also assumes that delegate calls are
-    // performed exactly once for a set of arguments, but that should be safe.
+  // In ARC, move out of consumed arguments so that the release cleanup
+  // entered by StartFunction doesn't cause an over-release.  This isn't
+  // optimal -O0 code generation, but it should get cleaned up when
+  // optimization is enabled.  This also assumes that delegate calls are
+  // performed exactly once for a set of arguments, but that should be safe.
   } else if (getLangOpts().ObjCAutoRefCount &&
              param->hasAttr<NSConsumedAttr>() && type->isObjCRetainableType()) {
     llvm::Value *ptr = Builder.CreateLoad(local);
@@ -4213,8 +4213,8 @@ void CodeGenFunction::EmitDelegateCallArg(CallArgList &args,
     Builder.CreateStore(null, local);
     args.add(RValue::get(ptr), type);
 
-    // For the most part, we just need to load the alloca, except that
-    // aggregate r-values are actually pointers to temporaries.
+  // For the most part, we just need to load the alloca, except that
+  // aggregate r-values are actually pointers to temporaries.
   } else {
     args.add(convertTempToRValue(local, type, loc), type);
   }
@@ -4306,7 +4306,7 @@ static void emitWriteback(CodeGenFunction &CGF,
     // Release the old value.
     CGF.EmitARCRelease(oldValue, srcLV.isARCPreciseLifetime());
 
-    // Otherwise, we can just do a normal lvalue store.
+  // Otherwise, we can just do a normal lvalue store.
   } else {
     CGF.EmitStoreThroughLValue(RValue::get(value), srcLV);
   }
@@ -4347,7 +4347,7 @@ static void emitWritebackArg(CodeGenFunction &CGF, CallArgList &args,
   if (const Expr *lvExpr = maybeGetUnaryAddrOfOperand(CRE->getSubExpr())) {
     srcLV = CGF.EmitLValue(lvExpr);
 
-    // Otherwise, just emit it as a scalar.
+  // Otherwise, just emit it as a scalar.
   } else {
     Address srcAddr = CGF.EmitPointerWithAlignment(CRE->getSubExpr());
 
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 2e01adc..6f5ead7 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1228,7 +1228,11 @@ void CodeGenFunction::EmitBoundsCheckImpl(const Expr *E, llvm::Value *Bound,
   SanitizerScope SanScope(this);
 
   llvm::DILocation *CheckDI = Builder.getCurrentDebugLocation();
-  if (ClArrayBoundsPseudoFn && CheckDI) {
+  auto CheckKind = SanitizerKind::SO_ArrayBounds;
+  // TODO: deprecate ClArrayBoundsPseudoFn
+  if ((ClArrayBoundsPseudoFn ||
+       CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo.has(CheckKind)) &&
+      CheckDI) {
     CheckDI = getDebugInfo()->CreateSyntheticInlineAt(
         Builder.getCurrentDebugLocation(), "__ubsan_check_array_bounds");
   }
@@ -1245,8 +1249,8 @@ void CodeGenFunction::EmitBoundsCheckImpl(const Expr *E, llvm::Value *Bound,
   };
   llvm::Value *Check = Accessed ? Builder.CreateICmpULT(IndexVal, BoundVal)
                                 : Builder.CreateICmpULE(IndexVal, BoundVal);
-  EmitCheck(std::make_pair(Check, SanitizerKind::SO_ArrayBounds),
-            SanitizerHandler::OutOfBounds, StaticData, Index);
+  EmitCheck(std::make_pair(Check, CheckKind), SanitizerHandler::OutOfBounds,
+            StaticData, Index);
 }
 
 CodeGenFunction::ComplexPairTy CodeGenFunction::
diff --git a/clang/lib/CodeGen/CGPointerAuth.cpp b/clang/lib/CodeGen/CGPointerAuth.cpp
index 0a183a8..474848c 100644
--- a/clang/lib/CodeGen/CGPointerAuth.cpp
+++ b/clang/lib/CodeGen/CGPointerAuth.cpp
@@ -724,7 +724,6 @@ Address Address::getResignedAddress(const CGPointerAuthInfo &NewInfo,
     Val = CGF.emitPointerAuthResign(getBasePointer(), QualType(), CurInfo,
                                     NewInfo, isKnownNonNull());
 
-  Val = CGF.Builder.CreateBitCast(Val, getType());
   return Address(Val, getElementType(), getAlignment(), NewInfo,
                  /*Offset=*/nullptr, isKnownNonNull());
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index b36e078..c278176 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -6172,6 +6172,22 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
   CodeGenFunction(*this).GenerateCode(GD, Fn, FI);
 
   setNonAliasAttributes(GD, Fn);
+
+  bool ShouldAddOptNone = !CodeGenOpts.DisableO0ImplyOptNone &&
+                          (CodeGenOpts.OptimizationLevel == 0) &&
+                          !D->hasAttr<MinSizeAttr>();
+
+  if (D->hasAttr<OpenCLKernelAttr>()) {
+    if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub &&
+        !D->hasAttr<NoInlineAttr>() &&
+        !Fn->hasFnAttribute(llvm::Attribute::NoInline) &&
+        !D->hasAttr<OptimizeNoneAttr>() &&
+        !Fn->hasFnAttribute(llvm::Attribute::OptimizeNone) &&
+        !ShouldAddOptNone) {
+      Fn->addFnAttr(llvm::Attribute::AlwaysInline);
+    }
+  }
+
   SetLLVMFunctionAttributesForDefinition(D, Fn);
 
   if (const ConstructorAttr *CA = D->getAttr<ConstructorAttr>())
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index ff08bff..6ff45d1 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -76,6 +76,7 @@ static const SanitizerMask MergeDefault =
     SanitizerKind::Undefined | SanitizerKind::Vptr;
 static const SanitizerMask TrappingDefault =
     SanitizerKind::CFI | SanitizerKind::LocalBounds;
+static const SanitizerMask AnnotateDebugInfoDefault;
 static const SanitizerMask CFIClasses =
     SanitizerKind::CFIVCall | SanitizerKind::CFINVCall |
     SanitizerKind::CFIMFCall | SanitizerKind::CFIDerivedCast |
@@ -738,6 +739,13 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
   // Parse -fno-sanitize-top-hot flags
   SkipHotCutoffs = parseSanitizeSkipHotCutoffArgs(D, Args, DiagnoseErrors);
 
+  // Parse -f(no-)?sanitize-annotate-debug-info flags
+  SanitizerMask AnnotateDebugInfoKinds =
+      parseSanitizeArgs(D, Args, DiagnoseErrors, AnnotateDebugInfoDefault, {},
+                        {}, options::OPT_fsanitize_annotate_debug_info_EQ,
+                        options::OPT_fno_sanitize_annotate_debug_info_EQ);
+  AnnotateDebugInfoKinds &= Kinds;
+
   // Setup ignorelist files.
   // Add default ignorelist from resource directory for activated sanitizers,
   // and validate special case lists format.
@@ -751,6 +759,17 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
       options::OPT_fno_sanitize_ignorelist,
       clang::diag::err_drv_malformed_sanitizer_ignorelist, DiagnoseErrors);
 
+  // Verify that -fsanitize-coverage-stack-depth-callback-min is >= 0.
+  if (Arg *A = Args.getLastArg(
+          options::OPT_fsanitize_coverage_stack_depth_callback_min_EQ)) {
+    StringRef S = A->getValue();
+    if (S.getAsInteger(0, CoverageStackDepthCallbackMin) ||
+        CoverageStackDepthCallbackMin < 0) {
+      if (DiagnoseErrors)
+        D.Diag(clang::diag::err_drv_invalid_value) << A->getAsString(Args) << S;
+    }
+  }
+
   // Parse -f[no-]sanitize-memory-track-origins[=level] options.
   if (AllAddedKinds & SanitizerKind::Memory) {
     if (Arg *A =
@@ -1157,6 +1176,8 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
 
   MergeHandlers.Mask |= MergeKinds;
 
+  AnnotateDebugInfo.Mask |= AnnotateDebugInfoKinds;
+
   // Zero out SkipHotCutoffs for unused sanitizers
   SkipHotCutoffs.clear(~Sanitizers.Mask);
 }
@@ -1269,6 +1290,11 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args,
   addSpecialCaseListOpt(Args, CmdArgs, "-fsanitize-coverage-ignorelist=",
                         CoverageIgnorelistFiles);
 
+  if (CoverageStackDepthCallbackMin)
+    CmdArgs.push_back(
+        Args.MakeArgString("-fsanitize-coverage-stack-depth-callback-min=" +
+                           Twine(CoverageStackDepthCallbackMin)));
+
   if (!GPUSanitize) {
     // Translate available BinaryMetadataFeatures to corresponding clang-cc1
     // flags. Does not depend on any other sanitizers. Unsupported on GPUs.
@@ -1335,6 +1361,10 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args,
     CmdArgs.push_back(
         Args.MakeArgString("-fsanitize-skip-hot-cutoff=" + SkipHotCutoffsStr));
 
+  if (!AnnotateDebugInfo.empty())
+    CmdArgs.push_back(Args.MakeArgString("-fsanitize-annotate-debug-info=" +
+                                         toString(AnnotateDebugInfo)));
+
   addSpecialCaseListOpt(Args, CmdArgs,
                         "-fsanitize-ignorelist=", UserIgnorelistFiles);
   addSpecialCaseListOpt(Args, CmdArgs,
@@ -1518,7 +1548,10 @@ SanitizerMask parseArgValues(const Driver &D, const llvm::opt::Arg *A,
        A->getOption().matches(options::OPT_fsanitize_trap_EQ) ||
        A->getOption().matches(options::OPT_fno_sanitize_trap_EQ) ||
        A->getOption().matches(options::OPT_fsanitize_merge_handlers_EQ) ||
-       A->getOption().matches(options::OPT_fno_sanitize_merge_handlers_EQ)) &&
+       A->getOption().matches(options::OPT_fno_sanitize_merge_handlers_EQ) ||
+       A->getOption().matches(options::OPT_fsanitize_annotate_debug_info_EQ) ||
+       A->getOption().matches(
+           options::OPT_fno_sanitize_annotate_debug_info_EQ)) &&
       "Invalid argument in parseArgValues!");
   SanitizerMask Kinds;
   for (int i = 0, n = A->getNumValues(); i != n; ++i) {
diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
index 9c817f2..960ee7f 100644
--- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
@@ -255,6 +255,12 @@ void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     D.Diag(diag::err_drv_unsupported_noabicalls_pic);
   }
 
+  if (CPUName == "i6500" || CPUName == "i6400") {
+    // MIPS cpu i6400 and i6500 support MSA (Mips SIMD Architecture)
+    // by default.
+    Features.push_back("+msa");
+  }
+
   if (!UseAbiCalls)
     Features.push_back("+noabicalls");
   else
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index e4bad39..5c1bc09 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -934,7 +934,7 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
     std::optional<StringRef> OptVal =
         llvm::StringSwitch<std::optional<StringRef>>(ArgVecLib->getValue())
             .Case("Accelerate", "Accelerate")
-            .Case("libmvec", "LIBMVEC-X86")
+            .Case("libmvec", "LIBMVEC")
             .Case("MASSV", "MASSV")
             .Case("SVML", "SVML")
             .Case("SLEEF", "sleefgnuabi")
@@ -2938,8 +2938,7 @@ void tools::addMCModel(const Driver &D, const llvm::opt::ArgList &Args,
       Ok = CM == "small" || CM == "medium" ||
            (CM == "large" && Triple.isRISCV64());
     } else if (Triple.getArch() == llvm::Triple::x86_64) {
-      Ok = llvm::is_contained({"small", "kernel", "medium", "large", "tiny"},
-                              CM);
+      Ok = llvm::is_contained({"small", "kernel", "medium", "large"}, CM);
     } else if (Triple.isNVPTX() || Triple.isAMDGPU() || Triple.isSPIRV()) {
       // NVPTX/AMDGPU/SPIRV does not care about the code model and will accept
       // whatever works for the host.
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index caf386c..ac6551b2 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -1422,8 +1422,9 @@ private:
       } else if (CurrentToken && CurrentToken->is(tok::numeric_constant)) {
         Tok->setType(TT_BitFieldColon);
       } else if (Contexts.size() == 1 &&
-                 !Line.First->isOneOf(tok::kw_enum, tok::kw_case,
-                                      tok::kw_default)) {
+                 !Line.getFirstNonComment()->isOneOf(tok::kw_enum, tok::kw_case,
+                                                     tok::kw_default) &&
+                 !Line.startsWith(tok::kw_typedef, tok::kw_enum)) {
         FormatToken *Prev = Tok->getPreviousNonComment();
         if (!Prev)
           break;
@@ -1719,22 +1720,12 @@ private:
         break;
       }
       if (Style.isCSharp()) {
-        // `Type?)`, `Type?>`, `Type? name;` and `Type? name =` can only be
+        // `Type?)`, `Type?>`, `Type? name;`, and `Type? name =` can only be
         // nullable types.
-
-        // `Type?)`, `Type?>`, `Type? name;`
-        if (Tok->Next &&
-            (Tok->Next->startsSequence(tok::question, tok::r_paren) ||
-             Tok->Next->startsSequence(tok::question, tok::greater) ||
-             Tok->Next->startsSequence(tok::question, tok::identifier,
-                                       tok::semi))) {
-          Tok->setType(TT_CSharpNullable);
-          break;
-        }
-
-        // `Type? name =`
-        if (Tok->Next && Tok->Next->is(tok::identifier) && Tok->Next->Next &&
-            Tok->Next->Next->is(tok::equal)) {
+        if (const auto *Next = Tok->getNextNonComment();
+            Next && (Next->isOneOf(tok::r_paren, tok::greater) ||
+                     Next->startsSequence(tok::identifier, tok::semi) ||
+                     Next->startsSequence(tok::identifier, tok::equal))) {
           Tok->setType(TT_CSharpNullable);
           break;
         }
@@ -3093,10 +3084,12 @@ private:
     if (InTemplateArgument && NextToken->Tok.isAnyIdentifier())
       return TT_BinaryOperator;
 
-    // "&&" followed by "*" or "&" is quite unlikely to be two successive unary
-    // "&".
-    if (Tok.is(tok::ampamp) && NextToken->isOneOf(tok::star, tok::amp))
+    // "&&" followed by "(", "*", or "&" is quite unlikely to be two successive
+    // unary "&".
+    if (Tok.is(tok::ampamp) &&
+        NextToken->isOneOf(tok::l_paren, tok::star, tok::amp)) {
       return TT_BinaryOperator;
+    }
 
     // This catches some cases where evaluation order is used as control flow:
     //   aaa && aaa->f();
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 9b4147f..b59496b 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -580,13 +580,13 @@ struct ReadModuleNames : ASTReaderListener {
     ModuleMap &MM = PP.getHeaderSearchInfo().getModuleMap();
     for (const std::string &LoadedModule : LoadedModules)
       MM.cacheModuleLoad(*PP.getIdentifierInfo(LoadedModule),
-                         MM.findModule(LoadedModule));
+                         MM.findOrLoadModule(LoadedModule));
     LoadedModules.clear();
   }
 
   void markAllUnavailable() {
     for (const std::string &LoadedModule : LoadedModules) {
-      if (Module *M = PP.getHeaderSearchInfo().getModuleMap().findModule(
+      if (Module *M = PP.getHeaderSearchInfo().getModuleMap().findOrLoadModule(
               LoadedModule)) {
         M->HasIncompatibleModuleFile = true;
 
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index c7d11e6..a0b8bbf 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1838,6 +1838,10 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
   for (std::string Sanitizer : Values)
     GenerateArg(Consumer, OPT_fsanitize_skip_hot_cutoff_EQ, Sanitizer);
 
+  for (StringRef Sanitizer :
+       serializeSanitizerKinds(Opts.SanitizeAnnotateDebugInfo))
+    GenerateArg(Consumer, OPT_fsanitize_annotate_debug_info_EQ, Sanitizer);
+
   if (!Opts.EmitVersionIdentMetadata)
     GenerateArg(Consumer, OPT_Qn);
 
@@ -2332,6 +2336,11 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
       "-fsanitize-skip-hot-cutoff=",
       Args.getAllArgValues(OPT_fsanitize_skip_hot_cutoff_EQ), Diags);
 
+  parseSanitizerKinds(
+      "-fsanitize-annotate-debug-info=",
+      Args.getAllArgValues(OPT_fsanitize_annotate_debug_info_EQ), Diags,
+      Opts.SanitizeAnnotateDebugInfo);
+
   Opts.EmitVersionIdentMetadata = Args.hasFlag(OPT_Qy, OPT_Qn, true);
 
   if (!LangOpts->CUDAIsDevice)
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index 9b2aa25..f09eb98 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -621,8 +621,8 @@ static bool loadModuleMapForModuleBuild(CompilerInstance &CI, bool IsSystem,
   }
 
   // Load the module map file.
-  if (HS.loadModuleMapFile(*ModuleMap, IsSystem, ModuleMapID, &Offset,
-                           PresumedModuleMapFile))
+  if (HS.parseAndLoadModuleMapFile(*ModuleMap, IsSystem, ModuleMapID, &Offset,
+                                   PresumedModuleMapFile))
     return true;
 
   if (SrcMgr.getBufferOrFake(ModuleMapID).getBufferSize() == Offset)
@@ -1077,8 +1077,8 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI,
   // If we were asked to load any module map files, do so now.
   for (const auto &Filename : CI.getFrontendOpts().ModuleMapFiles) {
     if (auto File = CI.getFileManager().getOptionalFileRef(Filename))
-      CI.getPreprocessor().getHeaderSearchInfo().loadModuleMapFile(
-          *File, /*IsSystem*/false);
+      CI.getPreprocessor().getHeaderSearchInfo().parseAndLoadModuleMapFile(
+          *File, /*IsSystem*/ false);
     else
       CI.getDiagnostics().Report(diag::err_module_map_not_found) << Filename;
   }
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 906b0fa..96d6fb6 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -618,8 +618,10 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5");
     if (LangOpts.HIPStdPar) {
       Builder.defineMacro("__HIPSTDPAR__");
-      if (LangOpts.HIPStdParInterposeAlloc)
+      if (LangOpts.HIPStdParInterposeAlloc) {
         Builder.defineMacro("__HIPSTDPAR_INTERPOSE_ALLOC__");
+        Builder.defineMacro("__HIPSTDPAR_INTERPOSE_ALLOC_V1__");
+      }
     }
     if (LangOpts.CUDAIsDevice) {
       Builder.defineMacro("__HIP_DEVICE_COMPILE__");
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index d308cc9..7afc824 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -264,9 +264,10 @@ __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
   uint64_t __match_mask = 0;
 
   bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
+  for (uint64_t __active_mask = __lane_mask; __active_mask;
+       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
     if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      uint32_t __first = __gpu_read_first_lane_u32(__active_mask, __x);
       if (__first == __x) {
         __match_mask = __gpu_lane_mask();
         __done = 1;
@@ -283,9 +284,10 @@ __gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
   uint64_t __match_mask = 0;
 
   bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
+  for (uint64_t __active_mask = __lane_mask; __active_mask;
+       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
     if (!__done) {
-      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+      uint64_t __first = __gpu_read_first_lane_u64(__active_mask, __x);
       if (__first == __x) {
         __match_mask = __gpu_lane_mask();
         __done = 1;
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index 2665580..ea2391f 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -299,7 +299,7 @@ Module *HeaderSearch::lookupModule(StringRef ModuleName,
                                    SourceLocation ImportLoc, bool AllowSearch,
                                    bool AllowExtraModuleMapSearch) {
   // Look in the module map to determine if there is a module by this name.
-  Module *Module = ModMap.findModule(ModuleName);
+  Module *Module = ModMap.findOrLoadModule(ModuleName);
   if (Module || !AllowSearch || !HSOpts.ImplicitModuleMaps)
     return Module;
 
@@ -359,11 +359,11 @@ Module *HeaderSearch::lookupModule(StringRef ModuleName, StringRef SearchName,
     // checked
     DirectoryEntryRef NormalDir = *Dir.getDirRef();
     // Search for a module map file in this directory.
-    if (loadModuleMapFile(NormalDir, IsSystem,
-                          /*IsFramework*/false) == LMM_NewlyLoaded) {
-      // We just loaded a module map file; check whether the module is
-      // available now.
-      Module = ModMap.findModule(ModuleName);
+    if (parseModuleMapFile(NormalDir, IsSystem,
+                           /*IsFramework*/ false) == MMR_NewlyProcessed) {
+      // We just parsed a module map file; check whether the module can be
+      // loaded now.
+      Module = ModMap.findOrLoadModule(ModuleName);
       if (Module)
         break;
     }
@@ -373,10 +373,10 @@ Module *HeaderSearch::lookupModule(StringRef ModuleName, StringRef SearchName,
     SmallString<128> NestedModuleMapDirName;
     NestedModuleMapDirName = Dir.getDirRef()->getName();
     llvm::sys::path::append(NestedModuleMapDirName, ModuleName);
-    if (loadModuleMapFile(NestedModuleMapDirName, IsSystem,
-                          /*IsFramework*/false) == LMM_NewlyLoaded){
-      // If we just loaded a module map file, look for the module again.
-      Module = ModMap.findModule(ModuleName);
+    if (parseModuleMapFile(NestedModuleMapDirName, IsSystem,
+                           /*IsFramework*/ false) == MMR_NewlyProcessed) {
+      // If we just parsed a module map file, look for the module again.
+      Module = ModMap.findOrLoadModule(ModuleName);
       if (Module)
         break;
     }
@@ -393,7 +393,7 @@ Module *HeaderSearch::lookupModule(StringRef ModuleName, StringRef SearchName,
         loadSubdirectoryModuleMaps(Dir);
 
       // Look again for the module.
-      Module = ModMap.findModule(ModuleName);
+      Module = ModMap.findOrLoadModule(ModuleName);
       if (Module)
         break;
     }
@@ -1559,7 +1559,7 @@ bool HeaderSearch::hasModuleMap(StringRef FileName,
   if (!HSOpts.ImplicitModuleMaps)
     return false;
 
-  SmallVector<const DirectoryEntry *, 2> FixUpDirectories;
+  SmallVector<DirectoryEntryRef, 2> FixUpDirectories;
 
   StringRef DirName = FileName;
   do {
@@ -1574,19 +1574,20 @@ bool HeaderSearch::hasModuleMap(StringRef FileName,
       return false;
 
     // Try to load the module map file in this directory.
-    switch (loadModuleMapFile(*Dir, IsSystem,
-                              llvm::sys::path::extension(Dir->getName()) ==
-                                  ".framework")) {
-    case LMM_NewlyLoaded:
-    case LMM_AlreadyLoaded:
+    switch (parseAndLoadModuleMapFile(
+        *Dir, IsSystem,
+        llvm::sys::path::extension(Dir->getName()) == ".framework")) {
+    case MMR_NewlyProcessed:
+    case MMR_AlreadyProcessed: {
       // Success. All of the directories we stepped through inherit this module
       // map file.
+      const ModuleMapDirectoryState &MMDS = DirectoryModuleMap[*Dir];
       for (unsigned I = 0, N = FixUpDirectories.size(); I != N; ++I)
-        DirectoryHasModuleMap[FixUpDirectories[I]] = true;
+        DirectoryModuleMap[FixUpDirectories[I]] = MMDS;
       return true;
-
-    case LMM_NoDirectory:
-    case LMM_InvalidModuleMap:
+    }
+    case MMR_NoDirectory:
+    case MMR_InvalidModuleMap:
       break;
     }
 
@@ -1706,7 +1707,8 @@ bool HeaderSearch::findUsableModuleForFrameworkHeader(
 
 static OptionalFileEntryRef getPrivateModuleMap(FileEntryRef File,
                                                 FileManager &FileMgr,
-                                                DiagnosticsEngine &Diags) {
+                                                DiagnosticsEngine &Diags,
+                                                bool Diagnose = true) {
   StringRef Filename = llvm::sys::path::filename(File.getName());
   SmallString<128>  PrivateFilename(File.getDir().getName());
   if (Filename == "module.map")
@@ -1717,7 +1719,7 @@ static OptionalFileEntryRef getPrivateModuleMap(FileEntryRef File,
     return std::nullopt;
   auto PMMFile = FileMgr.getOptionalFileRef(PrivateFilename);
   if (PMMFile) {
-    if (Filename == "module.map")
+    if (Diagnose && Filename == "module.map")
       Diags.Report(diag::warn_deprecated_module_dot_map)
           << PrivateFilename << 1
           << File.getDir().getName().ends_with(".framework");
@@ -1725,9 +1727,9 @@ static OptionalFileEntryRef getPrivateModuleMap(FileEntryRef File,
   return PMMFile;
 }
 
-bool HeaderSearch::loadModuleMapFile(FileEntryRef File, bool IsSystem,
-                                     FileID ID, unsigned *Offset,
-                                     StringRef OriginalModuleMapFile) {
+bool HeaderSearch::parseAndLoadModuleMapFile(FileEntryRef File, bool IsSystem,
+                                             FileID ID, unsigned *Offset,
+                                             StringRef OriginalModuleMapFile) {
   // Find the directory for the module. For frameworks, that may require going
   // up from the 'Modules' directory.
   OptionalDirectoryEntryRef Dir;
@@ -1761,43 +1763,72 @@ bool HeaderSearch::loadModuleMapFile(FileEntryRef File, bool IsSystem,
   }
 
   assert(Dir && "module map home directory must exist");
-  switch (loadModuleMapFileImpl(File, IsSystem, *Dir, ID, Offset)) {
-  case LMM_AlreadyLoaded:
-  case LMM_NewlyLoaded:
+  switch (parseAndLoadModuleMapFileImpl(File, IsSystem, *Dir, ID, Offset)) {
+  case MMR_AlreadyProcessed:
+  case MMR_NewlyProcessed:
     return false;
-  case LMM_NoDirectory:
-  case LMM_InvalidModuleMap:
+  case MMR_NoDirectory:
+  case MMR_InvalidModuleMap:
     return true;
   }
   llvm_unreachable("Unknown load module map result");
 }
 
-HeaderSearch::LoadModuleMapResult
-HeaderSearch::loadModuleMapFileImpl(FileEntryRef File, bool IsSystem,
-                                    DirectoryEntryRef Dir, FileID ID,
-                                    unsigned *Offset) {
+HeaderSearch::ModuleMapResult
+HeaderSearch::parseAndLoadModuleMapFileImpl(FileEntryRef File, bool IsSystem,
+                                            DirectoryEntryRef Dir, FileID ID,
+                                            unsigned *Offset) {
   // Check whether we've already loaded this module map, and mark it as being
   // loaded in case we recursively try to load it from itself.
   auto AddResult = LoadedModuleMaps.insert(std::make_pair(File, true));
   if (!AddResult.second)
-    return AddResult.first->second ? LMM_AlreadyLoaded : LMM_InvalidModuleMap;
+    return AddResult.first->second ? MMR_AlreadyProcessed
+                                   : MMR_InvalidModuleMap;
 
-  if (ModMap.loadModuleMapFile(File, IsSystem, Dir, ID, Offset)) {
+  if (ModMap.parseAndLoadModuleMapFile(File, IsSystem, Dir, ID, Offset)) {
     LoadedModuleMaps[File] = false;
-    return LMM_InvalidModuleMap;
+    return MMR_InvalidModuleMap;
   }
 
   // Try to load a corresponding private module map.
   if (OptionalFileEntryRef PMMFile =
-          getPrivateModuleMap(File, FileMgr, Diags)) {
-    if (ModMap.loadModuleMapFile(*PMMFile, IsSystem, Dir)) {
+          getPrivateModuleMap(File, FileMgr, Diags, !ParsedModuleMaps[File])) {
+    if (ModMap.parseAndLoadModuleMapFile(*PMMFile, IsSystem, Dir)) {
       LoadedModuleMaps[File] = false;
-      return LMM_InvalidModuleMap;
+      return MMR_InvalidModuleMap;
+    }
+  }
+
+  // This directory has a module map.
+  return MMR_NewlyProcessed;
+}
+
+HeaderSearch::ModuleMapResult
+HeaderSearch::parseModuleMapFileImpl(FileEntryRef File, bool IsSystem,
+                                     DirectoryEntryRef Dir, FileID ID) {
+  // Check whether we've already parsed this module map, and mark it as being
+  // parsed in case we recursively try to parse it from itself.
+  auto AddResult = ParsedModuleMaps.insert(std::make_pair(File, true));
+  if (!AddResult.second)
+    return AddResult.first->second ? MMR_AlreadyProcessed
+                                   : MMR_InvalidModuleMap;
+
+  if (ModMap.parseModuleMapFile(File, IsSystem, Dir, ID)) {
+    ParsedModuleMaps[File] = false;
+    return MMR_InvalidModuleMap;
+  }
+
+  // Try to parse a corresponding private module map.
+  if (OptionalFileEntryRef PMMFile =
+          getPrivateModuleMap(File, FileMgr, Diags)) {
+    if (ModMap.parseModuleMapFile(*PMMFile, IsSystem, Dir)) {
+      ParsedModuleMaps[File] = false;
+      return MMR_InvalidModuleMap;
     }
   }
 
   // This directory has a module map.
-  return LMM_NewlyLoaded;
+  return MMR_NewlyProcessed;
 }
 
 OptionalFileEntryRef
@@ -1837,54 +1868,109 @@ HeaderSearch::lookupModuleMapFile(DirectoryEntryRef Dir, bool IsFramework) {
 Module *HeaderSearch::loadFrameworkModule(StringRef Name, DirectoryEntryRef Dir,
                                           bool IsSystem) {
   // Try to load a module map file.
-  switch (loadModuleMapFile(Dir, IsSystem, /*IsFramework*/true)) {
-  case LMM_InvalidModuleMap:
+  switch (parseAndLoadModuleMapFile(Dir, IsSystem, /*IsFramework*/ true)) {
+  case MMR_InvalidModuleMap:
     // Try to infer a module map from the framework directory.
     if (HSOpts.ImplicitModuleMaps)
       ModMap.inferFrameworkModule(Dir, IsSystem, /*Parent=*/nullptr);
     break;
 
-  case LMM_NoDirectory:
+  case MMR_NoDirectory:
     return nullptr;
 
-  case LMM_AlreadyLoaded:
-  case LMM_NewlyLoaded:
+  case MMR_AlreadyProcessed:
+  case MMR_NewlyProcessed:
     break;
   }
 
-  return ModMap.findModule(Name);
+  return ModMap.findOrLoadModule(Name);
 }
 
-HeaderSearch::LoadModuleMapResult
-HeaderSearch::loadModuleMapFile(StringRef DirName, bool IsSystem,
-                                bool IsFramework) {
+HeaderSearch::ModuleMapResult
+HeaderSearch::parseAndLoadModuleMapFile(StringRef DirName, bool IsSystem,
+                                        bool IsFramework) {
   if (auto Dir = FileMgr.getOptionalDirectoryRef(DirName))
-    return loadModuleMapFile(*Dir, IsSystem, IsFramework);
+    return parseAndLoadModuleMapFile(*Dir, IsSystem, IsFramework);
 
-  return LMM_NoDirectory;
+  return MMR_NoDirectory;
 }
 
-HeaderSearch::LoadModuleMapResult
-HeaderSearch::loadModuleMapFile(DirectoryEntryRef Dir, bool IsSystem,
-                                bool IsFramework) {
-  auto KnownDir = DirectoryHasModuleMap.find(Dir);
-  if (KnownDir != DirectoryHasModuleMap.end())
-    return KnownDir->second ? LMM_AlreadyLoaded : LMM_InvalidModuleMap;
+HeaderSearch::ModuleMapResult
+HeaderSearch::parseAndLoadModuleMapFile(DirectoryEntryRef Dir, bool IsSystem,
+                                        bool IsFramework) {
+  auto InsertRes = DirectoryModuleMap.insert(std::pair{
+      Dir, ModuleMapDirectoryState{{}, ModuleMapDirectoryState::Invalid}});
+  ModuleMapDirectoryState &MMState = InsertRes.first->second;
+  if (!InsertRes.second) {
+    switch (MMState.Status) {
+    case ModuleMapDirectoryState::Parsed:
+      break;
+    case ModuleMapDirectoryState::Loaded:
+      return MMR_AlreadyProcessed;
+    case ModuleMapDirectoryState::Invalid:
+      return MMR_InvalidModuleMap;
+    };
+  }
+
+  if (!MMState.ModuleMapFile)
+    MMState.ModuleMapFile = lookupModuleMapFile(Dir, IsFramework);
+
+  if (MMState.ModuleMapFile) {
+    ModuleMapResult Result =
+        parseAndLoadModuleMapFileImpl(*MMState.ModuleMapFile, IsSystem, Dir);
+    // Add Dir explicitly in case ModuleMapFile is in a subdirectory.
+    // E.g. Foo.framework/Modules/module.modulemap
+    //      ^Dir                  ^ModuleMapFile
+    if (Result == MMR_NewlyProcessed)
+      MMState.Status = ModuleMapDirectoryState::Loaded;
+    else if (Result == MMR_InvalidModuleMap)
+      MMState.Status = ModuleMapDirectoryState::Invalid;
+    return Result;
+  }
+  return MMR_InvalidModuleMap;
+}
+
+HeaderSearch::ModuleMapResult
+HeaderSearch::parseModuleMapFile(StringRef DirName, bool IsSystem,
+                                 bool IsFramework) {
+  if (auto Dir = FileMgr.getOptionalDirectoryRef(DirName))
+    return parseModuleMapFile(*Dir, IsSystem, IsFramework);
+
+  return MMR_NoDirectory;
+}
+
+HeaderSearch::ModuleMapResult
+HeaderSearch::parseModuleMapFile(DirectoryEntryRef Dir, bool IsSystem,
+                                 bool IsFramework) {
+  auto InsertRes = DirectoryModuleMap.insert(std::pair{
+      Dir, ModuleMapDirectoryState{{}, ModuleMapDirectoryState::Invalid}});
+  ModuleMapDirectoryState &MMState = InsertRes.first->second;
+  if (!InsertRes.second) {
+    switch (MMState.Status) {
+    case ModuleMapDirectoryState::Parsed:
+    case ModuleMapDirectoryState::Loaded:
+      return MMR_AlreadyProcessed;
+    case ModuleMapDirectoryState::Invalid:
+      return MMR_InvalidModuleMap;
+    };
+  }
+
+  if (!MMState.ModuleMapFile)
+    MMState.ModuleMapFile = lookupModuleMapFile(Dir, IsFramework);
 
-  if (OptionalFileEntryRef ModuleMapFile =
-          lookupModuleMapFile(Dir, IsFramework)) {
-    LoadModuleMapResult Result =
-        loadModuleMapFileImpl(*ModuleMapFile, IsSystem, Dir);
+  if (MMState.ModuleMapFile) {
+    ModuleMapResult Result =
+        parseModuleMapFileImpl(*MMState.ModuleMapFile, IsSystem, Dir);
     // Add Dir explicitly in case ModuleMapFile is in a subdirectory.
     // E.g. Foo.framework/Modules/module.modulemap
     //      ^Dir                  ^ModuleMapFile
-    if (Result == LMM_NewlyLoaded)
-      DirectoryHasModuleMap[Dir] = true;
-    else if (Result == LMM_InvalidModuleMap)
-      DirectoryHasModuleMap[Dir] = false;
+    if (Result == MMR_NewlyProcessed)
+      MMState.Status = ModuleMapDirectoryState::Parsed;
+    else if (Result == MMR_InvalidModuleMap)
+      MMState.Status = ModuleMapDirectoryState::Invalid;
     return Result;
   }
-  return LMM_InvalidModuleMap;
+  return MMR_InvalidModuleMap;
 }
 
 void HeaderSearch::collectAllModules(SmallVectorImpl<Module *> &Modules) {
@@ -1923,7 +2009,8 @@ void HeaderSearch::collectAllModules(SmallVectorImpl<Module *> &Modules) {
         continue;
 
       // Try to load a module map file for the search directory.
-      loadModuleMapFile(*DL.getDirRef(), IsSystem, /*IsFramework*/ false);
+      parseAndLoadModuleMapFile(*DL.getDirRef(), IsSystem,
+                                /*IsFramework*/ false);
 
       // Try to load module map files for immediate subdirectories of this
       // search directory.
@@ -1946,8 +2033,8 @@ void HeaderSearch::loadTopLevelSystemModules() {
       continue;
 
     // Try to load a module map file for the search directory.
-    loadModuleMapFile(*DL.getDirRef(), DL.isSystemHeaderDirectory(),
-                      DL.isFramework());
+    parseAndLoadModuleMapFile(*DL.getDirRef(), DL.isSystemHeaderDirectory(),
+                              DL.isFramework());
   }
 }
 
@@ -1970,8 +2057,9 @@ void HeaderSearch::loadSubdirectoryModuleMaps(DirectoryLookup &SearchDir) {
       continue;
     bool IsFramework = llvm::sys::path::extension(Dir->path()) == ".framework";
     if (IsFramework == SearchDir.isFramework())
-      loadModuleMapFile(Dir->path(), SearchDir.isSystemHeaderDirectory(),
-                        SearchDir.isFramework());
+      parseAndLoadModuleMapFile(Dir->path(),
+                                SearchDir.isSystemHeaderDirectory(),
+                                SearchDir.isFramework());
   }
 
   SearchDir.setSearchedAllModuleMaps(true);
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index c2f13fa..81a74d5 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -1051,7 +1051,9 @@ Module *ModuleMap::inferFrameworkModule(DirectoryEntryRef FrameworkDir,
           bool IsFrameworkDir = Parent.ends_with(".framework");
           if (OptionalFileEntryRef ModMapFile =
                   HeaderInfo.lookupModuleMapFile(*ParentDir, IsFrameworkDir)) {
-            loadModuleMapFile(*ModMapFile, Attrs.IsSystem, *ParentDir);
+            // TODO: Parsing a module map should populate `InferredDirectories`
+            //       so we don't need to do a full load here.
+            parseAndLoadModuleMapFile(*ModMapFile, Attrs.IsSystem, *ParentDir);
             inferred = InferredDirectories.find(*ParentDir);
           }
 
@@ -1320,6 +1322,83 @@ void ModuleMap::addHeader(Module *Mod, Module::Header Header,
     Cb->moduleMapAddHeader(HeaderEntry.getName());
 }
 
+bool ModuleMap::parseModuleMapFile(FileEntryRef File, bool IsSystem,
+                                   DirectoryEntryRef Dir, FileID ID,
+                                   SourceLocation ExternModuleLoc) {
+  llvm::DenseMap<const FileEntry *, const modulemap::ModuleMapFile *>::iterator
+      Known = ParsedModuleMap.find(File);
+  if (Known != ParsedModuleMap.end())
+    return Known->second == nullptr;
+
+  // If the module map file wasn't already entered, do so now.
+  if (ID.isInvalid()) {
+    ID = SourceMgr.translateFile(File);
+    if (ID.isInvalid() || SourceMgr.isLoadedFileID(ID)) {
+      auto FileCharacter =
+          IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap;
+      ID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter);
+    }
+  }
+
+  std::optional<llvm::MemoryBufferRef> Buffer = SourceMgr.getBufferOrNone(ID);
+  if (!Buffer) {
+    ParsedModuleMap[File] = nullptr;
+    return true;
+  }
+
+  Diags.Report(diag::remark_mmap_parse) << File.getName();
+  std::optional<modulemap::ModuleMapFile> MaybeMMF =
+      modulemap::parseModuleMap(ID, Dir, SourceMgr, Diags, IsSystem, nullptr);
+
+  if (!MaybeMMF) {
+    ParsedModuleMap[File] = nullptr;
+    return true;
+  }
+
+  ParsedModuleMaps.push_back(
+      std::make_unique<modulemap::ModuleMapFile>(std::move(*MaybeMMF)));
+  const modulemap::ModuleMapFile &MMF = *ParsedModuleMaps.back();
+  std::vector<const modulemap::ExternModuleDecl *> PendingExternalModuleMaps;
+  for (const auto &Decl : MMF.Decls) {
+    std::visit(llvm::makeVisitor(
+                   [&](const modulemap::ModuleDecl &MD) {
+                     // Only use the first part of the name even for submodules.
+                     // This will correctly load the submodule declarations when
+                     // the module is loaded.
+                     auto &ModuleDecls =
+                         ParsedModules[StringRef(MD.Id.front().first)];
+                     ModuleDecls.push_back(std::pair(&MMF, &MD));
+                   },
+                   [&](const modulemap::ExternModuleDecl &EMD) {
+                     PendingExternalModuleMaps.push_back(&EMD);
+                   }),
+               Decl);
+  }
+
+  for (const modulemap::ExternModuleDecl *EMD : PendingExternalModuleMaps) {
+    StringRef FileNameRef = EMD->Path;
+    SmallString<128> ModuleMapFileName;
+    if (llvm::sys::path::is_relative(FileNameRef)) {
+      ModuleMapFileName += Dir.getName();
+      llvm::sys::path::append(ModuleMapFileName, EMD->Path);
+      FileNameRef = ModuleMapFileName;
+    }
+
+    if (auto EFile =
+            SourceMgr.getFileManager().getOptionalFileRef(FileNameRef)) {
+      parseModuleMapFile(*EFile, IsSystem, EFile->getDir(), FileID(),
+                         ExternModuleLoc);
+    }
+  }
+
+  ParsedModuleMap[File] = &MMF;
+
+  for (const auto &Cb : Callbacks)
+    Cb->moduleMapFileRead(SourceLocation(), File, IsSystem);
+
+  return false;
+}
+
 FileID ModuleMap::getContainingModuleMapFileID(const Module *Module) const {
   if (Module->DefinitionLoc.isInvalid())
     return {};
@@ -1458,7 +1537,6 @@ bool ModuleMap::resolveConflicts(Module *Mod, bool Complain) {
 
 namespace clang {
 class ModuleMapLoader {
-  modulemap::ModuleMapFile &MMF;
   SourceManager &SourceMgr;
 
   DiagnosticsEngine &Diags;
@@ -1515,13 +1593,15 @@ class ModuleMapLoader {
   using Attributes = ModuleMap::Attributes;
 
 public:
-  ModuleMapLoader(modulemap::ModuleMapFile &MMF, SourceManager &SourceMgr,
-                  DiagnosticsEngine &Diags, ModuleMap &Map, FileID ModuleMapFID,
+  ModuleMapLoader(SourceManager &SourceMgr, DiagnosticsEngine &Diags,
+                  ModuleMap &Map, FileID ModuleMapFID,
                   DirectoryEntryRef Directory, bool IsSystem)
-      : MMF(MMF), SourceMgr(SourceMgr), Diags(Diags), Map(Map),
+      : SourceMgr(SourceMgr), Diags(Diags), Map(Map),
         ModuleMapFID(ModuleMapFID), Directory(Directory), IsSystem(IsSystem) {}
 
-  bool loadModuleMapFile();
+  bool loadModuleDecl(const modulemap::ModuleDecl &MD);
+  bool loadExternModuleDecl(const modulemap::ExternModuleDecl &EMD);
+  bool parseAndLoadModuleMapFile(const modulemap::ModuleMapFile &MMF);
 };
 
 } // namespace clang
@@ -1660,7 +1740,11 @@ void ModuleMapLoader::handleModuleDecl(const modulemap::ModuleDecl &MD) {
         Map.LangOpts.CurrentModule == ModuleName &&
         SourceMgr.getDecomposedLoc(ModuleNameLoc).first !=
             SourceMgr.getDecomposedLoc(Existing->DefinitionLoc).first;
-    if (LoadedFromASTFile || Inferred || PartOfFramework || ParsedAsMainInput) {
+    // TODO: Remove this check when we can avoid loading module maps multiple
+    //       times.
+    bool SameModuleDecl = ModuleNameLoc == Existing->DefinitionLoc;
+    if (LoadedFromASTFile || Inferred || PartOfFramework || ParsedAsMainInput ||
+        SameModuleDecl) {
       ActiveModule = PreviousActiveModule;
       // Skip the module definition.
       return;
@@ -1773,7 +1857,7 @@ void ModuleMapLoader::handleExternModuleDecl(
     FileNameRef = ModuleMapFileName;
   }
   if (auto File = SourceMgr.getFileManager().getOptionalFileRef(FileNameRef))
-    Map.loadModuleMapFile(
+    Map.parseAndLoadModuleMapFile(
         *File, IsSystem,
         Map.HeaderInfo.getHeaderSearchOpts().ModuleMapFileHomeIsCwd
             ? Directory
@@ -2104,7 +2188,19 @@ void ModuleMapLoader::handleInferredModuleDecl(
   }
 }
 
-bool ModuleMapLoader::loadModuleMapFile() {
+bool ModuleMapLoader::loadModuleDecl(const modulemap::ModuleDecl &MD) {
+  handleModuleDecl(MD);
+  return HadError;
+}
+
+bool ModuleMapLoader::loadExternModuleDecl(
+    const modulemap::ExternModuleDecl &EMD) {
+  handleExternModuleDecl(EMD);
+  return HadError;
+}
+
+bool ModuleMapLoader::parseAndLoadModuleMapFile(
+    const modulemap::ModuleMapFile &MMF) {
   for (const auto &Decl : MMF.Decls) {
     std::visit(
         llvm::makeVisitor(
@@ -2117,10 +2213,32 @@ bool ModuleMapLoader::loadModuleMapFile() {
   return HadError;
 }
 
-bool ModuleMap::loadModuleMapFile(FileEntryRef File, bool IsSystem,
-                                  DirectoryEntryRef Dir, FileID ID,
-                                  unsigned *Offset,
-                                  SourceLocation ExternModuleLoc) {
+Module *ModuleMap::findOrLoadModule(StringRef Name) {
+  llvm::StringMap<Module *>::const_iterator Known = Modules.find(Name);
+  if (Known != Modules.end())
+    return Known->getValue();
+
+  auto ParsedMod = ParsedModules.find(Name);
+  if (ParsedMod == ParsedModules.end())
+    return nullptr;
+
+  Diags.Report(diag::remark_mmap_load_module) << Name;
+
+  for (const auto &ModuleDecl : ParsedMod->second) {
+    const modulemap::ModuleMapFile &MMF = *ModuleDecl.first;
+    ModuleMapLoader Loader(SourceMgr, Diags, const_cast<ModuleMap &>(*this),
+                           MMF.ID, *MMF.Dir, MMF.IsSystem);
+    if (Loader.loadModuleDecl(*ModuleDecl.second))
+      return nullptr;
+  }
+
+  return findModule(Name);
+}
+
+bool ModuleMap::parseAndLoadModuleMapFile(FileEntryRef File, bool IsSystem,
+                                          DirectoryEntryRef Dir, FileID ID,
+                                          unsigned *Offset,
+                                          SourceLocation ExternModuleLoc) {
   assert(Target && "Missing target information");
   llvm::DenseMap<const FileEntry *, bool>::iterator Known =
       LoadedModuleMap.find(File);
@@ -2129,9 +2247,16 @@ bool ModuleMap::loadModuleMapFile(FileEntryRef File, bool IsSystem,
 
   // If the module map file wasn't already entered, do so now.
   if (ID.isInvalid()) {
-    auto FileCharacter =
-        IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap;
-    ID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter);
+    ID = SourceMgr.translateFile(File);
+    // TODO: The way we compute affecting module maps requires this to be a
+    //       local FileID. This should be changed to reuse loaded FileIDs when
+    //       available, and change the way that affecting module maps are
+    //       computed to not require this.
+    if (ID.isInvalid() || SourceMgr.isLoadedFileID(ID)) {
+      auto FileCharacter =
+          IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap;
+      ID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter);
+    }
   }
 
   assert(Target && "Missing target information");
@@ -2145,8 +2270,9 @@ bool ModuleMap::loadModuleMapFile(FileEntryRef File, bool IsSystem,
       modulemap::parseModuleMap(ID, Dir, SourceMgr, Diags, IsSystem, Offset);
   bool Result = false;
   if (MMF) {
-    ModuleMapLoader Loader(*MMF, SourceMgr, Diags, *this, ID, Dir, IsSystem);
-    Result = Loader.loadModuleMapFile();
+    Diags.Report(diag::remark_mmap_load) << File.getName();
+    ModuleMapLoader Loader(SourceMgr, Diags, *this, ID, Dir, IsSystem);
+    Result = Loader.parseAndLoadModuleMapFile(*MMF);
   }
   LoadedModuleMap[File] = Result;
 
diff --git a/clang/lib/Lex/ModuleMapFile.cpp b/clang/lib/Lex/ModuleMapFile.cpp
index 5cf4a4c..f457de8 100644
--- a/clang/lib/Lex/ModuleMapFile.cpp
+++ b/clang/lib/Lex/ModuleMapFile.cpp
@@ -169,7 +169,10 @@ modulemap::parseModuleMap(FileID ID, clang::DirectoryEntryRef Dir,
 
   if (Failed)
     return std::nullopt;
+  Parser.MMF.ID = ID;
+  Parser.MMF.Dir = Dir;
   Parser.MMF.Start = Start;
+  Parser.MMF.IsSystem = IsSystem;
   return std::move(Parser.MMF);
 }
 
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 32b08a1..546c228 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -3706,8 +3706,10 @@ ExprResult Parser::ParseRequiresExpression() {
           SkipUntil(tok::semi, tok::r_brace, SkipUntilFlags::StopBeforeMatch);
           break;
         }
+        // If there's an error consuming the closing bracket, consumeClose()
+        // will handle skipping to the nearest recovery point for us.
         if (ExprBraces.consumeClose())
-          ExprBraces.skipToEnd();
+          break;
 
         concepts::Requirement *Req = nullptr;
         SourceLocation NoexceptLoc;
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index b354bb7..def909f 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -643,6 +643,9 @@ static void ProcessAPINotes(Sema &S, TagDecl *D, const api_notes::TagInfo &Info,
   if (auto ReleaseOp = Info.SwiftReleaseOp)
     D->addAttr(
         SwiftAttrAttr::Create(S.Context, "release:" + ReleaseOp.value()));
+  if (auto DefaultOwnership = Info.SwiftDefaultOwnership)
+    D->addAttr(SwiftAttrAttr::Create(
+        S.Context, "returned_as_" + DefaultOwnership.value() + "_by_default"));
 
   if (auto ConformsTo = Info.SwiftConformance)
     D->addAttr(
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 7f45533..5dd231b 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -11647,6 +11647,15 @@ static void DiagnoseFloatingImpCast(Sema &S, Expr *E, QualType T,
   }
 }
 
+static void CheckCommaOperand(Sema &S, Expr *E, QualType T, SourceLocation CC,
+                              bool ExtraCheckForImplicitConversion) {
+  E = E->IgnoreParenImpCasts();
+  AnalyzeImplicitConversions(S, E, CC);
+
+  if (ExtraCheckForImplicitConversion && E->getType() != T)
+    S.CheckImplicitConversion(E, T, CC);
+}
+
 /// Analyze the given compound assignment for the possible losing of
 /// floating-point precision.
 static void AnalyzeCompoundAssignment(Sema &S, BinaryOperator *E) {
@@ -12464,7 +12473,7 @@ static void AnalyzeImplicitConversions(
           << OrigE->getSourceRange() << T->isBooleanType()
           << FixItHint::CreateReplacement(UO->getBeginLoc(), "!");
 
-  if (const auto *BO = dyn_cast<BinaryOperator>(SourceExpr))
+  if (auto *BO = dyn_cast<BinaryOperator>(SourceExpr)) {
     if ((BO->getOpcode() == BO_And || BO->getOpcode() == BO_Or) &&
         BO->getLHS()->isKnownToHaveBooleanValue() &&
         BO->getRHS()->isKnownToHaveBooleanValue() &&
@@ -12490,7 +12499,21 @@ static void AnalyzeImplicitConversions(
                    (BO->getOpcode() == BO_And ? "&&" : "||"));
         S.Diag(BO->getBeginLoc(), diag::note_cast_operand_to_int);
       }
+    } else if (BO->isCommaOp() && !S.getLangOpts().CPlusPlus) {
+      /// Analyze the given comma operator. The basic idea behind the analysis
+      /// is to analyze the left and right operands slightly differently. The
+      /// left operand needs to check whether the operand itself has an implicit
+      /// conversion, but not whether the left operand induces an implicit
+      /// conversion for the entire comma expression itself. This is similar to
+      /// how CheckConditionalOperand behaves; it's as-if the correct operand
+      /// were directly used for the implicit conversion check.
+      CheckCommaOperand(S, BO->getLHS(), T, BO->getOperatorLoc(),
+                        /*ExtraCheckForImplicitConversion=*/false);
+      CheckCommaOperand(S, BO->getRHS(), T, BO->getOperatorLoc(),
+                        /*ExtraCheckForImplicitConversion=*/true);
+      return;
     }
+  }
 
   // For conditional operators, we analyze the arguments as if they
   // were being fed directly into the output.
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 6b561d7..5a45198 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -349,12 +349,11 @@ ParsedType Sema::getTypeName(const IdentifierInfo &II, SourceLocation NameLoc,
           if (AllowImplicitTypename == ImplicitTypenameContext::No)
             return nullptr;
           SourceLocation QualifiedLoc = SS->getRange().getBegin();
-          if (getLangOpts().CPlusPlus20)
-            Diag(QualifiedLoc, diag::warn_cxx17_compat_implicit_typename);
-          else
-            Diag(QualifiedLoc, diag::ext_implicit_typename)
-                << NestedNameSpecifier::Create(Context, SS->getScopeRep(), &II)
-                << FixItHint::CreateInsertion(QualifiedLoc, "typename ");
+          auto DB =
+              DiagCompat(QualifiedLoc, diag_compat::implicit_typename)
+              << NestedNameSpecifier::Create(Context, SS->getScopeRep(), &II);
+          if (!getLangOpts().CPlusPlus20)
+            DB << FixItHint::CreateInsertion(QualifiedLoc, "typename ");
         }
 
         // We know from the grammar that this name refers to a type,
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 6e8aa1a..c3ef5a70 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -251,7 +251,8 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef<SourceLocation> Locs,
         << D->getDeclName();
     } else {
       Diag(Loc, diag::err_auto_variable_cannot_appear_in_own_initializer)
-        << D->getDeclName() << cast<VarDecl>(D)->getType();
+          << diag::ParsingInitFor::Var << D->getDeclName()
+          << cast<VarDecl>(D)->getType();
     }
     return true;
   }
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 8bdc230..b2a982e 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -6449,6 +6449,9 @@ void DiagnoseBuiltinDeprecation(Sema& S, TypeTrait Kind,
     case UTT_HasTrivialDestructor:
       Replacement = UTT_IsTriviallyDestructible;
       break;
+    case UTT_IsTriviallyRelocatable:
+      Replacement = clang::UTT_IsCppTriviallyRelocatable;
+      break;
     default:
       return;
   }
diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index 4bba571..bf1b76b 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -395,7 +395,7 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc,
   case ModuleDeclKind::PartitionInterface: {
     // We can't have parsed or imported a definition of this module or parsed a
     // module map defining it already.
-    if (auto *M = Map.findModule(ModuleName)) {
+    if (auto *M = Map.findOrLoadModule(ModuleName)) {
       Diag(Path[0].getLoc(), diag::err_module_redefinition) << ModuleName;
       if (M->DefinitionLoc.isValid())
         Diag(M->DefinitionLoc, diag::note_prev_module_definition);
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 95c7b6f..94f4c1c 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -3775,12 +3775,10 @@ TypeResult Sema::ActOnTemplateIdType(
       NestedNameSpecifier *NNS =
           NestedNameSpecifier::Create(Context, SS.getScopeRep(), TemplateII);
       if (AllowImplicitTypename == ImplicitTypenameContext::Yes) {
-        if (getLangOpts().CPlusPlus20)
-          Diag(SS.getBeginLoc(), diag::warn_cxx17_compat_implicit_typename);
-        else
-          Diag(SS.getBeginLoc(), diag::ext_implicit_typename)
-              << NNS
-              << FixItHint::CreateInsertion(SS.getBeginLoc(), "typename ");
+        auto DB = DiagCompat(SS.getBeginLoc(), diag_compat::implicit_typename)
+                  << NNS;
+        if (!getLangOpts().CPlusPlus20)
+          DB << FixItHint::CreateInsertion(SS.getBeginLoc(), "typename ");
       } else
         Diag(SS.getBeginLoc(), diag::err_typename_missing_template) << NNS;
 
@@ -4372,8 +4370,43 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc,
   // Produce a placeholder value if the specialization is dependent.
   if (Template->getDeclContext()->isDependentContext() ||
       TemplateSpecializationType::anyDependentTemplateArguments(
-          TemplateArgs, CTAI.CanonicalConverted))
+          TemplateArgs, CTAI.CanonicalConverted)) {
+    if (ParsingInitForAutoVars.empty())
+      return DeclResult();
+
+    auto IsSameTemplateArg = [&](const TemplateArgument &Arg1,
+                                 const TemplateArgument &Arg2) {
+      return Context.isSameTemplateArgument(Arg1, Arg2);
+    };
+
+    if (VarDecl *Var = Template->getTemplatedDecl();
+        ParsingInitForAutoVars.count(Var) &&
+        llvm::equal(
+            CTAI.CanonicalConverted,
+            Template->getTemplateParameters()->getInjectedTemplateArgs(Context),
+            IsSameTemplateArg)) {
+      Diag(TemplateNameLoc,
+           diag::err_auto_variable_cannot_appear_in_own_initializer)
+          << diag::ParsingInitFor::VarTemplate << Var << Var->getType();
+      return true;
+    }
+
+    SmallVector<VarTemplatePartialSpecializationDecl *, 4> PartialSpecs;
+    Template->getPartialSpecializations(PartialSpecs);
+    for (VarTemplatePartialSpecializationDecl *Partial : PartialSpecs)
+      if (ParsingInitForAutoVars.count(Partial) &&
+          llvm::equal(CTAI.CanonicalConverted,
+                      Partial->getTemplateArgs().asArray(),
+                      IsSameTemplateArg)) {
+        Diag(TemplateNameLoc,
+             diag::err_auto_variable_cannot_appear_in_own_initializer)
+            << diag::ParsingInitFor::VarTemplatePartialSpec << Partial
+            << Partial->getType();
+        return true;
+      }
+
     return DeclResult();
+  }
 
   // Find the variable template specialization declaration that
   // corresponds to these arguments.
@@ -4381,6 +4414,20 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc,
   if (VarTemplateSpecializationDecl *Spec =
           Template->findSpecialization(CTAI.CanonicalConverted, InsertPos)) {
     checkSpecializationReachability(TemplateNameLoc, Spec);
+    if (Spec->getType()->isUndeducedType()) {
+      if (ParsingInitForAutoVars.count(Spec))
+        Diag(TemplateNameLoc,
+             diag::err_auto_variable_cannot_appear_in_own_initializer)
+            << diag::ParsingInitFor::VarTemplateExplicitSpec << Spec
+            << Spec->getType();
+      else
+        // We are substituting the initializer of this variable template
+        // specialization.
+        Diag(TemplateNameLoc, diag::err_var_template_spec_type_depends_on_self)
+            << Spec << Spec->getType();
+
+      return true;
+    }
     // If we already have a variable template specialization, return it.
     return Spec;
   }
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index c9de363..ee1ef84 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -114,27 +114,6 @@ namespace clang {
 using namespace clang;
 using namespace sema;
 
-/// Compare two APSInts, extending and switching the sign as
-/// necessary to compare their values regardless of underlying type.
-static bool hasSameExtendedValue(llvm::APSInt X, llvm::APSInt Y) {
-  if (Y.getBitWidth() > X.getBitWidth())
-    X = X.extend(Y.getBitWidth());
-  else if (Y.getBitWidth() < X.getBitWidth())
-    Y = Y.extend(X.getBitWidth());
-
-  // If there is a signedness mismatch, correct it.
-  if (X.isSigned() != Y.isSigned()) {
-    // If the signed value is negative, then the values cannot be the same.
-    if ((Y.isSigned() && Y.isNegative()) || (X.isSigned() && X.isNegative()))
-      return false;
-
-    Y.setIsSigned(true);
-    X.setIsSigned(true);
-  }
-
-  return X == Y;
-}
-
 /// The kind of PartialOrdering we're performing template argument deduction
 /// for (C++11 [temp.deduct.partial]).
 enum class PartialOrderingKind { None, NonCall, Call };
@@ -273,7 +252,7 @@ checkDeducedTemplateArguments(ASTContext &Context,
     if (Y.getKind() == TemplateArgument::Expression ||
         Y.getKind() == TemplateArgument::Declaration ||
         (Y.getKind() == TemplateArgument::Integral &&
-         hasSameExtendedValue(X.getAsIntegral(), Y.getAsIntegral())))
+         llvm::APSInt::isSameValue(X.getAsIntegral(), Y.getAsIntegral())))
       return X.wasDeducedFromArrayBound() ? Y : X;
 
     // All other combinations are incompatible.
@@ -2574,7 +2553,7 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
 
   case TemplateArgument::Integral:
     if (A.getKind() == TemplateArgument::Integral) {
-      if (hasSameExtendedValue(P.getAsIntegral(), A.getAsIntegral()))
+      if (llvm::APSInt::isSameValue(P.getAsIntegral(), A.getAsIntegral()))
         return TemplateDeductionResult::Success;
     }
     Info.FirstArg = P;
@@ -2828,62 +2807,6 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
       /*HasDeducedAnyParam=*/nullptr);
 }
 
-/// Determine whether two template arguments are the same.
-static bool isSameTemplateArg(ASTContext &Context, const TemplateArgument &X,
-                              const TemplateArgument &Y) {
-  if (X.getKind() != Y.getKind())
-    return false;
-
-  switch (X.getKind()) {
-    case TemplateArgument::Null:
-      llvm_unreachable("Comparing NULL template argument");
-
-    case TemplateArgument::Type:
-      return Context.getCanonicalType(X.getAsType()) ==
-             Context.getCanonicalType(Y.getAsType());
-
-    case TemplateArgument::Declaration:
-      return isSameDeclaration(X.getAsDecl(), Y.getAsDecl());
-
-    case TemplateArgument::NullPtr:
-      return Context.hasSameType(X.getNullPtrType(), Y.getNullPtrType());
-
-    case TemplateArgument::Template:
-    case TemplateArgument::TemplateExpansion:
-      return Context.getCanonicalTemplateName(
-                    X.getAsTemplateOrTemplatePattern()).getAsVoidPointer() ==
-             Context.getCanonicalTemplateName(
-                    Y.getAsTemplateOrTemplatePattern()).getAsVoidPointer();
-
-    case TemplateArgument::Integral:
-      return hasSameExtendedValue(X.getAsIntegral(), Y.getAsIntegral());
-
-    case TemplateArgument::StructuralValue:
-      return X.structurallyEquals(Y);
-
-    case TemplateArgument::Expression: {
-      llvm::FoldingSetNodeID XID, YID;
-      X.getAsExpr()->Profile(XID, Context, true);
-      Y.getAsExpr()->Profile(YID, Context, true);
-      return XID == YID;
-    }
-
-    case TemplateArgument::Pack: {
-      unsigned PackIterationSize = X.pack_size();
-      if (X.pack_size() != Y.pack_size())
-        return false;
-      ArrayRef<TemplateArgument> XP = X.pack_elements();
-      ArrayRef<TemplateArgument> YP = Y.pack_elements();
-      for (unsigned i = 0; i < PackIterationSize; ++i)
-        if (!isSameTemplateArg(Context, XP[i], YP[i]))
-          return false;
-      return true;
-    }
-  }
-
-  llvm_unreachable("Invalid TemplateArgument Kind!");
-}
-
 TemplateArgumentLoc
 Sema::getTrivialTemplateArgumentLoc(const TemplateArgument &Arg,
                                     QualType NTTPType, SourceLocation Loc,
@@ -3349,7 +3272,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
       break;
     TemplateArgument PP = P.isPackExpansion() ? P.getPackExpansionPattern() : P,
                      PA = A.isPackExpansion() ? A.getPackExpansionPattern() : A;
-    if (!isSameTemplateArg(S.Context, PP, PA)) {
+    if (!S.Context.isSameTemplateArgument(PP, PA)) {
       if (!P.isPackExpansion() && !A.isPackExpansion()) {
         Info.Param = makeTemplateParameter(TPL->getParam(
             (AsStack.empty() ? As.end() : AsStack.back().begin()) -
diff --git a/clang/lib/Serialization/ASTCommon.cpp b/clang/lib/Serialization/ASTCommon.cpp
index 320ee0e..ad277f1 100644
--- a/clang/lib/Serialization/ASTCommon.cpp
+++ b/clang/lib/Serialization/ASTCommon.cpp
@@ -510,15 +510,3 @@ bool serialization::needsAnonymousDeclarationNumber(const NamedDecl *D) {
     return false;
   return isa<TagDecl, FieldDecl>(D);
 }
-
-void serialization::updateModuleTimestamp(StringRef ModuleFilename) {
-  // Overwrite the timestamp file contents so that file's mtime changes.
-  std::error_code EC;
-  llvm::raw_fd_ostream OS(ModuleFile::getTimestampFilename(ModuleFilename), EC,
-                          llvm::sys::fs::OF_TextWithCRLF);
-  if (EC)
-    return;
-  OS << "Timestamp file\n";
-  OS.close();
-  OS.clear_error(); // Avoid triggering a fatal error.
-}
diff --git a/clang/lib/Serialization/ASTCommon.h b/clang/lib/Serialization/ASTCommon.h
index ed6b8d0..371db4b 100644
--- a/clang/lib/Serialization/ASTCommon.h
+++ b/clang/lib/Serialization/ASTCommon.h
@@ -100,8 +100,6 @@ inline bool isPartOfPerModuleInitializer(const Decl *D) {
   return false;
 }
 
-void updateModuleTimestamp(StringRef ModuleFilename);
-
 } // namespace serialization
 
 } // namespace clang
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index e47bac0..a17d622 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -4952,7 +4952,8 @@ ASTReader::ASTReadResult ASTReader::ReadAST(StringRef FileName, ModuleKind Type,
       ImportedModule &M = Loaded[I];
       if (M.Mod->Kind == MK_ImplicitModule &&
           M.Mod->InputFilesValidationTimestamp < HSOpts.BuildSessionTimestamp)
-        updateModuleTimestamp(M.Mod->FileName);
+        getModuleManager().getModuleCache().updateModuleTimestamp(
+            M.Mod->FileName);
     }
   }
 
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 8c5adc3..cccf53d 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5394,7 +5394,7 @@ ASTWriter::WriteAST(llvm::PointerUnion<Sema *, Preprocessor *> Subject,
   if (WritingModule && PPRef.getHeaderSearchInfo()
                            .getHeaderSearchOpts()
                            .ModulesValidateOncePerBuildSession)
-    updateModuleTimestamp(OutputFile);
+    ModCache.updateModuleTimestamp(OutputFile);
 
   if (ShouldCacheASTInMemory) {
     // Construct MemoryBuffer and update buffer manager.
diff --git a/clang/lib/Serialization/ModuleCache.cpp b/clang/lib/Serialization/ModuleCache.cpp
index 955e5f3..4ae49c4 100644
--- a/clang/lib/Serialization/ModuleCache.cpp
+++ b/clang/lib/Serialization/ModuleCache.cpp
@@ -9,6 +9,7 @@
 #include "clang/Serialization/ModuleCache.h"
 
 #include "clang/Serialization/InMemoryModuleCache.h"
+#include "clang/Serialization/ModuleFile.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/LockFileManager.h"
 #include "llvm/Support/Path.h"
@@ -32,6 +33,28 @@ public:
     return std::make_unique<llvm::LockFileManager>(ModuleFilename);
   }
 
+  std::time_t getModuleTimestamp(StringRef ModuleFilename) override {
+    std::string TimestampFilename =
+        serialization::ModuleFile::getTimestampFilename(ModuleFilename);
+    llvm::sys::fs::file_status Status;
+    if (llvm::sys::fs::status(ModuleFilename, Status) != std::error_code{})
+      return 0;
+    return llvm::sys::toTimeT(Status.getLastModificationTime());
+  }
+
+  void updateModuleTimestamp(StringRef ModuleFilename) override {
+    // Overwrite the timestamp file contents so that file's mtime changes.
+    std::error_code EC;
+    llvm::raw_fd_ostream OS(
+        serialization::ModuleFile::getTimestampFilename(ModuleFilename), EC,
+        llvm::sys::fs::OF_TextWithCRLF);
+    if (EC)
+      return;
+    OS << "Timestamp file\n";
+    OS.close();
+    OS.clear_error(); // Avoid triggering a fatal error.
+  }
+
   InMemoryModuleCache &getInMemoryModuleCache() override { return InMemory; }
   const InMemoryModuleCache &getInMemoryModuleCache() const override {
     return InMemory;
diff --git a/clang/lib/Serialization/ModuleManager.cpp b/clang/lib/Serialization/ModuleManager.cpp
index d466ea0..e3d7ff4 100644
--- a/clang/lib/Serialization/ModuleManager.cpp
+++ b/clang/lib/Serialization/ModuleManager.cpp
@@ -174,15 +174,9 @@ ModuleManager::addModule(StringRef FileName, ModuleKind Type,
   NewModule->ImportLoc = ImportLoc;
   NewModule->InputFilesValidationTimestamp = 0;
 
-  if (NewModule->Kind == MK_ImplicitModule) {
-    std::string TimestampFilename =
-        ModuleFile::getTimestampFilename(NewModule->FileName);
-    llvm::vfs::Status Status;
-    // A cached stat value would be fine as well.
-    if (!FileMgr.getNoncachedStatValue(TimestampFilename, Status))
-      NewModule->InputFilesValidationTimestamp =
-          llvm::sys::toTimeT(Status.getLastModificationTime());
-  }
+  if (NewModule->Kind == MK_ImplicitModule)
+    NewModule->InputFilesValidationTimestamp =
+        ModCache->getModuleTimestamp(NewModule->FileName);
 
   // Load the contents of the module
   if (std::unique_ptr<llvm::MemoryBuffer> Buffer = lookupBuffer(FileName)) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
index 0a658b5..01faa92 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
@@ -381,6 +381,9 @@ public:
         }
         QualType CapturedVarQualType = CapturedVar->getType();
         auto IsUncountedPtr = isUnsafePtr(CapturedVar->getType());
+        if (C.getCaptureKind() == LCK_ByCopy &&
+            CapturedVarQualType->isReferenceType())
+          continue;
         if (IsUncountedPtr && *IsUncountedPtr)
           reportBug(C, CapturedVar, CapturedVarQualType, L);
       } else if (C.capturesThis() && shouldCheckThis) {
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
index 92ce3fa..e07e24f 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/AST/ASTContext.h"
 #include "clang/AST/AttrIterator.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/ParentMap.h"
@@ -23,6 +24,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Support/Casting.h"
 #include <optional>
 
 using namespace clang;
@@ -715,7 +717,11 @@ void ExprEngine::handleConstructor(const Expr *E,
         // actually make things worse. Placement new makes this tricky as well,
         // since it's then possible to be initializing one part of a multi-
         // dimensional array.
-        State = State->bindDefaultZero(Target, LCtx);
+        const CXXRecordDecl *TargetHeldRecord =
+            dyn_cast_or_null<CXXRecordDecl>(CE->getType()->getAsRecordDecl());
+
+        if (!TargetHeldRecord || !TargetHeldRecord->isEmpty())
+          State = State->bindDefaultZero(Target, LCtx);
       }
 
       Bldr.generateNode(CE, N, State, /*tag=*/nullptr,
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp
index 96fe40c..7f40c99 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp
@@ -14,6 +14,8 @@ using namespace dependencies;
 
 DependencyScanningService::DependencyScanningService(
     ScanningMode Mode, ScanningOutputFormat Format,
-    ScanningOptimizations OptimizeArgs, bool EagerLoadModules, bool TraceVFS)
+    ScanningOptimizations OptimizeArgs, bool EagerLoadModules, bool TraceVFS,
+    std::time_t BuildSessionTimestamp)
     : Mode(Mode), Format(Format), OptimizeArgs(OptimizeArgs),
-      EagerLoadModules(EagerLoadModules), TraceVFS(TraceVFS) {}
+      EagerLoadModules(EagerLoadModules), TraceVFS(TraceVFS),
+      BuildSessionTimestamp(BuildSessionTimestamp) {}
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
index 2443918..5c9cf3e 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
@@ -411,7 +411,7 @@ public:
     Scanned = true;
 
     // Create a compiler instance to handle the actual work.
-    auto ModCache = makeInProcessModuleCache(Service.getModuleCacheMutexes());
+    auto ModCache = makeInProcessModuleCache(Service.getModuleCacheEntries());
     ScanInstanceStorage.emplace(std::move(Invocation),
                                 std::move(PCHContainerOps), ModCache.get());
     CompilerInstance &ScanInstance = *ScanInstanceStorage;
@@ -428,6 +428,10 @@ public:
     ScanInstance.getPreprocessorOpts().AllowPCHWithDifferentModulesCachePath =
         true;
 
+    if (ScanInstance.getHeaderSearchOpts().ModulesValidateOncePerBuildSession)
+      ScanInstance.getHeaderSearchOpts().BuildSessionTimestamp =
+          Service.getBuildSessionTimestamp();
+
     ScanInstance.getFrontendOpts().GenerateGlobalModuleIndex = false;
     ScanInstance.getFrontendOpts().UseGlobalModuleIndex = false;
     // This will prevent us compiling individual modules asynchronously since
diff --git a/clang/lib/Tooling/DependencyScanning/InProcessModuleCache.cpp b/clang/lib/Tooling/DependencyScanning/InProcessModuleCache.cpp
index 71ce4d0..80db2d4 100644
--- a/clang/lib/Tooling/DependencyScanning/InProcessModuleCache.cpp
+++ b/clang/lib/Tooling/DependencyScanning/InProcessModuleCache.cpp
@@ -10,6 +10,7 @@
 
 #include "clang/Serialization/InMemoryModuleCache.h"
 #include "llvm/Support/AdvisoryLock.h"
+#include "llvm/Support/Chrono.h"
 
 #include <mutex>
 
@@ -50,7 +51,7 @@ public:
 };
 
 class InProcessModuleCache : public ModuleCache {
-  ModuleCacheMutexes &Mutexes;
+  ModuleCacheEntries &Entries;
 
   // TODO: If we changed the InMemoryModuleCache API and relied on strict
   // context hash, we could probably create more efficient thread-safe
@@ -59,19 +60,44 @@ class InProcessModuleCache : public ModuleCache {
   InMemoryModuleCache InMemory;
 
 public:
-  InProcessModuleCache(ModuleCacheMutexes &Mutexes) : Mutexes(Mutexes) {}
+  InProcessModuleCache(ModuleCacheEntries &Entries) : Entries(Entries) {}
 
   void prepareForGetLock(StringRef Filename) override {}
 
   std::unique_ptr<llvm::AdvisoryLock> getLock(StringRef Filename) override {
-    auto &Mtx = [&]() -> std::shared_mutex & {
-      std::lock_guard<std::mutex> Lock(Mutexes.Mutex);
-      auto &Mutex = Mutexes.Map[Filename];
-      if (!Mutex)
-        Mutex = std::make_unique<std::shared_mutex>();
-      return *Mutex;
+    auto &CompilationMutex = [&]() -> std::shared_mutex & {
+      std::lock_guard<std::mutex> Lock(Entries.Mutex);
+      auto &Entry = Entries.Map[Filename];
+      if (!Entry)
+        Entry = std::make_unique<ModuleCacheEntry>();
+      return Entry->CompilationMutex;
     }();
-    return std::make_unique<ReaderWriterLock>(Mtx);
+    return std::make_unique<ReaderWriterLock>(CompilationMutex);
+  }
+
+  std::time_t getModuleTimestamp(StringRef Filename) override {
+    auto &Timestamp = [&]() -> std::atomic<std::time_t> & {
+      std::lock_guard<std::mutex> Lock(Entries.Mutex);
+      auto &Entry = Entries.Map[Filename];
+      if (!Entry)
+        Entry = std::make_unique<ModuleCacheEntry>();
+      return Entry->Timestamp;
+    }();
+
+    return Timestamp.load();
+  }
+
+  void updateModuleTimestamp(StringRef Filename) override {
+    // Note: This essentially replaces FS contention with mutex contention.
+    auto &Timestamp = [&]() -> std::atomic<std::time_t> & {
+      std::lock_guard<std::mutex> Lock(Entries.Mutex);
+      auto &Entry = Entries.Map[Filename];
+      if (!Entry)
+        Entry = std::make_unique<ModuleCacheEntry>();
+      return Entry->Timestamp;
+    }();
+
+    Timestamp.store(llvm::sys::toTimeT(std::chrono::system_clock::now()));
   }
 
   InMemoryModuleCache &getInMemoryModuleCache() override { return InMemory; }
@@ -82,6 +108,6 @@ public:
 } // namespace
 
 IntrusiveRefCntPtr<ModuleCache>
-dependencies::makeInProcessModuleCache(ModuleCacheMutexes &Mutexes) {
-  return llvm::makeIntrusiveRefCnt<InProcessModuleCache>(Mutexes);
+dependencies::makeInProcessModuleCache(ModuleCacheEntries &Entries) {
+  return llvm::makeIntrusiveRefCnt<InProcessModuleCache>(Entries);
 }
diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
index 88e0da1..66fc46e 100644
--- a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
+++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
@@ -14,6 +14,11 @@ Tags:
   SwiftReleaseOp: RCRelease
   SwiftRetainOp: RCRetain
   SwiftConformsTo: MySwiftModule.MySwiftRefCountedProtocol
+- Name: RefCountedTypeWithDefaultConvention
+  SwiftImportAs: reference
+  SwiftReleaseOp: release
+  SwiftRetainOp: retain
+  SwiftDefaultOwnership: unretained
 - Name: NonCopyableType
   SwiftCopyable: false
   SwiftConformsTo: MySwiftModule.MySwiftNonCopyableProtocol
diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
index b6900fe..20b8f04 100644
--- a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
+++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
@@ -19,3 +19,7 @@ struct CopyableType { int value; };
 
 struct NonEscapableType { int value; };
 struct EscapableType { int value; };
+
+struct RefCountedTypeWithDefaultConvention {};
+inline void retain(RefCountedType *x) {}
+inline void release(RefCountedType *x) {}
diff --git a/clang/test/APINotes/swift-import-as.cpp b/clang/test/APINotes/swift-import-as.cpp
index 3981ef1..929f924 100644
--- a/clang/test/APINotes/swift-import-as.cpp
+++ b/clang/test/APINotes/swift-import-as.cpp
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter ImmortalRefType | FileCheck -check-prefix=CHECK-IMMORTAL %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter RefCountedType | FileCheck -check-prefix=CHECK-REF-COUNTED %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter RefCountedTypeWithDefaultConvention | FileCheck -check-prefix=CHECK-REF-COUNTED-DEFAULT %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter NonCopyableType | FileCheck -check-prefix=CHECK-NON-COPYABLE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter CopyableType | FileCheck -check-prefix=CHECK-COPYABLE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter NonEscapableType | FileCheck -check-prefix=CHECK-NON-ESCAPABLE %s
@@ -26,6 +27,13 @@
 // CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "release:RCRelease"
 // CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "conforms_to:MySwiftModule.MySwiftRefCountedProtocol"
 
+// CHECK-REF-COUNTED-DEFAULT: Dumping RefCountedTypeWithDefaultConvention:
+// CHECK-REF-COUNTED-DEFAULT-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct RefCountedTypeWithDefaultConvention
+// CHECK-REF-COUNTED-DEFAULT: SwiftAttrAttr {{.+}} <<invalid sloc>> "import_reference"
+// CHECK-REF-COUNTED-DEFAULT: SwiftAttrAttr {{.+}} <<invalid sloc>> "retain:retain"
+// CHECK-REF-COUNTED-DEFAULT: SwiftAttrAttr {{.+}} <<invalid sloc>> "release:release"
+// CHECK-REF-COUNTED-DEFAULT: SwiftAttrAttr {{.+}} <<invalid sloc>> "returned_as_unretained_by_default"
+
 // CHECK-NON-COPYABLE: Dumping NonCopyableType:
 // CHECK-NON-COPYABLE-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct NonCopyableType
 // CHECK-NON-COPYABLE: SwiftAttrAttr {{.+}} <<invalid sloc>> "conforms_to:MySwiftModule.MySwiftNonCopyableProtocol"
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp
index daa15d5..6b7593a 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp
@@ -137,13 +137,11 @@ void references() {
   RefCountable automatic;
   RefCountable& ref_countable_ref = automatic;
   auto foo1 = [ref_countable_ref](){ ref_countable_ref.constMethod(); };
-  // expected-warning@-1{{Captured reference 'ref_countable_ref' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
   auto foo2 = [&ref_countable_ref](){ ref_countable_ref.method(); };
   // expected-warning@-1{{Captured reference 'ref_countable_ref' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
   auto foo3 = [&](){ ref_countable_ref.method(); };
   // expected-warning@-1{{Implicitly captured reference 'ref_countable_ref' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
   auto foo4 = [=](){ ref_countable_ref.constMethod(); };
-  // expected-warning@-1{{Implicitly captured reference 'ref_countable_ref' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
 
   call(foo1);
   call(foo2);
@@ -407,3 +405,14 @@ void lambda_converted_to_function(RefCountable* obj)
     // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
   });
 }
+
+void capture_copy_in_lambda(CheckedObj& checked) {
+  callFunctionOpaque([checked]() mutable {
+    checked.method();
+  });
+  auto* ptr = &checked;
+  callFunctionOpaque([ptr]() mutable {
+    // expected-warning@-1{{Captured raw-pointer 'ptr' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
+    ptr->method();
+  });
+}
diff --git a/clang/test/Analysis/issue-137252.cpp b/clang/test/Analysis/issue-137252.cpp
new file mode 100644
index 0000000..6ca3e20
--- /dev/null
+++ b/clang/test/Analysis/issue-137252.cpp
@@ -0,0 +1,50 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=cplusplus -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=cplusplus -verify %s -DEMPTY_CLASS
+// UNSUPPORTED: system-windows
+// expected-no-diagnostics
+
+// This test reproduces the issue that previously the static analyzer
+// initialized an [[no_unique_address]] empty field to zero,
+// over-writing a non-empty field with the same offset.
+
+namespace std {
+#ifdef EMPTY_CLASS
+
+  struct default_delete {};
+  template <class _Tp, class _Dp = default_delete >
+#else
+  // Class with methods and static members is still empty:
+  template <typename T>
+  class default_delete {
+    T dump();
+    static T x;
+  };
+  template <class _Tp, class _Dp = default_delete<_Tp> >
+#endif
+  class unique_ptr {
+    [[no_unique_address]]  _Tp * __ptr_;
+    [[no_unique_address]] _Dp __deleter_;
+
+  public:
+    explicit unique_ptr(_Tp* __p) noexcept
+      : __ptr_(__p),
+        __deleter_() {}
+
+    ~unique_ptr() {
+      delete __ptr_;
+    }
+  };
+}
+
+struct X {};
+
+int main()
+{
+  // Previously a leak falsely reported here.  It was because the
+  // Static Analyzer engine simulated the initialization of
+  // `__deleter__` incorrectly.  The engine assigned zero to
+  // `__deleter__`--an empty record sharing offset with `__ptr__`.
+  // The assignment over wrote `__ptr__`.
+  std::unique_ptr<X> a(new X()); 
+  return 0;
+}
diff --git a/clang/test/C/C2y/n3369.c b/clang/test/C/C2y/n3369.c
index 389828b..db26040 100644
--- a/clang/test/C/C2y/n3369.c
+++ b/clang/test/C/C2y/n3369.c
@@ -17,7 +17,11 @@
 #error "Expected to have _Countof support"
 #endif
 
+#define NULL  ((void *) 0)
+
 int global_array[12];
+int global_multi_array[12][34];
+int global_num;
 
 void test_parsing_failures() {
   (void)_Countof;     // expected-error {{expected expression}}
@@ -36,6 +40,12 @@ void test_semantic_failures() {
                                 expected-note {{forward declaration of 'struct S'}}
   struct T { int x; };
   (void)_Countof(struct T);  // expected-error {{'_Countof' requires an argument of array type; 'struct T' invalid}}
+  struct U { int x[3]; };
+  (void)_Countof(struct U);  // expected-error {{'_Countof' requires an argument of array type; 'struct U' invalid}}
+  int a[3];
+  (void)_Countof(&a);  // expected-error {{'_Countof' requires an argument of array type; 'int (*)[3]' invalid}}
+  int *p;
+  (void)_Countof(p);  // expected-error {{'_Countof' requires an argument of array type; 'int *' invalid}}
 }
 
 void test_constant_expression_behavior(int n) {
@@ -81,6 +91,22 @@ void test_with_function_param(int array[12], int (*array_ptr)[12], int static_ar
   (void)_Countof(static_array); // expected-error {{'_Countof' requires an argument of array type; 'int *' invalid}}
 }
 
+void test_func_fix_fix(int i, char (*a)[3][5], int (*x)[_Countof(*a)], char (*)[_Generic(x, int (*)[3]: 1)]);  // expected-note {{passing argument to parameter}}
+void test_func_fix_var(int i, char (*a)[3][i], int (*x)[_Countof(*a)], char (*)[_Generic(x, int (*)[3]: 1)]);  // expected-note {{passing argument to parameter}}
+void test_func_fix_uns(int i, char (*a)[3][*], int (*x)[_Countof(*a)], char (*)[_Generic(x, int (*)[3]: 1)]);  // expected-note {{passing argument to parameter}}
+
+void test_funcs() {
+  int i3[3];
+  int i5[5];
+  char c35[3][5];
+  test_func_fix_fix(5, &c35, &i3, NULL);
+  test_func_fix_fix(5, &c35, &i5, NULL); // expected-warning {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[3]'}}
+  test_func_fix_var(5, &c35, &i3, NULL);
+  test_func_fix_var(5, &c35, &i5, NULL); // expected-warning {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[3]'}}
+  test_func_fix_uns(5, &c35, &i3, NULL);
+  test_func_fix_uns(5, &c35, &i5, NULL); // expected-warning {{incompatible pointer types passing 'int (*)[5]' to parameter of type 'int (*)[3]'}}
+}
+
 void test_multidimensional_arrays() {
   int array[12][7];
   static_assert(_Countof(array) == 12);
@@ -102,6 +128,11 @@ void test_unspecified_array_length() {
   static_assert(_Countof(**x) == 3);
 }
 
+void test_completed_array() {
+  int a[] = {1, 2, global_num};
+  static_assert(_Countof(a) == 3);
+}
+
 // Test that the return type of _Countof is what you'd expect (size_t).
 void test_return_type() {
   static_assert(_Generic(typeof(_Countof global_array), typeof(sizeof(0)) : 1, default : 0));
@@ -121,10 +152,14 @@ void test_typedefs() {
   static_assert(_Countof(*x) == 12);
 }
 
-void test_zero_size_arrays() {
+void test_zero_size_arrays(int n) {
   int array[0]; // expected-warning {{zero size arrays are an extension}}
   static_assert(_Countof(array) == 0);
   static_assert(_Countof(int[0]) == 0); // expected-warning {{zero size arrays are an extension}}
+  int multi_array[0][n]; // FIXME: Should trigger -Wzero-length-array
+  static_assert(_Countof(multi_array) == 0);
+  int another_one[0][3]; // expected-warning {{zero size arrays are an extension}}
+  static_assert(_Countof(another_one) == 0);
 }
 
 void test_struct_members() {
@@ -144,3 +179,18 @@ void test_compound_literals() {
   static_assert(_Countof((int[2]){}) == 2);
   static_assert(_Countof((int[]){1, 2, 3, 4}) == 4);	
 }
+
+/* We don't get a diagnostic for test_f1(), because it ends up unused
+ * as _Countof() results in an integer constant expression, which is not
+ * evaluated.  However, test_f2() ends up being evaluated, since 'a' is
+ * a VLA.
+ */
+static int test_f1();
+static int test_f2(); // FIXME: Should trigger function 'test_f2' has internal linkage but is not defined
+
+void test_symbols() {
+  int a[global_num][global_num];
+
+  static_assert(_Countof(global_multi_array[test_f1()]) == 34);
+  (void)_Countof(a[test_f2()]);
+}
diff --git a/clang/test/CIR/CodeGen/nonzeroinit-struct.cpp b/clang/test/CIR/CodeGen/nonzeroinit-struct.cpp
new file mode 100644
index 0000000..76832d1
--- /dev/null
+++ b/clang/test/CIR/CodeGen/nonzeroinit-struct.cpp
@@ -0,0 +1,19 @@
+// RUN: not %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - 2>&1 | FileCheck %s
+
+struct Other {
+    int x;
+};
+
+struct Trivial {
+    int x;
+    double y;
+    decltype(&Other::x) ptr;
+};
+
+// This case has a trivial default constructor, but can't be zero-initialized.
+Trivial t;
+
+// Since the case above isn't handled yet, we want a test that verifies that
+// we're failing for the right reason.
+
+// CHECK: error: ClangIR code gen Not Yet Implemented: tryEmitPrivateForVarInit: non-zero-initializable cxx record
diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp
index 0d939dd..208d8f1 100644
--- a/clang/test/CIR/CodeGen/struct.cpp
+++ b/clang/test/CIR/CodeGen/struct.cpp
@@ -12,6 +12,17 @@ IncompleteS *p;
 // LLVM: @p = dso_local global ptr null
 // OGCG: @p = global ptr null, align 8
 
+struct CompleteS {
+  int a;
+  char b;
+};
+
+CompleteS cs;
+
+// CIR:       cir.global external @cs = #cir.zero : !rec_CompleteS
+// LLVM-DAG:  @cs = dso_local global %struct.CompleteS zeroinitializer
+// OGCG-DAG:  @cs = global %struct.CompleteS zeroinitializer, align 4
+
 void f(void) {
   IncompleteS *p;
 }
@@ -28,3 +39,29 @@ void f(void) {
 // OGCG-NEXT: entry:
 // OGCG-NEXT:   %[[P:.*]] = alloca ptr, align 8
 // OGCG-NEXT:   ret void
+
+char f2(CompleteS &s) {
+  return s.b;
+}
+
+// CIR: cir.func @_Z2f2R9CompleteS(%[[ARG_S:.*]]: !cir.ptr<!rec_CompleteS>{{.*}})
+// CIR:   %[[S_ADDR:.*]] = cir.alloca !cir.ptr<!rec_CompleteS>, !cir.ptr<!cir.ptr<!rec_CompleteS>>, ["s", init, const]
+// CIR:   cir.store %[[ARG_S]], %[[S_ADDR]]
+// CIR:   %[[S_REF:.*]] = cir.load %[[S_ADDR]]
+// CIR:   %[[S_ADDR2:.*]] = cir.get_member %[[S_REF]][1] {name = "b"}
+// CIR:   %[[S_B:.*]] = cir.load %[[S_ADDR2]]
+
+// LLVM: define i8 @_Z2f2R9CompleteS(ptr %[[ARG_S:.*]])
+// LLVM:   %[[S_ADDR:.*]] = alloca ptr
+// LLVM:   store ptr %[[ARG_S]], ptr %[[S_ADDR]]
+// LLVM:   %[[S_REF:.*]] = load ptr, ptr %[[S_ADDR]], align 8
+// LLVM:   %[[S_ADDR2:.*]] = getelementptr %struct.CompleteS, ptr %[[S_REF]], i32 0, i32 1
+// LLVM:   %[[S_B:.*]] = load i8, ptr %[[S_ADDR2]]
+
+// OGCG: define{{.*}} i8 @_Z2f2R9CompleteS(ptr{{.*}} %[[ARG_S:.*]])
+// OGCG: entry:
+// OGCG:   %[[S_ADDR:.*]] = alloca ptr
+// OGCG:   store ptr %[[ARG_S]], ptr %[[S_ADDR]]
+// OGCG:   %[[S_REF:.*]] = load ptr, ptr %[[S_ADDR]]
+// OGCG:   %[[S_ADDR2:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[S_REF]], i32 0, i32 1
+// OGCG:   %[[S_B:.*]] = load i8, ptr %[[S_ADDR2]]
diff --git a/clang/test/CIR/CodeGen/switch.cpp b/clang/test/CIR/CodeGen/switch.cpp
index 3652375..0bd4e07 100644
--- a/clang/test/CIR/CodeGen/switch.cpp
+++ b/clang/test/CIR/CodeGen/switch.cpp
@@ -16,8 +16,9 @@ void sw1(int a) {
   }
   }
 }
+
 // CIR: cir.func @_Z3sw1i
-// CIR: cir.switch (%3 : !s32i) {
+// CIR: cir.switch (%[[COND:.*]] : !s32i) {
 // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) {
 // CIR: cir.break
 // CIR: cir.case(equal, [#cir.int<1> : !s32i]) {
@@ -66,12 +67,12 @@ void sw2(int a) {
 
 // CIR: cir.func @_Z3sw2i
 // CIR: cir.scope {
-// CIR-NEXT:   %1 = cir.alloca !s32i, !cir.ptr<!s32i>, ["yolo", init]
-// CIR-NEXT:   %2 = cir.alloca !s32i, !cir.ptr<!s32i>, ["fomo", init]
-// CIR:        cir.switch (%4 : !s32i) {
+// CIR-NEXT:   %[[YOLO:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["yolo", init]
+// CIR-NEXT:   %[[FOMO:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["fomo", init]
+// CIR:        cir.switch (%[[COND:.*]] : !s32i) {
 // CIR-NEXT:   cir.case(equal, [#cir.int<3> : !s32i]) {
-// CIR-NEXT:     %5 = cir.const #cir.int<0> : !s32i
-// CIR-NEXT:     cir.store %5, %2 : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT:     %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CIR-NEXT:     cir.store %[[ZERO]], %[[FOMO]] : !s32i, !cir.ptr<!s32i>
 
 // OGCG: define dso_local void @_Z3sw2i
 // OGCG: entry:
@@ -91,45 +92,80 @@ void sw2(int a) {
 // OGCG: [[SW_EPILOG]]:
 // OGCG:   ret void
 
+void sw3(int a) {
+  switch (a) {
+  default:
+    break;
+  }
+}
+
+// CIR: cir.func @_Z3sw3i
+// CIR: cir.scope {
+// CIR-NEXT:   %[[COND:.*]] = cir.load %[[A:.*]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT:   cir.switch (%[[COND]] : !s32i) {
+// CIR-NEXT:   cir.case(default, []) {
+// CIR-NEXT:     cir.break
+// CIR-NEXT:   }
+// CIR-NEXT:   cir.yield
+// CIR-NEXT:   }
+
+// OGCG: define dso_local void @_Z3sw3i
+// OGCG: entry:
+// OGCG:   %[[A_ADDR:.*]] = alloca i32, align 4
+// OGCG:   %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4
+// OGCG:   switch i32 %[[A_VAL]], label %[[DEFAULT:.*]] [
+// OGCG: [[DEFAULT]]:
+// OGCG:   br label %[[EPILOG:.*]]
+// OGCG: [[EPILOG]]:
+// OGCG:   ret void
+
 int sw4(int a) {
   switch (a) {
   case 42: {
     return 3;
   }
-  // TODO: add default case when it is upstreamed
+  default:
+    return 2;
   }
   return 0;
 }
 
 // CIR: cir.func @_Z3sw4i
-// CIR:       cir.switch (%4 : !s32i) {
+// CIR:       cir.switch (%[[COND:.*]] : !s32i) {
 // CIR-NEXT:       cir.case(equal, [#cir.int<42> : !s32i]) {
 // CIR-NEXT:         cir.scope {
-// CIR-NEXT:           %5 = cir.const #cir.int<3> : !s32i
-// CIR-NEXT:           cir.store %5, %1 : !s32i, !cir.ptr<!s32i>
-// CIR-NEXT:           %6 = cir.load %1 : !cir.ptr<!s32i>, !s32i
-// CIR-NEXT:           cir.return %6 : !s32i
+// CIR-NEXT:           %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+// CIR-NEXT:           cir.store %[[THREE]], %[[RETVAL:.*]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT:           %[[RET3:.*]] = cir.load %[[RETVAL]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT:           cir.return %[[RET3]] : !s32i
 // CIR-NEXT:         }
 // CIR-NEXT:         cir.yield
 // CIR-NEXT:       }
+// CIR-NEXT:       cir.case(default, []) {
+// CIR-NEXT:         %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+// CIR-NEXT:         cir.store %[[TWO]], %[[RETVAL]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT:         %[[RET2:.*]] = cir.load %[[RETVAL]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT:         cir.return %[[RET2]] : !s32i
+// CIR-NEXT:       }
+// CIR-NEXT:       cir.yield
+// CIR-NEXT:  }
 
 // OGCG: define dso_local noundef i32 @_Z3sw4i
 // OGCG: entry:
 // OGCG:   %[[RETVAL:.*]] = alloca i32, align 4
 // OGCG:   %[[A_ADDR:.*]] = alloca i32, align 4
 // OGCG:   %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4
-// OGCG:   switch i32 %[[A_VAL]], label %[[EPILOG:.*]] [
+// OGCG:   switch i32 %[[A_VAL]], label %[[DEFAULT:.*]] [
 // OGCG:     i32 42, label %[[SW42:.*]]
 // OGCG:   ]
 // OGCG: [[SW42]]:
 // OGCG:   br label %[[RETURN:.*]]
-// OGCG: [[EPILOG]]:
+// OGCG: [[DEFAULT]]:
 // OGCG:   br label %[[RETURN]]
 // OGCG: [[RETURN]]:
 // OGCG:   %[[RETVAL_LOAD:.*]] = load i32, ptr %[[RETVAL]], align 4
 // OGCG:   ret i32 %[[RETVAL_LOAD]]
 
-
 void sw5(int a) {
   switch (a) {
   case 1:;
@@ -137,7 +173,7 @@ void sw5(int a) {
 }
 
 // CIR: cir.func @_Z3sw5i
-// CIR: cir.switch (%1 : !s32i) {
+// CIR: cir.switch (%[[A:.*]] : !s32i) {
 // CIR-NEXT:   cir.case(equal, [#cir.int<1> : !s32i]) {
 // CIR-NEXT:     cir.yield
 // CIR-NEXT:   }
@@ -156,22 +192,138 @@ void sw5(int a) {
 // OGCG: [[SW_EPILOG]]:
 // OGCG:   ret void
 
+void sw6(int a) {
+  switch (a) {
+  case 0:
+  case 1:
+  case 2:
+    break;
+  case 3:
+  case 4:
+  case 5:
+    break;
+  }
+}
+
+// CIR: cir.func @_Z3sw6i
+// CIR: cir.switch (%[[A:.*]] : !s32i) {
+// CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) {
+// CIR-NEXT:     cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) {
+// CIR-NEXT:     cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<2> : !s32i]) {
+// CIR-NEXT:     cir.break
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) {
+// CIR-NEXT:     cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<4> : !s32i]) {
+// CIR-NEXT:     cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<5> : !s32i]) {
+// CIR-NEXT:     cir.break
+// CIR-NEXT: }
+
+
+// OGCG: define dso_local void @_Z3sw6i
+// OGCG: entry:
+// OGCG:   %[[A_ADDR:.*]] = alloca i32, align 4
+// OGCG:   store i32 %a, ptr %[[A_ADDR]], align 4
+// OGCG:   %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4
+// OGCG:   switch i32 %[[A_VAL]], label %[[EPILOG:.*]] [
+// OGCG:     i32 0, label %[[BB0:.*]]
+// OGCG:     i32 1, label %[[BB0]]
+// OGCG:     i32 2, label %[[BB0]]
+// OGCG:     i32 3, label %[[BB1:.*]]
+// OGCG:     i32 4, label %[[BB1]]
+// OGCG:     i32 5, label %[[BB1]]
+// OGCG:   ]
+// OGCG: [[BB0]]:
+// OGCG:   br label %[[EPILOG]]
+// OGCG: [[BB1]]:
+// OGCG:   br label %[[EPILOG]]
+// OGCG: [[EPILOG]]:
+// OGCG:   ret void
+
+void sw7(int a) {
+  switch (a) {
+  case 0:
+  case 1:
+  case 2:
+    int x;
+  case 3:
+  case 4:
+  case 5:
+    break;
+  }
+}
+
+// CIR: cir.func @_Z3sw7i
+// CIR: %[[X:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x"]
+// CIR: cir.switch (%[[A:.*]] : !s32i)
+// CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) {
+// CIR-NEXT:     cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) {
+// CIR-NEXT:     cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<2> : !s32i]) {
+// CIR-NEXT:     cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) {
+// CIR-NEXT:     cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<4> : !s32i]) {
+// CIR-NEXT:     cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<5> : !s32i]) {
+// CIR-NEXT:     cir.break
+// CIR-NEXT: }
+// CIR-NEXT: cir.yield
+// CIR: }
+
+// OGCG: define dso_local void @_Z3sw7i
+// OGCG: entry:
+// OGCG:   %[[A_ADDR:.*]] = alloca i32, align 4
+// OGCG:   %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4
+// OGCG:   switch i32 %[[A_VAL]], label %[[EPILOG:.*]] [
+// OGCG:     i32 0, label %[[BB0:.*]]
+// OGCG:     i32 1, label %[[BB0]]
+// OGCG:     i32 2, label %[[BB0]]
+// OGCG:     i32 3, label %[[BB1:.*]]
+// OGCG:     i32 4, label %[[BB1]]
+// OGCG:     i32 5, label %[[BB1]]
+// OGCG:   ]
+// OGCG: [[BB0]]:
+// OGCG:   br label %[[BB1]]
+// OGCG: [[BB1]]:
+// OGCG:   br label %[[EPILOG]]
+// OGCG: [[EPILOG]]:
+// OGCG:   ret void
+
+
 void sw8(int a) {
   switch (a)
   {
   case 3:
     break;
   case 4:
-  // TODO: add default case when it is upstreamed
+  default:
     break;
   }
 }
 
 // CIR:    cir.func @_Z3sw8i
-// CIR:      cir.case(equal, [#cir.int<3> : !s32i]) {
+// CIR:    cir.switch (%[[A:.*]] : !s32i)
+// CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) {
 // CIR-NEXT:   cir.break
 // CIR-NEXT: }
 // CIR-NEXT: cir.case(equal, [#cir.int<4> : !s32i]) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(default, []) {
 // CIR-NEXT:   cir.break
 // CIR-NEXT: }
 
@@ -180,33 +332,38 @@ void sw8(int a) {
 // OGCG: entry:
 // OGCG:   %[[A_ADDR:.*]] = alloca i32, align 4
 // OGCG:   %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4
-// OGCG:   switch i32 %[[A_VAL]], label %[[EPILOG:.*]] [
+// OGCG:   switch i32 %[[A_VAL]], label %[[DEFAULT:.*]] [
 // OGCG:     i32 3, label %[[SW3:.*]]
 // OGCG:     i32 4, label %[[SW4:.*]]
 // OGCG:   ]
 // OGCG: [[SW3]]:
-// OGCG:   br label %[[EPILOG]]
+// OGCG:   br label %[[EPILOG:.*]]
 // OGCG: [[SW4]]:
+// OGCG:   br label %[[DEFAULT]]
+// OGCG: [[DEFAULT]]:
 // OGCG:   br label %[[EPILOG]]
 // OGCG: [[EPILOG]]:
 // OGCG:   ret void
 
-
 void sw9(int a) {
   switch (a)
   {
   case 3:
     break;
-  // TODO: add default case when it is upstreamed
+  default:
   case 4:
     break;
   }
 }
 
 // CIR:    cir.func @_Z3sw9i
-// CIR:      cir.case(equal, [#cir.int<3> : !s32i]) {
+// CIR:    cir.switch (%[[A:.*]] : !s32i)
+// CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) {
 // CIR-NEXT:   cir.break
 // CIR-NEXT: }
+// CIR-NEXT: cir.case(default, []) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
 // CIR-NEXT: cir.case(equal, [#cir.int<4> : !s32i]) {
 // CIR-NEXT:   cir.break
 // CIR-NEXT: }
@@ -215,17 +372,123 @@ void sw9(int a) {
 // OGCG: entry:
 // OGCG:   %[[A_ADDR:.*]] = alloca i32, align 4
 // OGCG:   %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4
-// OGCG:   switch i32 %[[A_VAL]], label %[[EPILOG:.*]] [
+// OGCG:   switch i32 %[[A_VAL]], label %[[DEFAULT:.*]] [
 // OGCG:     i32 3, label %[[SW3:.*]]
 // OGCG:     i32 4, label %[[SW4:.*]]
 // OGCG:   ]
 // OGCG: [[SW3]]:
-// OGCG:   br label %[[EPILOG]]
+// OGCG:   br label %[[EPILOG:.*]]
+// OGCG: [[DEFAULT]]:
+// OGCG:   br label %[[SW4]]
 // OGCG: [[SW4]]:
 // OGCG:   br label %[[EPILOG]]
 // OGCG: [[EPILOG]]:
 // OGCG:   ret void
 
+void sw10(int a) {
+  switch (a)
+  {
+  case 3:
+    break;
+  case 4:
+  default:
+  case 5:
+    break;
+  }
+}
+
+// CIR:    cir.func @_Z4sw10i
+// CIR:    cir.switch (%[[A:.*]] : !s32i)
+// CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) {
+// CIR-NEXT:   cir.break
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<4> : !s32i]) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(default, []) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<5> : !s32i]) {
+// CIR-NEXT:   cir.break
+// CIR-NEXT: }
+
+// OGCG: define dso_local void @_Z4sw10i
+// OGCG: entry:
+// OGCG:   %[[A_ADDR:.*]] = alloca i32, align 4
+// OGCG:   %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4
+// OGCG:   switch i32 %[[A_VAL]], label %[[DEFAULT:.*]] [
+// OGCG:     i32 3, label %[[BB3:.*]]
+// OGCG:     i32 4, label %[[BB4:.*]]
+// OGCG:     i32 5, label %[[BB5:.*]]
+// OGCG:   ]
+// OGCG: [[BB3]]:
+// OGCG:   br label %[[EPILOG:.*]]
+// OGCG: [[BB4]]:
+// OGCG:   br label %[[DEFAULT]]
+// OGCG: [[DEFAULT]]:
+// OGCG:   br label %[[BB5]]
+// OGCG: [[BB5]]:
+// OGCG:   br label %[[EPILOG]]
+// OGCG: [[EPILOG]]:
+// OGCG:   ret void
+
+void sw11(int a) {
+  switch (a)
+  {
+  case 3:
+    break;
+  case 4:
+  case 5:
+  default:
+  case 6:
+  case 7:
+    break;
+  }
+}
+
+// CIR:    cir.func @_Z4sw11i
+// CIR:    cir.switch (%[[A:.*]] : !s32i)
+// CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) {
+// CIR-NEXT:   cir.break
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<4> : !s32i]) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<5> : !s32i]) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(default, []) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<6> : !s32i]) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<7> : !s32i]) {
+// CIR-NEXT:   cir.break
+// CIR-NEXT: }
+
+// OGCG: define dso_local void @_Z4sw11i
+// OGCG: entry:
+// OGCG:   %[[A_ADDR:.*]] = alloca i32, align 4
+// OGCG:   %[[A_VAL:.*]] = load i32, ptr %[[A_ADDR]], align 4
+// OGCG:   switch i32 %[[A_VAL]], label %[[DEFAULT:.*]] [
+// OGCG:     i32 3, label %[[BB3:.*]]
+// OGCG:     i32 4, label %[[BB4:.*]]
+// OGCG:     i32 5, label %[[BB4]]
+// OGCG:     i32 6, label %[[BB6:.*]]
+// OGCG:     i32 7, label %[[BB6]]
+// OGCG:   ]
+// OGCG: [[BB3]]:
+// OGCG:   br label %[[EPILOG:.*]]
+// OGCG: [[BB4]]:
+// OGCG:   br label %[[DEFAULT]]
+// OGCG: [[DEFAULT]]:
+// OGCG:   br label %[[BB6]]
+// OGCG: [[BB6]]:
+// OGCG:   br label %[[EPILOG]]
+// OGCG: [[EPILOG]]:
+// OGCG:   ret void
+
 void sw12(int a) {
   switch (a)
   {
@@ -278,7 +541,7 @@ void sw13(int a, int b) {
 // CIR-NEXT:          cir.yield
 // CIR-NEXT:        }
 // CIR-NEXT:      }
-// CIR:         cir.yield
+//      CIR:    cir.yield
 //      CIR:    }
 //      CIR:    cir.return
 
@@ -302,6 +565,114 @@ void sw13(int a, int b) {
 // OGCG: [[EPILOG2]]:
 // OGCG:   ret void
 
+void sw14(int x) {
+  switch (x) {
+    case 1:
+    case 2:
+    case 3 ... 6:
+    case 7:
+      break;
+    default:
+      break;
+  }
+}
+
+// CIR:      cir.func @_Z4sw14i
+// CIR:      cir.switch
+// CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<2> : !s32i]) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(range, [#cir.int<3> : !s32i, #cir.int<6> : !s32i]) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<7> : !s32i]) {
+// CIR-NEXT:   cir.break
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(default, []) {
+// CIR-NEXT:   cir.break
+// CIR-NEXT: }
+
+// OGCG: define dso_local void @_Z4sw14i
+// OGCG: entry:
+// OGCG:   %[[X_ADDR:.*]] = alloca i32, align 4
+// OGCG:   store i32 %x, ptr %[[X_ADDR]], align 4
+// OGCG:   %[[X_VAL:.*]] = load i32, ptr %[[X_ADDR]], align 4
+
+// OGCG:   switch i32 %[[X_VAL]], label %[[DEFAULT:.*]] [
+// OGCG-DAG:     i32 1, label %[[BB1:.*]]
+// OGCG-DAG:     i32 2, label %[[BB1]]
+// OGCG-DAG:     i32 3, label %[[BB2:.*]]
+// OGCG-DAG:     i32 4, label %[[BB2]]
+// OGCG-DAG:     i32 5, label %[[BB2]]
+// OGCG-DAG:     i32 6, label %[[BB2]]
+// OGCG-DAG:     i32 7, label %[[BB3:.*]]
+// OGCG:   ]
+// OGCG: [[BB1]]:
+// OGCG:   br label %[[BB2]]
+// OGCG: [[BB2]]:
+// OGCG:   br label %[[BB3]]
+// OGCG: [[BB3]]:
+// OGCG:   br label %[[EPILOG:.*]]
+// OGCG: [[DEFAULT]]:
+// OGCG:   br label %[[EPILOG]]
+// OGCG: [[EPILOG]]:
+// OGCG:   ret void
+
+void sw15(int x) {
+  int y;
+  switch (x) {
+    case 1:
+    case 2:
+      y = 0;
+    case 3:
+      break;
+    default:
+      break;
+  }
+}
+
+// CIR:      cir.func @_Z4sw15i
+// CIR:      %[[Y:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["y"]
+// CIR:      cir.switch
+// CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) {
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<2> : !s32i]) {
+// CIR-NEXT:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CIR-NEXT:   cir.store %[[ZERO]], %[[Y]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT:   cir.yield
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) {
+// CIR-NEXT:   cir.break
+// CIR-NEXT: }
+// CIR-NEXT: cir.case(default, []) {
+// CIR-NEXT:   cir.break
+// CIR-NEXT: }
+
+// OGCG: define dso_local void @_Z4sw15i
+// OGCG: entry:
+// OGCG:   %[[X_ADDR:.*]] = alloca i32, align 4
+// OGCG:   %[[Y:.*]] = alloca i32, align 4
+// OGCG:   store i32 %x, ptr %[[X_ADDR]], align 4
+// OGCG:   %[[X_VAL:.*]] = load i32, ptr %[[X_ADDR]], align 4
+// OGCG:   switch i32 %[[X_VAL]], label %[[DEFAULT:.*]] [
+// OGCG-DAG:     i32 1, label %[[BB0:.*]]
+// OGCG-DAG:     i32 2, label %[[BB0]]
+// OGCG-DAG:     i32 3, label %[[BB1:.*]]
+// OGCG:   ]
+// OGCG: [[BB0]]:
+// OGCG:   store i32 0, ptr %[[Y]], align 4
+// OGCG:   br label %[[BB1]]
+// OGCG: [[BB1]]:
+// OGCG:   br label %[[EPILOG:.*]]
+// OGCG: [[DEFAULT]]:
+// OGCG:   br label %[[EPILOG]]
+// OGCG: [[EPILOG]]:
+// OGCG:   ret void
+
 int nested_switch(int a) {
   switch (int b = 1; a) {
   case 0:
@@ -325,7 +696,7 @@ int nested_switch(int a) {
   return 0;
 }
 
-// CIR: cir.switch (%6 : !s32i) {
+// CIR: cir.switch (%[[COND:.*]] : !s32i) {
 // CIR:   cir.case(equal, [#cir.int<0> : !s32i]) {
 // CIR:     cir.yield
 // CIR:   }
diff --git a/clang/test/CIR/CodeGen/union.cpp b/clang/test/CIR/CodeGen/union.cpp
new file mode 100644
index 0000000..24cd93f
--- /dev/null
+++ b/clang/test/CIR/CodeGen/union.cpp
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+// Should generate a union type with all members preserved.
+union U {
+  bool b;
+  short s;
+  int i;
+  float f;
+  double d;
+};
+// CIR: !rec_U = !cir.record<union "U" {!cir.bool, !s16i, !s32i, !cir.float, !cir.double}>
+// LLVM: %union.U = type { double }
+// OGCG: %union.U = type { double }
+
+void shouldGenerateUnionAccess(union U u) {
+  u.b = true;
+  u.b;
+  u.i = 1;
+  u.i;
+  u.f = 0.1F;
+  u.f;
+  u.d = 0.1;
+  u.d;
+}
+// CIR: cir.func {{.*}}shouldGenerateUnionAccess
+// CIR:   %[[#BASE:]] = cir.get_member %0[0] {name = "b"} : !cir.ptr<!rec_U> -> !cir.ptr<!cir.bool>
+// CIR:   cir.store %{{.+}}, %[[#BASE]] : !cir.bool, !cir.ptr<!cir.bool>
+// CIR:   cir.get_member %0[0] {name = "b"} : !cir.ptr<!rec_U> -> !cir.ptr<!cir.bool>
+// CIR:   %[[#BASE:]] = cir.get_member %0[2] {name = "i"} : !cir.ptr<!rec_U> -> !cir.ptr<!s32i>
+// CIR:   cir.store %{{.+}}, %[[#BASE]] : !s32i, !cir.ptr<!s32i>
+// CIR:   %[[#BASE:]] = cir.get_member %0[2] {name = "i"} : !cir.ptr<!rec_U> -> !cir.ptr<!s32i>
+// CIR:   %[[#BASE:]] = cir.get_member %0[3] {name = "f"} : !cir.ptr<!rec_U> -> !cir.ptr<!cir.float>
+// CIR:   cir.store %{{.+}}, %[[#BASE]] : !cir.float, !cir.ptr<!cir.float>
+// CIR:   %[[#BASE:]] = cir.get_member %0[3] {name = "f"} : !cir.ptr<!rec_U> -> !cir.ptr<!cir.float>
+// CIR:   %[[#BASE:]] = cir.get_member %0[4] {name = "d"} : !cir.ptr<!rec_U> -> !cir.ptr<!cir.double>
+// CIR:   cir.store %{{.+}}, %[[#BASE]] : !cir.double, !cir.ptr<!cir.double>
+// CIR:   %[[#BASE:]] = cir.get_member %0[4] {name = "d"} : !cir.ptr<!rec_U> -> !cir.ptr<!cir.double>
+
+// LLVM: define {{.*}}shouldGenerateUnionAccess
+// LLVM:   %[[BASE:.*]] = alloca %union.U
+// LLVM:   store %union.U %{{.*}}, ptr %[[BASE]]
+// LLVM:   store i8 1, ptr %[[BASE]]
+// LLVM:   store i32 1, ptr %[[BASE]]
+// LLVM:   store float 0x3FB99999A0000000, ptr %[[BASE]]
+// LLVM:   store double 1.000000e-01, ptr %[[BASE]]
+
+// OGCG: define {{.*}}shouldGenerateUnionAccess
+// OGCG:   %[[BASE:.*]] = alloca %union.U
+// OGCG:   %[[DIVE:.*]] = getelementptr inbounds nuw %union.U, ptr %[[BASE]], i32 0, i32 0
+// OGCG:   store i64 %{{.*}}, ptr %[[DIVE]]
+// OGCG:   store i8 1, ptr %[[BASE]]
+// OGCG:   store i32 1, ptr %[[BASE]]
+// OGCG:   store float 0x3FB99999A0000000, ptr %[[BASE]]
+// OGCG:   store double 1.000000e-01, ptr %[[BASE]]
diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp
index b255a01..d636d1b 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp
@@ -193,4 +193,134 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 
+
+#pragma acc kernels
+  {
+
+#pragma acc loop worker
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK: acc.loop worker {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker device_type(nvidia, radeon) worker
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: acc.loop worker([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) device_type(nvidia, radeon) worker
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker device_type(nvidia, radeon) worker(N)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker([#acc.device_type<none>], %[[N_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_CONV]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) device_type(nvidia, radeon) worker(N + 1)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop device_type(nvidia, radeon) worker(num:N + 1)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+
+#pragma acc loop vector
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK: acc.loop vector {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop vector(%[[N_CONV]] : si32) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop vector device_type(nvidia, radeon) vector
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: acc.loop vector([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N) device_type(nvidia, radeon) vector
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop vector([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N) device_type(nvidia, radeon) vector(N + 1)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+  // CHECK-NEXT: acc.loop vector(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop device_type(nvidia, radeon) vector(length:N + 1)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+  // CHECK-NEXT: acc.loop vector(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker vector device_type(nvidia) worker vector
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: acc.loop worker([#acc.device_type<none>, #acc.device_type<nvidia>]) vector([#acc.device_type<none>, #acc.device_type<nvidia>])
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) vector(N) device_type(nvidia) worker(N) vector(N)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV3:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD3]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD4:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV4:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD4]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32, %[[N_CONV3]] : si32 [#acc.device_type<nvidia>]) vector(%[[N_CONV2]] : si32, %[[N_CONV4]] : si32 [#acc.device_type<nvidia>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+  }
 }
diff --git a/clang/test/CIR/Transforms/select.cir b/clang/test/CIR/Transforms/select.cir
new file mode 100644
index 0000000..29a5d1e
--- /dev/null
+++ b/clang/test/CIR/Transforms/select.cir
@@ -0,0 +1,60 @@
+// RUN: cir-opt -cir-canonicalize -cir-simplify -o %t.cir %s
+// RUN: FileCheck --input-file=%t.cir %s
+
+!s32i = !cir.int<s, 32>
+
+module {
+  cir.func @fold_true(%arg0 : !s32i, %arg1 : !s32i) -> !s32i {
+    %0 = cir.const #cir.bool<true> : !cir.bool
+    %1 = cir.select if %0 then %arg0 else %arg1 : (!cir.bool, !s32i, !s32i) -> !s32i
+    cir.return %1 : !s32i
+  }
+
+  //      CHECK: cir.func @fold_true(%[[ARG0:.+]]: !s32i, %[[ARG1:.+]]: !s32i) -> !s32i {
+  // CHECK-NEXT:   cir.return %[[ARG0]] : !s32i
+  // CHECK-NEXT: }
+
+  cir.func @fold_false(%arg0 : !s32i, %arg1 : !s32i) -> !s32i {
+    %0 = cir.const #cir.bool<false> : !cir.bool
+    %1 = cir.select if %0 then %arg0 else %arg1 : (!cir.bool, !s32i, !s32i) -> !s32i
+    cir.return %1 : !s32i
+  }
+
+  //      CHECK: cir.func @fold_false(%[[ARG0:.+]]: !s32i, %[[ARG1:.+]]: !s32i) -> !s32i {
+  // CHECK-NEXT:   cir.return %[[ARG1]] : !s32i
+  // CHECK-NEXT: }
+
+  cir.func @fold_to_const(%arg0 : !cir.bool) -> !s32i {
+    %0 = cir.const #cir.int<42> : !s32i
+    %1 = cir.select if %arg0 then %0 else %0 : (!cir.bool, !s32i, !s32i) -> !s32i
+    cir.return %1 : !s32i
+  }
+
+  //      CHECK: cir.func @fold_to_const(%{{.+}}: !cir.bool) -> !s32i {
+  // CHECK-NEXT:   %[[#A:]] = cir.const #cir.int<42> : !s32i
+  // CHECK-NEXT:   cir.return %[[#A]] : !s32i
+  // CHECK-NEXT: }
+
+  cir.func @simplify_1(%arg0 : !cir.bool) -> !cir.bool {
+    %0 = cir.const #cir.bool<true> : !cir.bool
+    %1 = cir.const #cir.bool<false> : !cir.bool
+    %2 = cir.select if %arg0 then %0 else %1 : (!cir.bool, !cir.bool, !cir.bool) -> !cir.bool
+    cir.return %2 : !cir.bool
+  }
+
+  //      CHECK: cir.func @simplify_1(%[[ARG0:.+]]: !cir.bool) -> !cir.bool {
+  // CHECK-NEXT:   cir.return %[[ARG0]] : !cir.bool
+  // CHECK-NEXT: }
+
+  cir.func @simplify_2(%arg0 : !cir.bool) -> !cir.bool {
+    %0 = cir.const #cir.bool<false> : !cir.bool
+    %1 = cir.const #cir.bool<true> : !cir.bool
+    %2 = cir.select if %arg0 then %0 else %1 : (!cir.bool, !cir.bool, !cir.bool) -> !cir.bool
+    cir.return %2 : !cir.bool
+  }
+
+  //      CHECK: cir.func @simplify_2(%[[ARG0:.+]]: !cir.bool) -> !cir.bool {
+  // CHECK-NEXT:   %[[#A:]] = cir.unary(not, %[[ARG0]]) : !cir.bool, !cir.bool
+  // CHECK-NEXT:   cir.return %[[#A]] : !cir.bool
+  // CHECK-NEXT: }
+}
diff --git a/clang/test/CIR/Transforms/ternary-fold.cir b/clang/test/CIR/Transforms/ternary-fold.cir
new file mode 100644
index 0000000..1192a0ce
--- /dev/null
+++ b/clang/test/CIR/Transforms/ternary-fold.cir
@@ -0,0 +1,76 @@
+// RUN: cir-opt -cir-canonicalize -cir-simplify -o %t.cir %s
+// RUN: FileCheck --input-file=%t.cir %s
+
+!s32i = !cir.int<s, 32>
+
+module {
+  cir.func @fold_ternary(%arg0: !s32i, %arg1: !s32i) -> !s32i {
+    %0 = cir.const #cir.bool<false> : !cir.bool
+    %1 = cir.ternary (%0, true {
+      cir.yield %arg0 : !s32i
+    }, false {
+      cir.yield %arg1 : !s32i
+    }) : (!cir.bool) -> !s32i
+    cir.return %1 : !s32i
+  }
+
+  //      CHECK: cir.func @fold_ternary(%{{.+}}: !s32i, %[[ARG:.+]]: !s32i) -> !s32i {
+  // CHECK-NEXT:   cir.return %[[ARG]] : !s32i
+  // CHECK-NEXT: }
+
+  cir.func @simplify_ternary(%arg0 : !cir.bool, %arg1 : !s32i) -> !s32i {
+    %0 = cir.ternary (%arg0, true {
+      %1 = cir.const #cir.int<42> : !s32i
+      cir.yield %1 : !s32i
+    }, false {
+      cir.yield %arg1 : !s32i
+    }) : (!cir.bool) -> !s32i
+    cir.return %0 : !s32i
+  }
+
+  //      CHECK: cir.func @simplify_ternary(%[[ARG0:.+]]: !cir.bool, %[[ARG1:.+]]: !s32i) -> !s32i {
+  // CHECK-NEXT:   %[[#A:]] = cir.const #cir.int<42> : !s32i
+  // CHECK-NEXT:   %[[#B:]] = cir.select if %[[ARG0]] then %[[#A]] else %[[ARG1]] : (!cir.bool, !s32i, !s32i) -> !s32i
+  // CHECK-NEXT:   cir.return %[[#B]] : !s32i
+  // CHECK-NEXT: }
+
+  cir.func @simplify_ternary_false_const(%arg0 : !cir.bool, %arg1 : !s32i) -> !s32i {
+    %0 = cir.ternary (%arg0, true {
+      cir.yield %arg1 : !s32i
+    }, false {
+      %1 = cir.const #cir.int<24> : !s32i
+      cir.yield %1 : !s32i
+    }) : (!cir.bool) -> !s32i
+    cir.return %0 : !s32i
+  }
+
+  //      CHECK: cir.func @simplify_ternary_false_const(%[[ARG0:.+]]: !cir.bool, %[[ARG1:.+]]: !s32i) -> !s32i {
+  // CHECK-NEXT:   %[[#A:]] = cir.const #cir.int<24> : !s32i
+  // CHECK-NEXT:   %[[#B:]] = cir.select if %[[ARG0]] then %[[ARG1]] else %[[#A]] : (!cir.bool, !s32i, !s32i) -> !s32i
+  // CHECK-NEXT:   cir.return %[[#B]] : !s32i
+  // CHECK-NEXT: }
+
+  cir.func @non_simplifiable_ternary(%arg0 : !cir.bool) -> !s32i {
+    %0 = cir.alloca !s32i, !cir.ptr<!s32i>, ["a", init]
+    %1 = cir.ternary (%arg0, true {
+      %2 = cir.const #cir.int<42> : !s32i
+      cir.yield %2 : !s32i
+    }, false {
+      %3 = cir.load %0 : !cir.ptr<!s32i>, !s32i
+      cir.yield %3 : !s32i
+    }) : (!cir.bool) -> !s32i
+    cir.return %1 : !s32i
+  }
+
+  //      CHECK: cir.func @non_simplifiable_ternary(%[[ARG0:.+]]: !cir.bool) -> !s32i {
+  // CHECK-NEXT:   %[[#A:]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["a", init]
+  // CHECK-NEXT:   %[[#B:]] = cir.ternary(%[[ARG0]], true {
+  // CHECK-NEXT:     %[[#C:]] = cir.const #cir.int<42> : !s32i
+  // CHECK-NEXT:     cir.yield %[[#C]] : !s32i
+  // CHECK-NEXT:   }, false {
+  // CHECK-NEXT:     %[[#D:]] = cir.load %[[#A]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT:     cir.yield %[[#D]] : !s32i
+  // CHECK-NEXT:   }) : (!cir.bool) -> !s32i
+  // CHECK-NEXT:   cir.return %[[#B]] : !s32i
+  // CHECK-NEXT: }
+}
diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp
index 6b9ad31..8b84de0 100644
--- a/clang/test/CXX/drs/cwg1xx.cpp
+++ b/clang/test/CXX/drs/cwg1xx.cpp
@@ -96,7 +96,7 @@ namespace cwg108 { // cwg108: 2.9
   template<typename T> struct A {
     struct B { typedef int X; };
     B::X x;
-    // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B::X'; implicit 'typename' is a C++20 extension}}
+    // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B::X' is a C++20 extension}}
     struct C : B { X x; };
     // expected-error@-1 {{unknown type name 'X'}}
   };
@@ -321,7 +321,7 @@ namespace cwg121 { // cwg121: 2.7
     X::Y<T> x;
     T::Y<T> y;
     // expected-error@-1 {{use 'template' keyword to treat 'Y' as a dependent template name}}
-    // cxx98-17-error@-2 {{missing 'typename' prior to dependent type name 'T::Y'; implicit 'typename' is a C++20 extension}}
+    // cxx98-17-error@-2 {{missing 'typename' prior to dependent type name 'T::Y' is a C++20 extension}}
   };
   Z<X> z;
 } // namespace cwg121
diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp
index b2ae8f8..a53a8d1 100644
--- a/clang/test/CXX/drs/cwg2xx.cpp
+++ b/clang/test/CXX/drs/cwg2xx.cpp
@@ -426,7 +426,7 @@ namespace cwg224 { // cwg224: 16
       A::type a;
       A<T>::type b;
       A<T*>::type c;
-      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T *>::type'; implicit 'typename' is a C++20 extension}}
+      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T *>::type' is a C++20 extension}}
       ::cwg224::example1::A<T>::type d;
 
       class B {
@@ -435,13 +435,13 @@ namespace cwg224 { // cwg224: 16
         A::type a;
         A<T>::type b;
         A<T*>::type c;
-        // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T *>::type'; implicit 'typename' is a C++20 extension}}
+        // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T *>::type' is a C++20 extension}}
         ::cwg224::example1::A<T>::type d;
 
         B::type e;
         A<T>::B::type f;
         A<T*>::B::type g;
-        // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T *>::B::type'; implicit 'typename' is a C++20 extension}}
+        // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T *>::B::type' is a C++20 extension}}
         typename A<T*>::B::type h;
       };
     };
@@ -450,25 +450,25 @@ namespace cwg224 { // cwg224: 16
       typedef int type;
       A<T*>::type a;
       A<T>::type b;
-      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T>::type'; implicit 'typename' is a C++20 extension}}
+      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T>::type' is a C++20 extension}}
     };
 
     template <class T1, class T2, int I> struct B {
       typedef int type;
       B<T1, T2, I>::type b1;
       B<T2, T1, I>::type b2;
-      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B<T2, T1, I>::type'; implicit 'typename' is a C++20 extension}}
+      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B<T2, T1, I>::type' is a C++20 extension}}
 
       typedef T1 my_T1;
       static const int my_I = I;
       static const int my_I2 = I+0;
       static const int my_I3 = my_I;
       B<my_T1, T2, my_I>::type b3;
-      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B<my_T1, T2, my_I>::type'; implicit 'typename' is a C++20 extension}}
+      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B<my_T1, T2, my_I>::type' is a C++20 extension}}
       B<my_T1, T2, my_I2>::type b4;
-      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B<my_T1, T2, my_I2>::type'; implicit 'typename' is a C++20 extension}}
+      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B<my_T1, T2, my_I2>::type' is a C++20 extension}}
       B<my_T1, T2, my_I3>::type b5;
-      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B<my_T1, T2, my_I3>::type'; implicit 'typename' is a C++20 extension}}
+      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'B<my_T1, T2, my_I3>::type' is a C++20 extension}}
     };
   }
 
@@ -480,7 +480,7 @@ namespace cwg224 { // cwg224: 16
       X<A::i, char>::type x;
       X<A<T>::i, double>::type y;
       X<A<T*>::i, long>::type z;
-      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'X<A<T *>::i, long>::type'; implicit 'typename' is a C++20 extension}}
+      // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'X<A<T *>::i, long>::type' is a C++20 extension}}
       int f();
     };
     template <class T> int A<T>::f() {
diff --git a/clang/test/CXX/drs/cwg4xx.cpp b/clang/test/CXX/drs/cwg4xx.cpp
index e8e2600..210f7ae 100644
--- a/clang/test/CXX/drs/cwg4xx.cpp
+++ b/clang/test/CXX/drs/cwg4xx.cpp
@@ -257,7 +257,7 @@ namespace cwg409 { // cwg409: 2.7
     A::B b2;
     A<T>::B b3;
     A<T*>::B b4;
-    // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T *>::B'; implicit 'typename' is a C++20 extension}}
+    // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'A<T *>::B' is a C++20 extension}}
   };
 } // namespace cwg409
 
diff --git a/clang/test/CXX/drs/cwg5xx.cpp b/clang/test/CXX/drs/cwg5xx.cpp
index 0825b52..1d505ad 100644
--- a/clang/test/CXX/drs/cwg5xx.cpp
+++ b/clang/test/CXX/drs/cwg5xx.cpp
@@ -254,9 +254,9 @@ namespace cwg526 { // cwg526: 2.7
     typedef int type;
     X<N>::type v1;
     X<(N)>::type v2;
-    // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'X<(N)>::type'; implicit 'typename' is a C++20 extension}}
+    // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'X<(N)>::type' is a C++20 extension}}
     X<+N>::type v3;
-    // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'X<+N>::type'; implicit 'typename' is a C++20 extension}}
+    // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'X<+N>::type' is a C++20 extension}}
   };
 } // namespace cwg526
 
@@ -783,7 +783,7 @@ struct Outer {
 };
 template <class T>
 Outer<T>::Inner* Outer<T>::Inner::self() { return this; }
-// cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'Outer<T>::Inner'; implicit 'typename' is a C++20 extension}}
+// cxx98-17-error@-1 {{missing 'typename' prior to dependent type name 'Outer<T>::Inner' is a C++20 extension}}
 
 } // namespace cwg560
 
diff --git a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p1.cpp b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p1.cpp
index 910dab1..acaeea9 100644
--- a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p1.cpp
+++ b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p1.cpp
@@ -17,7 +17,7 @@ namespace Example1 {
 
   template<class T> struct A<A<A<T>>> {
     struct C {};
-    B<B<T>>::C bc; // expected-warning {{implicit 'typename' is a C++20 extension}}
+    B<B<T>>::C bc; // expected-warning {{missing 'typename' prior to dependent type name 'B<B<T>>::C' is a C++20 extension}}
   };
 }
 
diff --git a/clang/test/ClangScanDeps/modules-canononical-module-map-case.c b/clang/test/ClangScanDeps/modules-canononical-module-map-case.c
index ccb0653d..34d5949 100644
--- a/clang/test/ClangScanDeps/modules-canononical-module-map-case.c
+++ b/clang/test/ClangScanDeps/modules-canononical-module-map-case.c
@@ -36,6 +36,17 @@ framework module FW {
       ],
       "name": "DIR/frameworks/FW.framework/Headers",
       "type": "directory"
+    },
+    {
+      "contents": [
+        {
+          "external-contents": "DIR/frameworks/FW.framework/Modules/module.modulemap",
+          "name": "module.modulemap",
+          "type": "file"
+        }
+      ],
+      "name": "DIR/frameworks/FW.framework/Modules",
+      "type": "directory"
     }
   ]
 }
diff --git a/clang/test/CodeGen/bounds-checking-debuginfo.c b/clang/test/CodeGen/bounds-checking-debuginfo.c
index 4f5ba2b..74c0666 100644
--- a/clang/test/CodeGen/bounds-checking-debuginfo.c
+++ b/clang/test/CodeGen/bounds-checking-debuginfo.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -mllvm -array-bounds-pseudofn -emit-llvm -fdebug-prefix-map=%S/= -fno-ident -fdebug-compilation-dir=%S -fsanitize=array-bounds -fsanitize-trap=array-bounds -triple x86_64 -debug-info-kind=limited %s -o - | FileCheck --check-prefix=CHECK-TRAP %s
-// RUN: %clang_cc1 -mllvm -array-bounds-pseudofn -emit-llvm -fdebug-prefix-map=%S/= -fno-ident -fdebug-compilation-dir=%S -fsanitize=array-bounds                              -triple x86_64 -debug-info-kind=limited %s -o - | FileCheck --check-prefix=CHECK-NOTRAP %s
 
+// RUN: %clang_cc1 -emit-llvm -fdebug-prefix-map=%S/= -fno-ident -fdebug-compilation-dir=%S -fsanitize=array-bounds -fsanitize-trap=array-bounds -fsanitize-annotate-debug-info=array-bounds -triple x86_64 -debug-info-kind=limited %s -o - | FileCheck --check-prefix=CHECK-TRAP %s
+// RUN: %clang_cc1 -emit-llvm -fdebug-prefix-map=%S/= -fno-ident -fdebug-compilation-dir=%S -fsanitize=array-bounds                              -fsanitize-annotate-debug-info=array-bounds -triple x86_64 -debug-info-kind=limited %s -o - | FileCheck --check-prefix=CHECK-NOTRAP %s
 
 int f();
 void d(double*);
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index 773daf5..a0e11a1 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -492,7 +492,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    ret void
 //
 //
-// GFX900: Function Attrs: convergent norecurse nounwind
+// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
 // GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test(
 // GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
 // GFX900-NEXT:  [[ENTRY:.*:]]
@@ -640,7 +640,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    ret void
 //
 //
-// GFX900: Function Attrs: convergent norecurse nounwind
+// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
 // GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel(
 // GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] {
 // GFX900-NEXT:  [[ENTRY:.*:]]
@@ -832,7 +832,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
 // GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
 // GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }
-// GFX900: attributes #[[ATTR3]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
+// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
 // GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
 // GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
 // GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index d12dcea..b94e1e7 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -106,7 +106,7 @@
 // GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" "uniform-work-group-size"="true"
+// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 
 // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 8251d6c..5adfdb6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -1141,11 +1141,9 @@ void test_cvt_scalef32_pk_fp4_f16(global unsigned int* out, half2 src, float sca
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
 // CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
-// CHECK-NEXT:    [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
 // CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
-// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
@@ -1176,7 +1174,7 @@ void test_cvt_scalef32_pk_fp4_f16(global unsigned int* out, half2 src, float sca
 // CHECK-NEXT:    store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
 // CHECK-NEXT:    ret void
 //
-void test_cvt_scalef32_pk_fp4_bf16(global unsigned int* out, bfloat2 src, float scale, uint old)
+void test_cvt_scalef32_pk_fp4_bf16(global unsigned int* out, bfloat2 src, float scale)
 {
   *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 0);
   *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 1);
diff --git a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
index 5f32231..98587c6 100644
--- a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
+++ b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
@@ -5,7 +5,6 @@
 
 kernel void ker() {};
 // CHECK: define{{.*}}@ker() #[[ATTR0:[0-9]+]]
-// CHECK: call void @__clang_ocl_kern_imp_ker() #[[ATTR2:[0-9]+]]
 
 // CHECK: define{{.*}}@__clang_ocl_kern_imp_ker() #[[ATTR1:[0-9]+]]
 
@@ -18,6 +17,3 @@ void foo() {};
 
 // CHECK: attributes #[[ATTR1]]
 // CHECK-NOT: uniform-work-group-size
-
-// CHECK: attributes #[[ATTR2]]
-// CHECK-NOT: uniform-work-group-size
diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
index 3355fe1..6c85e73 100644
--- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR 
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR 
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLESPIR 
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR 
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR 
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLESPIR 
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86 
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86 
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLEX86 
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR 
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR 
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES 
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR 
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR 
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES 
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86 
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86 
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefix=CHECK-LIFETIMES 
 
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 
@@ -39,12 +39,6 @@ void callee(int id, __global int *out) {
   out[id] = id;
 }
 
-// TRIPLESPIR: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i)
-// TRIPLESPIR:    call spir_func void @__clang_ocl_kern_imp_device_side_enqueue({{.*}})
-
-// TRIPLEX86: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i)
-// TRIPLEX86:    call void @__clang_ocl_kern_imp_device_side_enqueue({{.*}})
-
 // COMMON-LABEL: define{{.*}} void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i)
 kernel void device_side_enqueue(global int *a, global int *b, int i) {
   // SPIR: %default_queue = alloca target("spirv.Queue")
diff --git a/clang/test/CodeGenOpenCL/convergent.cl b/clang/test/CodeGenOpenCL/convergent.cl
index 123adba..53a35a4 100644
--- a/clang/test/CodeGenOpenCL/convergent.cl
+++ b/clang/test/CodeGenOpenCL/convergent.cl
@@ -127,7 +127,7 @@ void test_not_unroll() {
 // CHECK: declare spir_func void @nodupfun(){{[^#]*}} #[[attr3:[0-9]+]]
 
 // CHECK-LABEL: @assume_convergent_asm
-// CHECK: tail call void asm sideeffect "s_barrier", ""() #5
+// CHECK: tail call void asm sideeffect "s_barrier", ""() #6
 kernel void assume_convergent_asm()
 {
   __asm__ volatile("s_barrier");
@@ -138,6 +138,7 @@ kernel void assume_convergent_asm()
 // CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} }
 // CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
 // CHECK: attributes #4 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #5 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #6 = { {{[^}]*}}nounwind{{[^}]*}} }
-// CHECK: attributes #7 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} }
+// CHECK: attributes #5 = { {{[^}]*}}alwaysinline convergent{{[^}]*}} }
+// CHECK: attributes #6 = { {{[^}]*}}convergent{{[^}]*}} }
+// CHECK: attributes #7 = { {{[^}]*}}nounwind{{[^}]*}} }
+// CHECK: attributes #8 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} }
diff --git a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
index e741cf6..8e970f1 100644
--- a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
+++ b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -9,15 +9,8 @@
 typedef struct {int a;} ndrange_t;
 
 kernel void test(int i) {
-
 // AMDGPU-LABEL: define {{.*}} amdgpu_kernel void @test
-// AMDGPU-LABEL: call void @__clang_ocl_kern_imp_test(i32 noundef %0)
-
 // SPIR-LABEL: define {{.*}} spir_kernel void @test
-// SPIR-LABEL: call spir_func void @__clang_ocl_kern_imp_test(i32 noundef %0)
-
-// AMDGPU-LABEL: define {{.*}} void @__clang_ocl_kern_imp_test
-// SPIR-LABEL: define {{.*}} spir_func void @__clang_ocl_kern_imp_test
 
 // COMMON-LABEL: entry:
 // AMDGPU: %block_sizes = alloca [1 x i64]
@@ -44,5 +37,5 @@ kernel void test(int i) {
 
 // CHECK-DEBUG: ![[TESTFILE:[0-9]+]] = !DIFile(filename: "<stdin>"
 // CHECK-DEBUG: ![[TESTSCOPE:[0-9]+]] = distinct !DISubprogram(name: "test", linkageName: "__clang_ocl_kern_imp_test", {{.*}} file: ![[TESTFILE]]
-// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 33)
-// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 34, scope: ![[IFSCOPE]])
+// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 26)
+// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 27, scope: ![[IFSCOPE]])
diff --git a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl
index a5b2bee1..cd0839f 100644
--- a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl
+++ b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl
@@ -70,6 +70,14 @@ __attribute__((noinline)) kernel void callee_kern(global int *A){
   *A = 1;
 }
 
+__attribute__((optnone)) kernel void callee_kern_with_optnone_attribute(global int *A){
+  *A = 1;
+}
+
+__attribute__((always_inline)) kernel void callee_kern_with_alwaysinline_attribute(global int *A){
+  *A = 1;
+}
+
 kernel void callee_kern_Mat3X3(global Mat3X3 *in, global Mat4X4 *out) {
   out[0] = foo(in[1]);
 }
@@ -111,7 +119,8 @@ kernel void ext_KernelLargeTwoMember(struct LargeStructTwoMember u);
 kernel void caller_kern(global int* A, global Mat3X3 *mat3X3, global Mat4X4 *mat4X4, global Mat32X32 *mat32X32, global Mat64X64 *mat64X64){
   callee_kern(A);
   ext_callee_kern(A);
-
+  callee_kern_with_optnone_attribute(A);
+  callee_kern_with_alwaysinline_attribute(A);
   callee_kern_Mat3X3(mat3X3, mat4X4);
   callee_kern_Mat32X32(mat32X32, mat64X64);
   ext_callee_kern_Mat3X3(mat3X3, mat4X4);
@@ -214,7 +223,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
 // X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
 // X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern(ptr noundef align 4 [[TMP0]]) #[[ATTR4:[0-9]+]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern(ptr noundef align 4 [[TMP0]]) #[[ATTR6:[0-9]+]]
 // X86-NEXT:    ret void
 //
 //
@@ -230,6 +239,53 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 //
 //
 // X86: Function Attrs: convergent noinline norecurse nounwind optnone
+// X86-LABEL: define spir_kernel void @callee_kern_with_optnone_attribute(
+// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_with_optnone_attribute(ptr noundef align 4 [[TMP0]]) #[[ATTR6]]
+// X86-NEXT:    ret void
+//
+//
+// X86: Function Attrs: convergent noinline norecurse nounwind optnone
+// X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern_with_optnone_attribute(
+// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store i32 1, ptr [[TMP0]], align 4
+// X86-NEXT:    ret void
+//
+//
+// X86: Function Attrs: alwaysinline convergent norecurse nounwind
+// X86-LABEL: define spir_kernel void @callee_kern_with_alwaysinline_attribute(
+// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4
+// X86-NEXT:    store i32 1, ptr [[TMP1]], align 4
+// X86-NEXT:    ret void
+//
+//
+// X86: Function Attrs: alwaysinline convergent norecurse nounwind
+// X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern_with_alwaysinline_attribute(
+// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store i32 1, ptr [[TMP0]], align 4
+// X86-NEXT:    ret void
+//
+//
+// X86: Function Attrs: convergent noinline norecurse nounwind optnone
 // X86-LABEL: define spir_kernel void @callee_kern_Mat3X3(
 // X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] {
 // X86-NEXT:  entry:
@@ -239,7 +295,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR]], align 4
 // X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[IN_ADDR]], align 4
 // X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -256,7 +312,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr [[TMP0]], i32 0
 // X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IN_ADDR]], align 4
 // X86-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP1]], i32 1
-// X86-NEXT:    call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1]]) #[[ATTR4]]
+// X86-NEXT:    call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1]]) #[[ATTR6]]
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
 // X86-NEXT:    ret void
 //
@@ -271,7 +327,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR]], align 4
 // X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[IN_ADDR]], align 4
 // X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -288,7 +344,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr [[TMP0]], i32 0
 // X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IN_ADDR]], align 4
 // X86-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP1]], i32 1
-// X86-NEXT:    call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1]]) #[[ATTR4]]
+// X86-NEXT:    call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1]]) #[[ATTR6]]
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
 // X86-NEXT:    ret void
 //
@@ -297,7 +353,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-LABEL: define spir_kernel void @KernelOneMember(
 // X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] {
 // X86-NEXT:  entry:
-// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -307,7 +363,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:  entry:
 // X86-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false)
-// X86-NEXT:    call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
+// X86-NEXT:    call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -315,7 +371,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-LABEL: define spir_kernel void @KernelLargeOneMember(
 // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] {
 // X86-NEXT:  entry:
-// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -325,7 +381,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:  entry:
 // X86-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false)
-// X86-NEXT:    call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]]
+// X86-NEXT:    call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -333,7 +389,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-LABEL: define spir_kernel void @KernelTwoMember(
 // X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] {
 // X86-NEXT:  entry:
-// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -343,7 +399,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:  entry:
 // X86-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false)
-// X86-NEXT:    call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]]
+// X86-NEXT:    call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -351,7 +407,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-LABEL: define spir_kernel void @KernelLargeTwoMember(
 // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] {
 // X86-NEXT:  entry:
-// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -361,7 +417,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:  entry:
 // X86-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false)
-// X86-NEXT:    call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]]
+// X86-NEXT:    call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -384,7 +440,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4
 // X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4
 // X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_caller_kern(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]], ptr noundef align 4 [[TMP2]], ptr noundef align 4 [[TMP3]], ptr noundef align 4 [[TMP4]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_caller_kern(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]], ptr noundef align 4 [[TMP2]], ptr noundef align 4 [[TMP3]], ptr noundef align 4 [[TMP4]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -392,6 +448,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern(
 // X86-SAME: ptr noundef align 4 [[A:%.*]], ptr noundef align 4 [[MAT3X3:%.*]], ptr noundef align 4 [[MAT4X4:%.*]], ptr noundef align 4 [[MAT32X32:%.*]], ptr noundef align 4 [[MAT64X64:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META18]] !kernel_arg_access_qual [[META19]] !kernel_arg_type [[META20]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21]] {
 // X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR_I:%.*]] = alloca ptr, align 4
 // X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
 // X86-NEXT:    [[MAT3X3_ADDR:%.*]] = alloca ptr, align 4
 // X86-NEXT:    [[MAT4X4_ADDR:%.*]] = alloca ptr, align 4
@@ -403,21 +460,27 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    store ptr [[MAT32X32]], ptr [[MAT32X32_ADDR]], align 4
 // X86-NEXT:    store ptr [[MAT64X64]], ptr [[MAT64X64_ADDR]], align 4
 // X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern(ptr noundef align 4 [[TMP0]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern(ptr noundef align 4 [[TMP0]]) #[[ATTR6]]
 // X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern(ptr noundef align 4 [[TMP1]]) #[[ATTR4]]
-// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4
-// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr noundef align 4 [[TMP2]], ptr noundef align 4 [[TMP3]]) #[[ATTR4]]
-// X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4
-// X86-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr noundef align 4 [[TMP4]], ptr noundef align 4 [[TMP5]]) #[[ATTR4]]
-// X86-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4
-// X86-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr noundef align 4 [[TMP6]], ptr noundef align 4 [[TMP7]]) #[[ATTR4]]
-// X86-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4
-// X86-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr noundef align 4 [[TMP8]], ptr noundef align 4 [[TMP9]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern(ptr noundef align 4 [[TMP1]]) #[[ATTR6]]
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_with_optnone_attribute(ptr noundef align 4 [[TMP2]]) #[[ATTR6]]
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP3]], ptr [[A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_I]], align 4
+// X86-NEXT:    store i32 1, ptr [[TMP4]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4
+// X86-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4
+// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr noundef align 4 [[TMP5]], ptr noundef align 4 [[TMP6]]) #[[ATTR6]]
+// X86-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4
+// X86-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4
+// X86-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr noundef align 4 [[TMP7]], ptr noundef align 4 [[TMP8]]) #[[ATTR6]]
+// X86-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4
+// X86-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4
+// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr noundef align 4 [[TMP9]], ptr noundef align 4 [[TMP10]]) #[[ATTR6]]
+// X86-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4
+// X86-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4
+// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr noundef align 4 [[TMP11]], ptr noundef align 4 [[TMP12]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -428,7 +491,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr, align 4
 // X86-NEXT:    store ptr [[GLOBAL_STRUCTONEMEM]], ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4
 // X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_caller_kern2(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]], ptr noundef align 8 [[TMP0]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_caller_kern2(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]], ptr noundef align 8 [[TMP0]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -442,10 +505,10 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTONEMEM]], ptr align 4 [[TMP0]], i32 8, i1 false)
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTTWOMEM]], ptr align 4 [[TMP1]], i32 16, i1 false)
 // X86-NEXT:    store ptr [[GLOBAL_STRUCTONEMEM]], ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4
-// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) #[[ATTR4]]
-// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) #[[ATTR4]]
-// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR4]]
-// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) #[[ATTR6]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) #[[ATTR6]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR6]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -453,7 +516,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-LABEL: define spir_kernel void @caller_kern3(
 // X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[LARGESTRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[LARGESTRUCTTWOMEM:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] {
 // X86-NEXT:  entry:
-// X86-NEXT:    call void @__clang_ocl_kern_imp_caller_kern3(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_caller_kern3(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -465,10 +528,10 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // X86-NEXT:    [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTONEMEM]], ptr align 4 [[TMP0]], i32 800, i1 false)
 // X86-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTTWOMEM]], ptr align 4 [[TMP1]], i32 480, i1 false)
-// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR4]]
-// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR4]]
-// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR4]]
-// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR4]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR6]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR6]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR6]]
+// X86-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR6]]
 // X86-NEXT:    ret void
 //
 //
@@ -562,7 +625,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR5:[0-9]+]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR7:[0-9]+]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -578,6 +641,53 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 //
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern_with_optnone_attribute(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_with_optnone_attribute(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR7]]
+// AMDGCN-NEXT:    ret void
+//
+//
+// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
+// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern_with_optnone_attribute(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    store i32 1, ptr addrspace(1) [[TMP0]], align 4
+// AMDGCN-NEXT:    ret void
+//
+//
+// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern_with_alwaysinline_attribute(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[TMP0]], ptr addrspace(5) [[A_ADDR_I]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8
+// AMDGCN-NEXT:    store i32 1, ptr addrspace(1) [[TMP1]], align 4
+// AMDGCN-NEXT:    ret void
+//
+//
+// AMDGCN: Function Attrs: alwaysinline convergent norecurse nounwind
+// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern_with_alwaysinline_attribute(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT:    store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    store i32 1, ptr addrspace(1) [[TMP0]], align 4
+// AMDGCN-NEXT:    ret void
+//
+//
+// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern_Mat3X3(
 // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] {
 // AMDGCN-NEXT:  entry:
@@ -587,7 +697,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -606,7 +716,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
-// AMDGCN-NEXT:    [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR5]]
+// AMDGCN-NEXT:    [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR7]]
 // AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
 // AMDGCN-NEXT:    store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
@@ -624,7 +734,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -643,7 +753,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
-// AMDGCN-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR7]]
 // AMDGCN-NEXT:    call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
 // AMDGCN-NEXT:    ret void
 //
@@ -657,7 +767,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
 // AMDGCN-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -670,7 +780,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
 // AMDGCN-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
-// AMDGCN-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -680,7 +790,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -690,7 +800,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -704,7 +814,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8
 // AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -721,7 +831,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8
 // AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
 // AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
-// AMDGCN-NEXT:    call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -731,7 +841,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -741,7 +851,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -764,7 +874,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8
 // AMDGCN-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8
 // AMDGCN-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]], ptr addrspace(1) noundef align 4 [[TMP2]], ptr addrspace(1) noundef align 4 [[TMP3]], ptr addrspace(1) noundef align 4 [[TMP4]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]], ptr addrspace(1) noundef align 4 [[TMP2]], ptr addrspace(1) noundef align 4 [[TMP3]], ptr addrspace(1) noundef align 4 [[TMP4]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -772,6 +882,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern(
 // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]], ptr addrspace(1) noundef align 4 [[MAT3X3:%.*]], ptr addrspace(1) noundef align 4 [[MAT4X4:%.*]], ptr addrspace(1) noundef align 4 [[MAT32X32:%.*]], ptr addrspace(1) noundef align 4 [[MAT64X64:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META18]] !kernel_arg_access_qual [[META19]] !kernel_arg_type [[META20]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21]] {
 // AMDGCN-NEXT:  entry:
+// AMDGCN-NEXT:    [[A_ADDR_I:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[MAT3X3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[MAT4X4_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -783,21 +894,27 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    store ptr addrspace(1) [[MAT32X32]], ptr addrspace(5) [[MAT32X32_ADDR]], align 8
 // AMDGCN-NEXT:    store ptr addrspace(1) [[MAT64X64]], ptr addrspace(5) [[MAT64X64_ADDR]], align 8
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR7]]
 // AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR5]]
-// AMDGCN-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8
-// AMDGCN-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP2]], ptr addrspace(1) noundef align 4 [[TMP3]]) #[[ATTR5]]
-// AMDGCN-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP4]], ptr addrspace(1) noundef align 4 [[TMP5]]) #[[ATTR5]]
-// AMDGCN-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP6]], ptr addrspace(1) noundef align 4 [[TMP7]]) #[[ATTR5]]
-// AMDGCN-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8
-// AMDGCN-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP8]], ptr addrspace(1) noundef align 4 [[TMP9]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR7]]
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_with_optnone_attribute(ptr addrspace(1) noundef align 4 [[TMP2]]) #[[ATTR7]]
+// AMDGCN-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
+// AMDGCN-NEXT:    store ptr addrspace(1) [[TMP3]], ptr addrspace(5) [[A_ADDR_I]], align 8
+// AMDGCN-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR_I]], align 8
+// AMDGCN-NEXT:    store i32 1, ptr addrspace(1) [[TMP4]], align 4
+// AMDGCN-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP5]], ptr addrspace(1) noundef align 4 [[TMP6]]) #[[ATTR7]]
+// AMDGCN-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP7]], ptr addrspace(1) noundef align 4 [[TMP8]]) #[[ATTR7]]
+// AMDGCN-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP9]], ptr addrspace(1) noundef align 4 [[TMP10]]) #[[ATTR7]]
+// AMDGCN-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP11]], ptr addrspace(1) noundef align 4 [[TMP12]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -819,7 +936,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8
 // AMDGCN-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1
 // AMDGCN-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> [[TMP2]], ptr addrspace(1) noundef align 8 [[TMP1]], <2 x i32> [[TMP4]], <2 x i32> [[TMP6]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> [[TMP2]], ptr addrspace(1) noundef align 8 [[TMP1]], <2 x i32> [[TMP4]], <2 x i32> [[TMP6]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -839,20 +956,20 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    store ptr addrspace(1) [[GLOBAL_STRUCTONEMEM]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8
 // AMDGCN-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP2]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP2]]) #[[ATTR7]]
 // AMDGCN-NEXT:    [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE2]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelOneMember(<2 x i32> [[TMP3]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelOneMember(<2 x i32> [[TMP3]]) #[[ATTR7]]
 // AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
 // AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1
 // AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR7]]
 // AMDGCN-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP8]], align 8
 // AMDGCN-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1
 // AMDGCN-NEXT:    [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -864,7 +981,7 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], ptr addrspace(4) align 8 [[TMP1]], i64 480, i1 false)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern3(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern3(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -876,10 +993,10 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 // AMDGCN-NEXT:    [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], ptr addrspace(5) align 8 [[TMP1]], i64 480, i1 false)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR5]]
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]]
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR5]]
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR7]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR7]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR7]]
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR7]]
 // AMDGCN-NEXT:    ret void
 //
 //.
diff --git a/clang/test/Driver/fsanitize-coverage.c b/clang/test/Driver/fsanitize-coverage.c
index dc4c393..2ef452b 100644
--- a/clang/test/Driver/fsanitize-coverage.c
+++ b/clang/test/Driver/fsanitize-coverage.c
@@ -93,6 +93,20 @@
 // CHECK-STACK-DEPTH-PC-GUARD: -fsanitize-coverage-trace-pc-guard
 // CHECK-STACK-DEPTH-PC-GUARD: -fsanitize-coverage-stack-depth
 
+// RUN: %clang --target=x86_64-linux-gnu \
+// RUN:     -fsanitize-coverage-stack-depth-callback-min=100 %s -### 2>&1 | \
+// RUN:     FileCheck %s --check-prefix=CHECK-STACK-DEPTH-CALLBACK
+// RUN: %clang --target=x86_64-linux-gnu \
+// RUN:     -fsanitize-coverage-stack-depth-callback-min=0 %s -### 2>&1 | \
+// RUN:     FileCheck %s --check-prefix=CHECK-STACK-DEPTH-CALLBACK-ZERO
+// RUN: not %clang --target=x86_64-linux-gnu \
+// RUN:     -fsanitize-coverage-stack-depth-callback-min=-10 %s -### 2>&1 | \
+// RUN:     FileCheck %s --check-prefix=CHECK-STACK-DEPTH-CALLBACK-NEGATIVE
+// CHECK-STACK-DEPTH-CALLBACK-NOT: error:
+// CHECK-STACK-DEPTH-CALLBACK: -fsanitize-coverage-stack-depth-callback-min=100
+// CHECK-STACK-DEPTH-CALLBACK-ZERO-NOT: -fsanitize-coverage-stack-depth-callback-min=0
+// CHECK-STACK-DEPTH-CALLBACK-NEGATIVE: error: invalid value '-10' in '-fsanitize-coverage-stack-depth-callback-min=-10'
+
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=trace-cmp,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TYPE-NECESSARY
 // CHECK-NO-TYPE-NECESSARY-NOT: error:
 // CHECK-NO-TYPE-NECESSARY: -fsanitize-coverage-indirect-calls
diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c
index eb72140..24d64c9 100644
--- a/clang/test/Driver/fsanitize.c
+++ b/clang/test/Driver/fsanitize.c
@@ -1,3 +1,5 @@
+// * Test -fsanitize-trap */
+
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-TRAP
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-trap=signed-integer-overflow %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-TRAP2
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-undefined-trap-on-error %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-TRAP
@@ -9,6 +11,9 @@
 // CHECK-UNDEFINED-TRAP: "-fsanitize-trap=alignment,array-bounds,bool,builtin,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,pointer-overflow,return,returns-nonnull-attribute,shift-base,shift-exponent,signed-integer-overflow,unreachable,vla-bound"
 // CHECK-UNDEFINED-TRAP2: "-fsanitize-trap=alignment,array-bounds,bool,builtin,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,pointer-overflow,return,returns-nonnull-attribute,shift-base,shift-exponent,unreachable,vla-bound"
 
+
+// * Test -fsanitize-merge *
+
 // The trailing -fsanitize-merge takes precedence
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                                                                              %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-MERGE
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                             -fsanitize-merge                                 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-MERGE
@@ -62,6 +67,59 @@
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-merge=signed-integer-overflow -fno-sanitize-merge=undefined -fsanitize-merge=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-MERGE5
 // CHECK-UNDEFINED-MERGE5: "-fsanitize-merge=alignment,null"
 
+
+// * Test -fsanitize-annotate-debug-info *
+
+// The trailing -fsanitize-annotate-debug-info takes precedence
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                                            -fsanitize-annotate-debug-info                                 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                                            -fsanitize-annotate-debug-info=undefined                       %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info                         -fsanitize-annotate-debug-info                                 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info                         -fsanitize-annotate-debug-info=undefined                       %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info=undefined               -fsanitize-annotate-debug-info                                 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info=undefined               -fsanitize-annotate-debug-info=undefined                       %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info=signed-integer-overflow -fsanitize-annotate-debug-info                                 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-annotate-debug-info=bool                       -fsanitize-annotate-debug-info=undefined                       %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                                            -fsanitize-annotate-debug-info=undefined -fsanitize-annotate-debug-info=bool %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO
+// CHECK-UNDEFINED-PSEUDO: "-fsanitize-annotate-debug-info=alignment,array-bounds,bool,builtin,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,pointer-overflow,return,returns-nonnull-attribute,shift-base,shift-exponent,signed-integer-overflow,unreachable,vla-bound"
+
+// The trailing arguments (-fsanitize-annotate-debug-info -fno-sanitize-annotate-debug-info=signed-integer-overflow) take precedence
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                                            -fsanitize-annotate-debug-info           -fno-sanitize-annotate-debug-info=signed-integer-overflow %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO2
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                                            -fsanitize-annotate-debug-info=undefined -fno-sanitize-annotate-debug-info=signed-integer-overflow %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO2
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info                         -fsanitize-annotate-debug-info           -fno-sanitize-annotate-debug-info=signed-integer-overflow %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO2
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info                         -fsanitize-annotate-debug-info=undefined -fno-sanitize-annotate-debug-info=signed-integer-overflow %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO2
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info=signed-integer-overflow -fsanitize-annotate-debug-info           -fno-sanitize-annotate-debug-info=signed-integer-overflow %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO2
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info=signed-integer-overflow -fsanitize-annotate-debug-info=undefined -fno-sanitize-annotate-debug-info=signed-integer-overflow %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO2
+// CHECK-UNDEFINED-PSEUDO2: "-fsanitize-annotate-debug-info=alignment,array-bounds,bool,builtin,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,pointer-overflow,return,returns-nonnull-attribute,shift-base,shift-exponent,unreachable,vla-bound"
+
+// The trailing -fno-sanitize-annotate-debug-info takes precedence
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                           -fno-sanitize-annotate-debug-info                                    %s -### 2>&1 | not FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO3
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                           -fno-sanitize-annotate-debug-info=undefined                          %s -### 2>&1 | not FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO3
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                           -fno-sanitize-annotate-debug-info           -fno-sanitize-annotate-debug-info=bool %s -### 2>&1 | not FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO3
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                           -fno-sanitize-annotate-debug-info=undefined -fno-sanitize-annotate-debug-info=bool %s -### 2>&1 | not FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO3
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-annotate-debug-info           -fno-sanitize-annotate-debug-info                                    %s -### 2>&1 | not FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO3
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-annotate-debug-info           -fno-sanitize-annotate-debug-info=undefined                          %s -### 2>&1 | not FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO3
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-annotate-debug-info=undefined -fno-sanitize-annotate-debug-info                                    %s -### 2>&1 | not FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO3
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-annotate-debug-info=undefined -fno-sanitize-annotate-debug-info=undefined                          %s -### 2>&1 | not FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO3
+// CHECK-UNDEFINED-PSEUDO3: "-fsanitize-annotate-debug-info"
+
+// The trailing arguments (-fsanitize-annotate-debug-info -fno-sanitize-annotate-debug-info=alignment,null) take precedence
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                                            -fsanitize-annotate-debug-info           -fno-sanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO4
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                                            -fsanitize-annotate-debug-info=undefined -fno-sanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO4
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info                         -fsanitize-annotate-debug-info           -fno-sanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO4
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info                         -fsanitize-annotate-debug-info=undefined -fno-sanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO4
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info=signed-integer-overflow -fsanitize-annotate-debug-info           -fno-sanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO4
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fno-sanitize-annotate-debug-info=signed-integer-overflow -fsanitize-annotate-debug-info=undefined -fno-sanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO4
+// CHECK-UNDEFINED-PSEUDO4: "-fsanitize-annotate-debug-info=array-bounds,bool,builtin,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,pointer-overflow,return,returns-nonnull-attribute,shift-base,shift-exponent,signed-integer-overflow,unreachable,vla-bound"
+
+// The trailing arguments (-fno-sanitize-annotate-debug-info -fsanitize-annotate-debug-info=alignment,null) take precedence
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined                                                         -fno-sanitize-annotate-debug-info=undefined -fsanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO5
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-annotate-debug-info                         -fno-sanitize-annotate-debug-info           -fsanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO5
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-annotate-debug-info                         -fno-sanitize-annotate-debug-info=undefined -fsanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO5
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-annotate-debug-info=signed-integer-overflow -fno-sanitize-annotate-debug-info           -fsanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO5
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -fsanitize-annotate-debug-info=signed-integer-overflow -fno-sanitize-annotate-debug-info=undefined -fsanitize-annotate-debug-info=alignment,null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-PSEUDO5
+// CHECK-UNDEFINED-PSEUDO5: "-fsanitize-annotate-debug-info=alignment,null"
+
+
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED
 // CHECK-UNDEFINED: "-fsanitize={{((signed-integer-overflow|integer-divide-by-zero|function|shift-base|shift-exponent|unreachable|return|vla-bound|alignment|null|pointer-overflow|float-cast-overflow|array-bounds|enum|bool|builtin|returns-nonnull-attribute|nonnull-attribute),?){18}"}}
 
diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c
index 99baa46..1235d08 100644
--- a/clang/test/Driver/fveclib.c
+++ b/clang/test/Driver/fveclib.c
@@ -38,7 +38,7 @@
 /* Verify that the correct vector library is passed to LTO flags. */
 
 // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=libmvec -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC %s
-// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC-X86"
+// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC"
 
 // RUN: %clang -### --target=powerpc64-unknown-linux-gnu -fveclib=MASSV -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-MASSV %s
 // CHECK-LTO-MASSV: "-plugin-opt=-vector-library=MASSV"
diff --git a/clang/test/Driver/mcmodel.c b/clang/test/Driver/mcmodel.c
index c6c8b54..51c2eff 100644
--- a/clang/test/Driver/mcmodel.c
+++ b/clang/test/Driver/mcmodel.c
@@ -1,5 +1,5 @@
 // RUN: not %clang -### -c --target=i686 -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=ERR-MEDIUM %s
-// RUN: %clang --target=x86_64 -### -c -mcmodel=tiny %s 2>&1 | FileCheck --check-prefix=TINY %s
+// RUN: not %clang --target=x86_64 -### -c -mcmodel=tiny %s 2>&1 | FileCheck --check-prefix=ERR-TINY %s
 // RUN: %clang --target=x86_64 -### -c -mcmodel=small %s 2>&1 | FileCheck --check-prefix=SMALL %s
 // RUN: %clang --target=x86_64 -### -S -mcmodel=kernel %s 2>&1 | FileCheck --check-prefix=KERNEL %s
 // RUN: %clang --target=x86_64 -### -c -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
diff --git a/clang/test/Driver/mips-cpus.c b/clang/test/Driver/mips-cpus.c
new file mode 100644
index 0000000..2e988e5
--- /dev/null
+++ b/clang/test/Driver/mips-cpus.c
@@ -0,0 +1,9 @@
+// Check target CPUs are correctly passed.
+
+// RUN: %clang --target=mips64 -### -c %s 2>&1 -mcpu=i6400 | FileCheck -check-prefix=MCPU-I6400 %s
+// MCPU-I6400: "-target-cpu" "i6400"
+// MCPU-I6400: "-target-feature" "+msa" "-target-feature" "-noabicalls"
+
+// RUN: %clang --target=mips64 -### -c %s 2>&1 -mcpu=i6500 | FileCheck -check-prefix=MCPU-I6500 %s
+// MCPU-I6500: "-target-cpu" "i6500"
+// MCPU-I6500: "-target-feature" "+msa" "-target-feature" "-noabicalls"
diff --git a/clang/test/FixIt/fixit.cpp b/clang/test/FixIt/fixit.cpp
index 605c2d0..3e50409 100644
--- a/clang/test/FixIt/fixit.cpp
+++ b/clang/test/FixIt/fixit.cpp
@@ -211,7 +211,7 @@ public:
 template<class T> struct Mystery;
 template<class T> typedef Mystery<T>::type getMysteriousThing() { // \
   expected-error {{function definition declared 'typedef'}} \
-  expected-warning {{implicit 'typename' is a C++20 extension}}
+  expected-warning {{missing 'typename' prior to dependent type name 'Mystery<T>::type' is a C++20 extension}}
   return Mystery<T>::get();
 }
 
diff --git a/clang/test/Modules/Inputs/shadow/A1/A1.h b/clang/test/Modules/Inputs/shadow/A1/A1.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/clang/test/Modules/Inputs/shadow/A1/A1.h
diff --git a/clang/test/Modules/Inputs/shadow/A1/module.modulemap b/clang/test/Modules/Inputs/shadow/A1/module.modulemap
index 9439a43..3a47280 100644
--- a/clang/test/Modules/Inputs/shadow/A1/module.modulemap
+++ b/clang/test/Modules/Inputs/shadow/A1/module.modulemap
@@ -2,4 +2,6 @@ module A {
   header "A.h"
 }
 
-module A1 {}
+module A1 {
+  header "A1.h"
+}
diff --git a/clang/test/Modules/Inputs/shadow/A2/A2.h b/clang/test/Modules/Inputs/shadow/A2/A2.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/clang/test/Modules/Inputs/shadow/A2/A2.h
diff --git a/clang/test/Modules/Inputs/shadow/A2/module.modulemap b/clang/test/Modules/Inputs/shadow/A2/module.modulemap
index 935d89b..9e6fe64 100644
--- a/clang/test/Modules/Inputs/shadow/A2/module.modulemap
+++ b/clang/test/Modules/Inputs/shadow/A2/module.modulemap
@@ -2,4 +2,6 @@ module A {
   header "A.h"
 }
 
-module A2 {}
+module A2 {
+  header "A2.h"
+}
diff --git a/clang/test/Modules/befriend.cppm b/clang/test/Modules/befriend.cppm
new file mode 100644
index 0000000..3a251f1
--- /dev/null
+++ b/clang/test/Modules/befriend.cppm
@@ -0,0 +1,41 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++20 %t/b.cpp -fmodule-file=a=%t/a.pcm -emit-llvm -o /dev/null -verify
+
+//--- a.cppm
+module;
+
+namespace n
+{
+
+template<typename>
+struct a {
+	template<typename T>
+	friend void aa(a<T>);
+};
+
+template<typename T>
+inline void aa(a<T>) {
+}
+
+} //namespace n
+
+export module a;
+
+namespace n {
+
+export using n::a;
+export using n::aa;
+
+}
+
+//--- b.cpp
+// expected-no-diagnostics
+import a;
+
+void b() {
+	aa(n::a<int>());
+}
diff --git a/clang/test/Modules/lazy-by-name-lookup.c b/clang/test/Modules/lazy-by-name-lookup.c
new file mode 100644
index 0000000..11a3a5c
--- /dev/null
+++ b/clang/test/Modules/lazy-by-name-lookup.c
@@ -0,0 +1,31 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -I%t \
+// RUN:   -fmodules-cache-path=%t/cache %t/tu.c -fsyntax-only -Rmodule-map \
+// RUN:   -verify
+
+//--- module.modulemap
+
+module A {
+  header "A.h"
+}
+
+module B {
+  header "B.h"
+}
+
+//--- A.h
+
+//--- B.h
+
+//--- tu.c
+
+#pragma clang __debug module_lookup A // does module map search for A
+#pragma clang __debug module_map A // A is now in the ModuleMap,
+#pragma clang __debug module_map B // expected-warning{{unknown module 'B'}}
+                                   // but B isn't.
+#include <B.h> // Now load B via header search
+
+// expected-remark@*{{parsing modulemap}}
+// expected-remark@*{{loading parsed module 'A'}}
+// expected-remark@*{{loading modulemap}}
+\ No newline at end of file
diff --git a/clang/test/Modules/shadow.m b/clang/test/Modules/shadow.m
index 44320af..c45d018 100644
--- a/clang/test/Modules/shadow.m
+++ b/clang/test/Modules/shadow.m
@@ -1,13 +1,14 @@
 // RUN: rm -rf %t
-// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs/shadow/A1 -I %S/Inputs/shadow/A2 %s -fsyntax-only 2>&1 | FileCheck %s -check-prefix=REDEFINITION
-// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fmodule-map-file=%S/Inputs/shadow/A1/module.modulemap -fmodule-map-file=%S/Inputs/shadow/A2/module.modulemap %s -fsyntax-only 2>&1 | FileCheck %s -check-prefix=REDEFINITION
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs/shadow/A1 -I %S/Inputs/shadow/A2 -I %S/Inputs/shadow %s -fsyntax-only 2>&1 | FileCheck %s -check-prefix=REDEFINITION
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fmodule-map-file=%S/Inputs/shadow/A1/module.modulemap -fmodule-map-file=%S/Inputs/shadow/A2/module.modulemap %S/Inputs/shadow %s -fsyntax-only 2>&1 | FileCheck %s -check-prefix=REDEFINITION
 // REDEFINITION: error: redefinition of module 'A'
 // REDEFINITION: note: previously defined
 
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fmodule-map-file=%S/Inputs/shadow/A1/module.modulemap -I %S/Inputs/shadow %s -verify
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -x objective-c-header %S/Inputs/shadow/A1/module.modulemap -emit-module -o %t/A.pcm -fmodule-name=A
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fmodule-map-file=%S/Inputs/shadow/A1/module.modulemap -fmodule-file=A=%t/A.pcm -I %S/Inputs/shadow %s -verify
 
-@import A1;
-@import A2;
+#import "A1/A1.h"
+#import "A2/A2.h"
 @import A;
 
 #import "A2/A.h" // expected-note {{implicitly imported}}
diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c
index 633ba468..b7765bf 100644
--- a/clang/test/Preprocessor/predefined-macros.c
+++ b/clang/test/Preprocessor/predefined-macros.c
@@ -304,11 +304,13 @@
 // RUN: %clang_cc1 %s -E -dM -o - -x hip --hipstdpar -triple x86_64-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-HIPSTDPAR
 // CHECK-HIPSTDPAR: #define __HIPSTDPAR__ 1
+// CHECK-HIPSTDPAR-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC_V1__ 1
 // CHECK-HIPSTDPAR-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1
 
 // RUN: %clang_cc1 %s -E -dM -o - -x hip --hipstdpar --hipstdpar-interpose-alloc \
 // RUN:  -triple x86_64-unknown-linux-gnu | FileCheck -match-full-lines %s \
 // RUN:  --check-prefix=CHECK-HIPSTDPAR-INTERPOSE
+// CHECK-HIPSTDPAR-INTERPOSE: #define __HIPSTDPAR_INTERPOSE_ALLOC_V1__ 1
 // CHECK-HIPSTDPAR-INTERPOSE: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1
 // CHECK-HIPSTDPAR-INTERPOSE: #define __HIPSTDPAR__ 1
 
@@ -316,4 +318,5 @@
 // RUN:  -triple amdgcn-amd-amdhsa -fcuda-is-device | FileCheck -match-full-lines \
 // RUN:  %s --check-prefix=CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG
 // CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG: #define __HIPSTDPAR__ 1
+// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC_V1__ 1
 // CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1
diff --git a/clang/test/Sema/implicit-cast.c b/clang/test/Sema/implicit-cast.c
index 088b195..4700b7d 100644
--- a/clang/test/Sema/implicit-cast.c
+++ b/clang/test/Sema/implicit-cast.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s
 
 static char *test1(int cf) {
   return cf ? "abc" : 0;
@@ -6,3 +6,8 @@ static char *test1(int cf) {
 static char *test2(int cf) {
   return cf ? 0 : "abc";
 }
+
+int baz(void) {
+  int f;
+  return ((void)0, f = 1.4f); // expected-warning {{implicit conversion from 'float' to 'int' changes value from 1.4 to 1}}
+}
diff --git a/clang/test/Sema/implicit-int-enum-conversion.c b/clang/test/Sema/implicit-int-enum-conversion.c
index 13afb5d..36717f3 100644
--- a/clang/test/Sema/implicit-int-enum-conversion.c
+++ b/clang/test/Sema/implicit-int-enum-conversion.c
@@ -50,3 +50,25 @@ enum E1 quux(void) {
   return E2_Zero;       // expected-warning {{implicit conversion from enumeration type 'enum E2' to different enumeration type 'enum E1'}} \
                            cxx-error {{cannot initialize return object of type 'enum E1' with an rvalue of type 'E2'}}
 }
+
+enum E1 comma1(void) {
+  return ((void)0, E1_One);
+}
+
+enum E1 comma2(void) {
+  enum E1 x;
+  return
+    (x = 12,  // expected-warning {{implicit conversion from 'int' to enumeration type 'enum E1' is invalid in C++}} \
+                 cxx-error {{assigning to 'enum E1' from incompatible type 'int'}}
+    E1_One);
+}
+
+enum E1 comma3(void) {
+  enum E1 x;
+  return ((void)0, foo()); // Okay, no conversion in C++
+}
+
+enum E1 comma4(void) {
+  return ((void)1, 2); // expected-warning {{implicit conversion from 'int' to enumeration type 'enum E1' is invalid in C++}} \
+                          cxx-error {{cannot initialize return object of type 'enum E1' with an rvalue of type 'int'}}
+}
diff --git a/clang/test/SemaCXX/MicrosoftCompatibility.cpp b/clang/test/SemaCXX/MicrosoftCompatibility.cpp
index a830883..b8cd22a 100644
--- a/clang/test/SemaCXX/MicrosoftCompatibility.cpp
+++ b/clang/test/SemaCXX/MicrosoftCompatibility.cpp
@@ -211,14 +211,14 @@ public:
    typedef B<U> Base2;
    typedef A<U> Base3;
 
-   A<T>::TYPE a1; // expected-warning {{implicit 'typename' is a C++20 extension}}
-   Base1::TYPE a2; // expected-warning {{implicit 'typename' is a C++20 extension}}
+   A<T>::TYPE a1; // expected-warning {{missing 'typename' prior to dependent type name 'A<T>::TYPE' is a C++20 extension}}
+   Base1::TYPE a2; // expected-warning {{missing 'typename' prior to dependent type name 'Base1::TYPE' is a C++20 extension}}
 
-   B<U>::TYPE a3; // expected-warning {{implicit 'typename' is a C++20 extension}}
-   Base2::TYPE a4; // expected-warning {{implicit 'typename' is a C++20 extension}}
+   B<U>::TYPE a3; // expected-warning {{missing 'typename' prior to dependent type name 'B<U>::TYPE' is a C++20 extension}}
+   Base2::TYPE a4; // expected-warning {{missing 'typename' prior to dependent type name 'Base2::TYPE' is a C++20 extension}}
 
-   A<U>::TYPE a5; // expected-warning {{implicit 'typename' is a C++20 extension}}
-   Base3::TYPE a6; // expected-warning {{implicit 'typename' is a C++20 extension}}
+   A<U>::TYPE a5; // expected-warning {{missing 'typename' prior to dependent type name 'A<U>::TYPE' is a C++20 extension}}
+   Base3::TYPE a6; // expected-warning {{missing 'typename' prior to dependent type name 'Base3::TYPE' is a C++20 extension}}
  };
 
 class D {
diff --git a/clang/test/SemaCXX/MicrosoftExtensions.cpp b/clang/test/SemaCXX/MicrosoftExtensions.cpp
index 7454a01..4dff2b1 100644
--- a/clang/test/SemaCXX/MicrosoftExtensions.cpp
+++ b/clang/test/SemaCXX/MicrosoftExtensions.cpp
@@ -613,7 +613,7 @@ typedef char __unaligned *aligned_type; // expected-error {{expected ';' after t
 
 namespace PR32750 {
 template<typename T> struct A {};
-template<typename T> struct B : A<A<T>> { A<T>::C::D d; }; // expected-warning {{implicit 'typename' is a C++20 extension}}
+template<typename T> struct B : A<A<T>> { A<T>::C::D d; }; // expected-warning {{missing 'typename' prior to dependent type name 'A<T>::C::D' is a C++20 extension}}
 }
 
 #endif
diff --git a/clang/test/SemaCXX/MicrosoftSuper.cpp b/clang/test/SemaCXX/MicrosoftSuper.cpp
index 94e29b2..d117b93 100644
--- a/clang/test/SemaCXX/MicrosoftSuper.cpp
+++ b/clang/test/SemaCXX/MicrosoftSuper.cpp
@@ -108,8 +108,8 @@ struct DerivedFromDependentBase : BaseTemplate<T> {
   typename __super::XXX a;
   typedef typename __super::XXX b;
 
-  __super::XXX c;         // expected-warning {{implicit 'typename' is a C++20 extension}}
-  typedef __super::XXX d; // expected-warning {{implicit 'typename' is a C++20 extension}}
+  __super::XXX c;         // expected-warning {{missing 'typename'}}
+  typedef __super::XXX d; // expected-warning {{missing 'typename'}}
 
   void foo() {
     typename __super::XXX e;
@@ -127,8 +127,8 @@ struct DerivedFromTemplateParameter : T {
   typename __super::XXX a;
   typedef typename __super::XXX b;
 
-  __super::XXX c;         // expected-warning {{implicit 'typename' is a C++20 extension}}
-  typedef __super::XXX d; // expected-warning {{implicit 'typename' is a C++20 extension}}
+  __super::XXX c;         // expected-warning {{missing 'typename'}}
+  typedef __super::XXX d; // expected-warning {{missing 'typename'}}
 
   void foo() {
     typename __super::XXX e;
diff --git a/clang/test/SemaCXX/attr-trivial-abi.cpp b/clang/test/SemaCXX/attr-trivial-abi.cpp
index e018ccd..333ab34 100644
--- a/clang/test/SemaCXX/attr-trivial-abi.cpp
+++ b/clang/test/SemaCXX/attr-trivial-abi.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11
+// RUN: %clang_cc1 -fsyntax-only -verify %s -triple x86_64-windows-msvc -std=c++11
+
 
 void __attribute__((trivial_abi)) foo(); // expected-warning {{'trivial_abi' attribute only applies to classes}}
 
@@ -10,30 +12,38 @@ class __attribute__((trivial_abi)) a { a(a &&); };
 // (And it is only trivially relocatable, currently, if it is trivial for calls.)
 // In this case, it is suppressed by an explicitly defined move constructor.
 // Similar concerns apply to later tests that have #if defined(_WIN64) && !defined(__MINGW32__)
-static_assert(!__is_trivially_relocatable(a<int>), "");
+static_assert(!__is_trivially_relocatable(a<int>), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(a<int>), "");
 #else
-static_assert(__is_trivially_relocatable(a<int>), "");
+static_assert(__is_trivially_relocatable(a<int>), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(a<int>), "");
 #endif
 
 struct [[clang::trivial_abi]] S0 {
   int a;
 };
-static_assert(__is_trivially_relocatable(S0), "");
+static_assert(__is_trivially_relocatable(S0), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S0), "");
 
 struct __attribute__((trivial_abi)) S1 {
   int a;
 };
-static_assert(__is_trivially_relocatable(S1), "");
+static_assert(__is_trivially_relocatable(S1), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S1), "");
+
 
 struct __attribute__((trivial_abi)) S3 { // expected-warning {{'trivial_abi' cannot be applied to 'S3'}} expected-note {{is polymorphic}}
   virtual void m();
 };
-static_assert(!__is_trivially_relocatable(S3), "");
+static_assert(!__is_trivially_relocatable(S3), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S3), "");
+
 
 struct S3_2 {
   virtual void m();
 } __attribute__((trivial_abi)); // expected-warning {{'trivial_abi' cannot be applied to 'S3_2'}} expected-note {{is polymorphic}}
-static_assert(!__is_trivially_relocatable(S3_2), "");
+static_assert(!__is_trivially_relocatable(S3_2), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S3_2), "");
 
 struct __attribute__((trivial_abi)) S3_3 { // expected-warning {{'trivial_abi' cannot be applied to 'S3_3'}} expected-note {{has a field of a non-trivial class type}}
   S3_3(S3_3 &&);
@@ -43,9 +53,13 @@ struct __attribute__((trivial_abi)) S3_3 { // expected-warning {{'trivial_abi' c
 // The ClangABI4OrPS4 calling convention kind passes classes in registers if the
 // copy constructor is trivial for calls *or deleted*, while other platforms do
 // not accept deleted constructors.
-static_assert(__is_trivially_relocatable(S3_3), "");
+static_assert(__is_trivially_relocatable(S3_3), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S3_3), "");
+
 #else
-static_assert(!__is_trivially_relocatable(S3_3), "");
+static_assert(!__is_trivially_relocatable(S3_3), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(S3_3), "");
+
 #endif
 
 // Diagnose invalid trivial_abi even when the type is templated because it has a non-trivial field.
@@ -54,20 +68,28 @@ struct __attribute__((trivial_abi)) S3_4 { // expected-warning {{'trivial_abi' c
   S3_4(S3_4 &&);
   S3_2 s32;
 };
-static_assert(!__is_trivially_relocatable(S3_4<int>), "");
+static_assert(!__is_trivially_relocatable(S3_4<int>), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(S3_4<int>), "");
+
 
 struct S4 {
   int a;
 };
-static_assert(__is_trivially_relocatable(S4), "");
+static_assert(__is_trivially_relocatable(S4), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S4), "");
+
 
 struct __attribute__((trivial_abi)) S5 : public virtual S4 { // expected-warning {{'trivial_abi' cannot be applied to 'S5'}} expected-note {{has a virtual base}}
 };
-static_assert(!__is_trivially_relocatable(S5), "");
+static_assert(!__is_trivially_relocatable(S5), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(S5), "");
+
 
 struct __attribute__((trivial_abi)) S9 : public S4 {
 };
-static_assert(__is_trivially_relocatable(S9), "");
+static_assert(__is_trivially_relocatable(S9), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S9), "");
+
 
 struct __attribute__((trivial_abi(1))) S8 { // expected-error {{'trivial_abi' attribute takes no arguments}}
   int a;
@@ -80,8 +102,12 @@ struct __attribute__((trivial_abi)) S10 {
 };
 
 S10<int *> p1;
-static_assert(__is_trivially_relocatable(S10<int>), "");
-static_assert(__is_trivially_relocatable(S10<S3>), "");
+static_assert(__is_trivially_relocatable(S10<int>), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S10<int>), "");
+
+static_assert(__is_trivially_relocatable(S10<S3>), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S10<S3>), "");
+
 
 template <class T>
 struct S14 {
@@ -93,15 +119,21 @@ struct __attribute__((trivial_abi)) S15 : S14<T> {
 };
 
 S15<int> s15;
-static_assert(__is_trivially_relocatable(S15<int>), "");
-static_assert(__is_trivially_relocatable(S15<S3>), "");
+static_assert(__is_trivially_relocatable(S15<int>), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S15<int>), "");
+
+static_assert(__is_trivially_relocatable(S15<S3>), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S15<S3>), "");
 
 template <class T>
 struct __attribute__((trivial_abi)) S16 {
   S14<T> a;
 };
-static_assert(__is_trivially_relocatable(S16<int>), "");
-static_assert(__is_trivially_relocatable(S16<S3>), "");
+static_assert(__is_trivially_relocatable(S16<int>), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S16<int>), "");
+
+static_assert(__is_trivially_relocatable(S16<S3>), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S16<S3>), "");
 
 S16<int> s16;
 
@@ -110,8 +142,12 @@ struct __attribute__((trivial_abi)) S17 {
 };
 
 S17<int> s17;
-static_assert(__is_trivially_relocatable(S17<int>), "");
-static_assert(__is_trivially_relocatable(S17<S3>), "");
+static_assert(__is_trivially_relocatable(S17<int>), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S17<int>), "");
+
+static_assert(__is_trivially_relocatable(S17<S3>), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S17<S3>), "");
+
 
 namespace deletedCopyMoveConstructor {
 struct __attribute__((trivial_abi)) CopyMoveDeleted { // expected-warning {{'trivial_abi' cannot be applied to 'CopyMoveDeleted'}} expected-note {{copy constructors and move constructors are all deleted}}
@@ -119,18 +155,24 @@ struct __attribute__((trivial_abi)) CopyMoveDeleted { // expected-warning {{'tri
   CopyMoveDeleted(CopyMoveDeleted &&) = delete;
 };
 #ifdef __ORBIS__
-static_assert(__is_trivially_relocatable(CopyMoveDeleted), "");
+static_assert(__is_trivially_relocatable(CopyMoveDeleted), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(CopyMoveDeleted), "");
+
 #else
-static_assert(!__is_trivially_relocatable(CopyMoveDeleted), "");
+static_assert(!__is_trivially_relocatable(CopyMoveDeleted), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(CopyMoveDeleted), "");
+
 #endif
 
 struct __attribute__((trivial_abi)) S18 { // expected-warning {{'trivial_abi' cannot be applied to 'S18'}} expected-note {{copy constructors and move constructors are all deleted}}
   CopyMoveDeleted a;
 };
 #ifdef __ORBIS__
-static_assert(__is_trivially_relocatable(S18), "");
+static_assert(__is_trivially_relocatable(S18), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S18), "");
 #else
-static_assert(!__is_trivially_relocatable(S18), "");
+static_assert(!__is_trivially_relocatable(S18), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(S18), "");
 #endif
 
 struct __attribute__((trivial_abi)) CopyDeleted {
@@ -138,25 +180,29 @@ struct __attribute__((trivial_abi)) CopyDeleted {
   CopyDeleted(CopyDeleted &&) = default;
 };
 #if defined(_WIN64) && !defined(__MINGW32__)
-static_assert(!__is_trivially_relocatable(CopyDeleted), "");
+static_assert(!__is_trivially_relocatable(CopyDeleted), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(CopyDeleted), "");
+
 #else
-static_assert(__is_trivially_relocatable(CopyDeleted), "");
+static_assert(__is_trivially_relocatable(CopyDeleted), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(CopyDeleted), "");
 #endif
 
 struct __attribute__((trivial_abi)) MoveDeleted {
   MoveDeleted(const MoveDeleted &) = default;
   MoveDeleted(MoveDeleted &&) = delete;
 };
-static_assert(__is_trivially_relocatable(MoveDeleted), "");
-
+static_assert(__is_trivially_relocatable(MoveDeleted), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(MoveDeleted), "");
 struct __attribute__((trivial_abi)) S19 { // expected-warning {{'trivial_abi' cannot be applied to 'S19'}} expected-note {{copy constructors and move constructors are all deleted}}
   CopyDeleted a;
   MoveDeleted b;
 };
 #ifdef __ORBIS__
-static_assert(__is_trivially_relocatable(S19), "");
-#else
-static_assert(!__is_trivially_relocatable(S19), "");
+static_assert(__is_trivially_relocatable(S19), ""); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S19), "");
+static_assert(!__is_trivially_relocatable(S19), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(S19), "");
 #endif
 
 // This is fine since the move constructor isn't deleted.
@@ -164,8 +210,12 @@ struct __attribute__((trivial_abi)) S20 {
   int &&a; // a member of rvalue reference type deletes the copy constructor.
 };
 #if defined(_WIN64) && !defined(__MINGW32__)
-static_assert(!__is_trivially_relocatable(S20), "");
+static_assert(!__is_trivially_relocatable(S20), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(S20), "");
+
 #else
-static_assert(__is_trivially_relocatable(S20), "");
+static_assert(__is_trivially_relocatable(S20), ""); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(S20), "");
+
 #endif
 } // namespace deletedCopyMoveConstructor
diff --git a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
index c38f888..1efed72 100644
--- a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
+++ b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
@@ -48,3 +48,15 @@ concept is_foo_concept = __is_same(foo::bar, T);
 // expected-error@-1 {{'bar' is a private member of 'GH131530::foo'}}
 
 }
+
+namespace GH138820 {
+int a;
+template<typename T>
+concept atomicish = requires() {
+  {    // expected-note {{to match this '{'}}
+    a
+   ... // expected-error {{expected '}'}}
+  };
+};
+atomicish<int> f(); // expected-error {{expected 'auto' or 'decltype(auto)' after concept name}}
+} // namespace GH138820
diff --git a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
index eafadb0..57a48fa 100644
--- a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
+++ b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
@@ -412,6 +412,26 @@ namespace dependent_static_var_template {
   }
 
   int cf() { return F<int>(); }
+
+#ifdef CPP1Y
+  namespace GH55872 {
+    struct s {
+      template<typename T>
+      static CONST auto f = [] { return T::template g<s>; };
+      // expected-note@-1 {{in instantiation of static data member 'dependent_static_var_template::GH55872::t::g' requested here}}
+      // expected-note@-2 {{while substituting into a lambda expression here}}
+    };
+
+    struct t {
+      template<typename T>
+      static CONST auto g = [] { return T::template f<t>; };
+      // expected-error@-1 {{the type of variable template specialization 'f<dependent_static_var_template::GH55872::t>' declared with deduced type 'const auto' depends on itself}}
+      // expected-note@-2 {{while substituting into a lambda expression here}}
+    };
+
+    void test() { s::f<t>()(); } // expected-note {{in instantiation of static data member 'dependent_static_var_template::GH55872::s::f' requested here}}
+  }
+#endif
 }
 
 #ifndef PRECXX11
diff --git a/clang/test/SemaCXX/cxx1y-variable-templates_top_level.cpp b/clang/test/SemaCXX/cxx1y-variable-templates_top_level.cpp
index 6fc2032..1fe0ce9 100644
--- a/clang/test/SemaCXX/cxx1y-variable-templates_top_level.cpp
+++ b/clang/test/SemaCXX/cxx1y-variable-templates_top_level.cpp
@@ -492,4 +492,21 @@ static_assert(C<int, 0,1,2,3,4>::VALUEARRAY[3] == 3, "");
 static_assert(C<int, 0,1,2,3,4>::VALUEARRAY[0] == 0, "");
 
 }
+
+namespace appear_in_its_own_init {
+template <class T>
+auto GH51347 = GH51347<T>; // expected-error {{variable template 'GH51347' declared with deduced type 'auto' cannot appear in its own initializer}}
+
+template <class T, class... Ts>
+auto a = [] {
+  using U = T;
+  a<U, Ts...>; // expected-error {{variable template 'a' declared with deduced type 'auto' cannot appear in its own initializer}}
+};
+
+template <int...> int b;
+template <int I>
+auto b<I, I * 2, 5> = b<I, I * 2, 5l>; // expected-error {{variable template partial specialization 'b<I, I * 2, 5>' declared with deduced type 'auto' cannot appear in its own initializer}}
+template <> auto b<0, 0, 0> = b<0, 0, 0>; // expected-error {{variable template explicit specialization 'b<0, 0, 0>' declared with deduced type 'auto' cannot appear in its own initializer}}
+}
+
 #endif
diff --git a/clang/test/SemaCXX/gh138775.cpp b/clang/test/SemaCXX/gh138775.cpp
new file mode 100644
index 0000000..854e25f
--- /dev/null
+++ b/clang/test/SemaCXX/gh138775.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify=cxx17 %s
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify=pre-cxx20-compat -Wpre-c++20-compat %s
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify=cxx20-compat -Wc++20-compat %s
+// cxx20-compat-no-diagnostics
+
+// cxx17-error@+4 {{unknown type name 'consteval'; did you mean 'constexpr'}}
+// cxx17-warning@+3 {{missing 'typename' prior to dependent type name 'T::type' is a C++20 extension}}
+// pre-cxx20-compat-warning@+2 {{'consteval' specifier is incompatible with C++ standards before C++20}}
+// pre-cxx20-compat-warning@+1 {{missing 'typename' prior to dependent type name 'T::type' is incompatible with C++ standards before C++20}}
+template<typename T> consteval T::type f();
+
+// cxx17-error@+2 {{unknown type name 'constinit'}}
+// pre-cxx20-compat-warning@+1 {{'constinit' specifier is incompatible with C++ standards before C++20}}
+constinit int x = 4;
diff --git a/clang/test/SemaCXX/ptrauth-triviality.cpp b/clang/test/SemaCXX/ptrauth-triviality.cpp
index ce6e1a7..785e83a 100644
--- a/clang/test/SemaCXX/ptrauth-triviality.cpp
+++ b/clang/test/SemaCXX/ptrauth-triviality.cpp
@@ -1,6 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-apple-ios -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
-// expected-no-diagnostics
 
 #define AQ __ptrauth(1,1,50)
 #define IQ __ptrauth(1,0,50)
@@ -24,7 +23,8 @@ static_assert(!__is_trivially_constructible(S1, const S1&));
 static_assert(!__is_trivially_assignable(S1, const S1&));
 static_assert(__is_trivially_destructible(S1));
 static_assert(!__is_trivially_copyable(S1));
-static_assert(!__is_trivially_relocatable(S1));
+static_assert(!__is_trivially_relocatable(S1)); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(S1));
 static_assert(!__is_trivially_equality_comparable(S1));
 
 static_assert(__is_trivially_constructible(Holder<S1>));
@@ -32,7 +32,8 @@ static_assert(!__is_trivially_constructible(Holder<S1>, const Holder<S1>&));
 static_assert(!__is_trivially_assignable(Holder<S1>, const Holder<S1>&));
 static_assert(__is_trivially_destructible(Holder<S1>));
 static_assert(!__is_trivially_copyable(Holder<S1>));
-static_assert(!__is_trivially_relocatable(Holder<S1>));
+static_assert(!__is_trivially_relocatable(Holder<S1>)); // expected-warning{{deprecated}}
+static_assert(!__builtin_is_cpp_trivially_relocatable(Holder<S1>));
 static_assert(!__is_trivially_equality_comparable(Holder<S1>));
 
 struct S2 {
@@ -45,7 +46,8 @@ static_assert(__is_trivially_constructible(S2, const S2&));
 static_assert(__is_trivially_assignable(S2, const S2&));
 static_assert(__is_trivially_destructible(S2));
 static_assert(__is_trivially_copyable(S2));
-static_assert(__is_trivially_relocatable(S2));
+static_assert(__is_trivially_relocatable(S2)); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(S2));
 static_assert(__is_trivially_equality_comparable(S2));
 
 static_assert(__is_trivially_constructible(Holder<S2>));
@@ -53,7 +55,8 @@ static_assert(__is_trivially_constructible(Holder<S2>, const Holder<S2>&));
 static_assert(__is_trivially_assignable(Holder<S2>, const Holder<S2>&));
 static_assert(__is_trivially_destructible(Holder<S2>));
 static_assert(__is_trivially_copyable(Holder<S2>));
-static_assert(__is_trivially_relocatable(Holder<S2>));
+static_assert(__is_trivially_relocatable(Holder<S2>)); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(Holder<S2>));
 static_assert(__is_trivially_equality_comparable(Holder<S2>));
 
 struct AA S3 {
@@ -67,15 +70,19 @@ static_assert(!__is_trivially_constructible(S3, const S3&));
 static_assert(!__is_trivially_assignable(S3, const S3&));
 static_assert(__is_trivially_destructible(S3));
 static_assert(!__is_trivially_copyable(S3));
-static_assert(!__is_trivially_relocatable(S3));
+static_assert(!__is_trivially_relocatable(S3)); // expected-warning{{deprecated}}
+//FIXME
+static_assert(__builtin_is_cpp_trivially_relocatable(S3));
 static_assert(!__is_trivially_equality_comparable(S3));
 
+
 static_assert(!__is_trivially_constructible(Holder<S3>));
 static_assert(!__is_trivially_constructible(Holder<S3>, const Holder<S3>&));
 static_assert(!__is_trivially_assignable(Holder<S3>, const Holder<S3>&));
 static_assert(__is_trivially_destructible(Holder<S3>));
 static_assert(!__is_trivially_copyable(Holder<S3>));
-static_assert(__is_trivially_relocatable(Holder<S3>));
+static_assert(__is_trivially_relocatable(Holder<S3>)); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(Holder<S3>));
 static_assert(!__is_trivially_equality_comparable(Holder<S3>));
 
 struct IA S4 {
@@ -89,7 +96,9 @@ static_assert(!__is_trivially_constructible(S4, const S4&));
 static_assert(!__is_trivially_assignable(S4, const S4&));
 static_assert(__is_trivially_destructible(S4));
 static_assert(!__is_trivially_copyable(S4));
-static_assert(!__is_trivially_relocatable(S4));
+static_assert(!__is_trivially_relocatable(S4)); // expected-warning{{deprecated}}
+//FIXME
+static_assert(__builtin_is_cpp_trivially_relocatable(S4));
 static_assert(!__is_trivially_equality_comparable(S4));
 
 static_assert(!__is_trivially_constructible(Holder<S4>));
@@ -97,7 +106,8 @@ static_assert(!__is_trivially_constructible(Holder<S4>, const Holder<S4>&));
 static_assert(!__is_trivially_assignable(Holder<S4>, const Holder<S4>&));
 static_assert(__is_trivially_destructible(Holder<S4>));
 static_assert(!__is_trivially_copyable(Holder<S4>));
-static_assert(__is_trivially_relocatable(Holder<S4>));
+static_assert(__is_trivially_relocatable(Holder<S4>)); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(Holder<S4>));
 static_assert(!__is_trivially_equality_comparable(Holder<S4>));
 
 struct PA S5 {
@@ -111,7 +121,9 @@ static_assert(!__is_trivially_constructible(S5, const S5&));
 static_assert(!__is_trivially_assignable(S5, const S5&));
 static_assert(__is_trivially_destructible(S5));
 static_assert(!__is_trivially_copyable(S5));
-static_assert(!__is_trivially_relocatable(S5));
+static_assert(!__is_trivially_relocatable(S5)); // expected-warning{{deprecated}}
+//FIXME
+static_assert(__builtin_is_cpp_trivially_relocatable(S5));
 static_assert(!__is_trivially_equality_comparable(S5));
 
 static_assert(!__is_trivially_constructible(Holder<S5>));
@@ -119,5 +131,6 @@ static_assert(!__is_trivially_constructible(Holder<S5>, const Holder<S5>&));
 static_assert(!__is_trivially_assignable(Holder<S5>, const Holder<S5>&));
 static_assert(__is_trivially_destructible(Holder<S5>));
 static_assert(!__is_trivially_copyable(Holder<S5>));
-static_assert(__is_trivially_relocatable(Holder<S5>));
+static_assert(__is_trivially_relocatable(Holder<S5>)); // expected-warning{{deprecated}}
+static_assert(__builtin_is_cpp_trivially_relocatable(Holder<S5>));
 static_assert(!__is_trivially_equality_comparable(Holder<S5>));
diff --git a/clang/test/SemaCXX/rounding-math-crash.cpp b/clang/test/SemaCXX/rounding-math-crash.cpp
index 2a09b02..f9c5ada 100644
--- a/clang/test/SemaCXX/rounding-math-crash.cpp
+++ b/clang/test/SemaCXX/rounding-math-crash.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-linux -fsyntax-only -frounding-math -verify %s
 
 template <class b> b::a() {}
-// expected-warning@-1 {{implicit 'typename' is a C++20 extension}}
+// expected-warning@-1 {{missing 'typename' prior to dependent type name 'b::a' is a C++20 extension}}
 // expected-error@-2 {{expected unqualified-id}}
diff --git a/clang/test/SemaCXX/type-traits-nonobject.cpp b/clang/test/SemaCXX/type-traits-nonobject.cpp
index 5f7c20c..1763d73 100644
--- a/clang/test/SemaCXX/type-traits-nonobject.cpp
+++ b/clang/test/SemaCXX/type-traits-nonobject.cpp
@@ -1,8 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s
 
-// expected-no-diagnostics
-
 static_assert(!__is_pod(void), "");
 static_assert(!__is_pod(int&), "");
 static_assert(!__is_pod(int()), "");
@@ -13,7 +11,13 @@ static_assert(!__is_trivially_copyable(int&), "");
 static_assert(!__is_trivially_copyable(int()), "");
 static_assert(!__is_trivially_copyable(int()&), "");
 
-static_assert(!__is_trivially_relocatable(void), "");
-static_assert(!__is_trivially_relocatable(int&), "");
-static_assert(!__is_trivially_relocatable(int()), "");
-static_assert(!__is_trivially_relocatable(int()&), "");
+static_assert(!__is_trivially_relocatable(void), ""); // expected-warning{{deprecated}}
+static_assert(!__is_trivially_relocatable(int&), ""); // expected-warning{{deprecated}}
+static_assert(!__is_trivially_relocatable(int()), ""); // expected-warning{{deprecated}}
+static_assert(!__is_trivially_relocatable(int()&), ""); // expected-warning{{deprecated}}
+
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(void), "");
+static_assert(!__builtin_is_cpp_trivially_relocatable(int&), "");
+static_assert(!__builtin_is_cpp_trivially_relocatable(int()), "");
+static_assert(!__builtin_is_cpp_trivially_relocatable(int()&), "");
diff --git a/clang/test/SemaCXX/unknown-type-name.cpp b/clang/test/SemaCXX/unknown-type-name.cpp
index 602f8f9..9ce8b69 100644
--- a/clang/test/SemaCXX/unknown-type-name.cpp
+++ b/clang/test/SemaCXX/unknown-type-name.cpp
@@ -36,15 +36,15 @@ struct A {
 
   static int n;
   static type m;
-  static int h(T::type, int); // expected-warning{{implicit 'typename' is a C++20 extension}}
-  static int h(T::type x, char); // expected-warning{{implicit 'typename' is a C++20 extension}}
+  static int h(T::type, int); // expected-warning{{missing 'typename'}}
+  static int h(T::type x, char); // expected-warning{{missing 'typename'}}
 };
 
 template<typename T>
-A<T>::type g(T t) { return t; } // expected-warning{{implicit 'typename' is a C++20 extension}}
+A<T>::type g(T t) { return t; } // expected-warning{{missing 'typename'}}
 
 template<typename T>
-A<T>::type A<T>::f() { return type(); } // expected-warning{{implicit 'typename' is a C++20 extension}}
+A<T>::type A<T>::f() { return type(); } // expected-warning{{missing 'typename'}}
 
 template<typename T>
 void f(T::type) { } // expected-error{{missing 'typename'}}
@@ -84,11 +84,11 @@ int *test(UnknownType *fool) { return 0; } // expected-error{{unknown type name
 
 template<typename T> int A<T>::n(T::value); // ok
 template<typename T>
-A<T>::type // expected-warning {{implicit 'typename' is a C++20 extension}}
+A<T>::type // expected-warning {{missing 'typename'}}
 A<T>::m(T::value, 0); // ok
 
-template<typename T> int A<T>::h(T::type, int) {} // expected-warning{{implicit 'typename' is a C++20 extension}}
-template<typename T> int A<T>::h(T::type x, char) {} // expected-warning{{implicit 'typename' is a C++20 extension}}
+template<typename T> int A<T>::h(T::type, int) {} // expected-warning{{missing 'typename'}}
+template<typename T> int A<T>::h(T::type x, char) {} // expected-warning{{missing 'typename'}}
 
 template<typename T> int h(T::type, int); // expected-error{{missing 'typename'}}
 template<typename T> int h(T::type x, char); // expected-error{{missing 'typename'}}
@@ -117,4 +117,4 @@ template<typename T> int i(T::type, int());
 //        a fix-it to add 'typename A<T>::type'
 template<typename T>
 A<T>::g() { } // expected-error{{expected unqualified-id}}
-// expected-warning@-1{{implicit 'typename' is a C++20 extension}}
+// expected-warning@-1{{missing 'typename'}}
diff --git a/clang/test/SemaTemplate/instantiate-var-template.cpp b/clang/test/SemaTemplate/instantiate-var-template.cpp
index 60d3bd3..50b7219 100644
--- a/clang/test/SemaTemplate/instantiate-var-template.cpp
+++ b/clang/test/SemaTemplate/instantiate-var-template.cpp
@@ -47,3 +47,14 @@ namespace InvalidInsertPos {
   template<> int v<int, 0>;
   int k = v<int, 500>;
 }
+
+namespace GH97881_comment {
+  template <bool B>
+  auto g = sizeof(g<!B>);
+  // expected-error@-1 {{the type of variable template specialization 'g<false>'}}
+  // expected-note@-2 {{in instantiation of variable template specialization 'GH97881_comment::g'}}
+
+  void test() {
+    (void)sizeof(g<false>); // expected-note {{in instantiation of variable template specialization 'GH97881_comment::g'}}
+  }
+}
diff --git a/clang/test/SemaTemplate/typename-specifier-3.cpp b/clang/test/SemaTemplate/typename-specifier-3.cpp
index cdd065c..6e09012 100644
--- a/clang/test/SemaTemplate/typename-specifier-3.cpp
+++ b/clang/test/SemaTemplate/typename-specifier-3.cpp
@@ -28,7 +28,7 @@ namespace PR12884_original {
       typedef int arg;
     };
     struct C {
-      typedef B::X<typename B::arg> x; // precxx17-warning{{missing 'typename' prior to dependent type name 'B::X'; implicit 'typename' is a C++20 extension}}
+      typedef B::X<typename B::arg> x; // precxx17-warning{{missing 'typename' prior to dependent type name 'B::X' is a C++20 extension}}
     };
   };
 
diff --git a/clang/tools/cir-opt/cir-opt.cpp b/clang/tools/cir-opt/cir-opt.cpp
index e50fa70..0e20b97 100644
--- a/clang/tools/cir-opt/cir-opt.cpp
+++ b/clang/tools/cir-opt/cir-opt.cpp
@@ -37,6 +37,9 @@ int main(int argc, char **argv) {
   ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> {
     return mlir::createCIRCanonicalizePass();
   });
+  ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> {
+    return mlir::createCIRSimplifyPass();
+  });
 
   mlir::PassPipelineRegistration<CIRToLLVMPipelineOptions> pipeline(
       "cir-to-llvm", "",
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 0fb64ce..bcb2b6f 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -754,6 +754,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsNonTemplateAngleBrackets) {
   ASSERT_EQ(Tokens.size(), 27u) << Tokens;
   EXPECT_TOKEN(Tokens[7], tok::less, TT_BinaryOperator);
   EXPECT_TOKEN(Tokens[20], tok::greater, TT_BinaryOperator);
+
+  Tokens = annotate("bool foo = a < b && (c * d) > e;");
+  ASSERT_EQ(Tokens.size(), 16u) << Tokens;
+  EXPECT_TOKEN(Tokens[4], tok::less, TT_BinaryOperator);
+  EXPECT_TOKEN(Tokens[6], tok::ampamp, TT_BinaryOperator);
+  EXPECT_TOKEN(Tokens[9], tok::star, TT_BinaryOperator);
+  EXPECT_TOKEN(Tokens[12], tok::greater, TT_BinaryOperator);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsTemplateTemplateParameters) {
@@ -3106,6 +3113,19 @@ TEST_F(TokenAnnotatorTest, CSharpNullableTypes) {
   ASSERT_EQ(Tokens.size(), 4u) << Tokens;
   EXPECT_TOKEN(Tokens[1], tok::question, TT_CSharpNullable);
 
+  Tokens = annotate("{\n"
+                    "  int? a;\n"
+                    "  if (b is int?)\n"
+                    "    f();\n"
+                    "  var foo = A<Foo?>();\n"
+                    "}",
+                    Style);
+  ASSERT_EQ(Tokens.size(), 29u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::question, TT_CSharpNullable);
+  EXPECT_TOKEN(Tokens[10], tok::question, TT_CSharpNullable);
+  EXPECT_TOKEN(Tokens[20], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[22], tok::question, TT_CSharpNullable);
+
   Tokens = annotate("cond? id : id2", Style);
   ASSERT_EQ(Tokens.size(), 6u) << Tokens;
   EXPECT_TOKEN(Tokens[1], tok::question, TT_ConditionalExpr);
@@ -4033,6 +4053,12 @@ TEST_F(TokenAnnotatorTest, UserDefinedLiteral) {
   EXPECT_EQ(Tokens[3]->TokenText, "2_$");
 }
 
+TEST_F(TokenAnnotatorTest, EnumColonInTypedef) {
+  auto Tokens = annotate("typedef enum : int {} foo;");
+  ASSERT_EQ(Tokens.size(), 9u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::colon, TT_Unknown); // Not TT_InheritanceColon.
+}
+
 } // namespace
 } // namespace format
 } // namespace clang
diff --git a/compiler-rt/lib/lsan/lsan_fuchsia.cpp b/compiler-rt/lib/lsan/lsan_fuchsia.cpp
index ba59bc9..bb5de89 100644
--- a/compiler-rt/lib/lsan/lsan_fuchsia.cpp
+++ b/compiler-rt/lib/lsan/lsan_fuchsia.cpp
@@ -21,6 +21,11 @@
 
 using namespace __lsan;
 
+namespace __sanitizer {
+// LSan doesn't need to do anything else special in the startup hook.
+void EarlySanitizerInit() {}
+}  // namespace __sanitizer
+
 namespace __lsan {
 
 void LsanOnDeadlySignal(int signo, void *siginfo, void *context) {}
diff --git a/compiler-rt/test/ubsan_minimal/TestCases/override-callback.c b/compiler-rt/test/ubsan_minimal/TestCases/override-callback.c
index 8c3b4db..9d326ff 100644
--- a/compiler-rt/test/ubsan_minimal/TestCases/override-callback.c
+++ b/compiler-rt/test/ubsan_minimal/TestCases/override-callback.c
@@ -1,5 +1,6 @@
-// RUN: %clang -fsanitize=implicit-integer-sign-change                           %s -o %t &&             %run %t 2>&1 | FileCheck %s
-// RUN: %clang -fsanitize=implicit-integer-sign-change -fno-sanitize-recover=all %s -o %t && not --crash %run %t 2>&1 | FileCheck %s --check-prefixes=FATAL
+// RUN: %clang -fsanitize=implicit-integer-sign-change                                        %s -o %t &&             %run %t 2>&1 | FileCheck %s
+// RUN: %clang -fsanitize=implicit-integer-sign-change -fno-sanitize-recover=all              %s -o %t && not --crash %run %t 2>&1 | FileCheck %s
+// RUN: %clang -fsanitize=implicit-integer-sign-change -fno-sanitize-recover=all -DOVERRIDE=1 %s -o %t && not --crash %run %t 2>&1 | FileCheck %s --check-prefixes=FATAL
 
 #include <stdint.h>
 #include <stdio.h>
@@ -11,9 +12,11 @@ void __ubsan_report_error(const char *kind, uintptr_t caller) {
   fprintf(stderr, "CUSTOM_CALLBACK: %s\n", kind);
 }
 
+#if OVERRIDE
 void __ubsan_report_error_fatal(const char *kind, uintptr_t caller) {
   fprintf(stderr, "FATAL_CALLBACK: %s\n", kind);
 }
+#endif
 
 int main(int argc, const char **argv) {
   int32_t t0 = (~((uint32_t)0));
diff --git a/flang/docs/OpenMPSupport.md b/flang/docs/OpenMPSupport.md
index 2d4b9dd..5877238 100644
--- a/flang/docs/OpenMPSupport.md
+++ b/flang/docs/OpenMPSupport.md
@@ -64,4 +64,4 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
 | target teams distribute parallel loop simd construct       | P      | device, reduction, dist_schedule and linear clauses are not supported |
 
 ## OpenMP 3.1, OpenMP 2.5, OpenMP 1.1
-All features except a few corner cases in atomic (complex type, different but compatible types in lhs and rhs), threadprivate (character type) constructs/clauses are supported.
+All features except a few corner cases in atomic (complex type, different but compatible types in lhs and rhs) are supported.
diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h
index 8288b81..e05625a 100644
--- a/flang/include/flang/Lower/ConvertVariable.h
+++ b/flang/include/flang/Lower/ConvertVariable.h
@@ -134,10 +134,11 @@ mlir::Value genInitialDataTarget(Fortran::lower::AbstractConverter &,
                                  const SomeExpr &initialTarget,
                                  bool couldBeInEquivalence = false);
 
-/// Call \p genInit to generate code inside \p global initializer region.
-void createGlobalInitialization(
-    fir::FirOpBuilder &builder, fir::GlobalOp global,
-    std::function<void(fir::FirOpBuilder &)> genInit);
+/// Create the global op and its init if it has one
+fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
+                           const Fortran::lower::pft::Variable &var,
+                           llvm::StringRef globalName, mlir::StringAttr linkage,
+                           cuf::DataAttributeAttr dataAttr = {});
 
 /// Generate address \p addr inside an initializer.
 fir::ExtendedValue
diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
index 3ebc2495..2845080 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
@@ -200,4 +200,23 @@ def fir_OpenMPSafeTempArrayCopyAttr : fir_Attr<"OpenMPSafeTempArrayCopy"> {
   }];
 }
 
+def LocalitySpecTypeLocal : I32EnumAttrCase<"Local", 0, "local">;
+def LocalitySpecTypeLocalInit
+    : I32EnumAttrCase<"LocalInit", 1, "local_init">;
+
+def LocalitySpecifierType : I32EnumAttr<
+    "LocalitySpecifierType",
+    "Type of a locality specifier", [
+      LocalitySpecTypeLocal,
+      LocalitySpecTypeLocalInit
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::fir";
+}
+
+def LocalitySpecifierTypeAttr : EnumAttr<FIROpsDialect, LocalitySpecifierType,
+                                                "locality_specifier_type"> {
+  let assemblyFormat = "`{` `type` `=` $value `}`";
+}
+
 #endif // FIR_DIALECT_FIR_ATTRS
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 0ba9856..acc0c69 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3485,6 +3485,137 @@ def fir_BoxTotalElementsOp
   let hasCanonicalizer = 1;
 }
 
+def YieldOp : fir_Op<"yield",
+    [Pure, ReturnLike, Terminator,
+     ParentOneOf<["LocalitySpecifierOp"]>]> {
+  let summary = "loop yield and termination operation";
+  let description = [{
+    "fir.yield" yields SSA values from a fir dialect op region and
+    terminates the region. The semantics of how the values are yielded is
+    defined by the parent operation.
+  }];
+
+  let arguments = (ins Variadic<AnyType>:$results);
+
+  let builders = [
+    OpBuilder<(ins), [{ build($_builder, $_state, {}); }]>
+  ];
+
+  let assemblyFormat = "( `(` $results^ `:` type($results) `)` )? attr-dict";
+}
+
+def fir_LocalitySpecifierOp : fir_Op<"local", [IsolatedFromAbove]> {
+  let summary = "Provides declaration of local and local_init logic.";
+  let description = [{
+    This operation provides a declaration of how to implement the
+    localization of a variable. The dialect users should provide
+    which type should be allocated for this variable. The allocated (usually by
+    alloca) variable is passed to the initialization region which does everything
+    else (e.g. initialization of Fortran runtime descriptors). Information about
+    how to initialize the copy from the original item should be given in the
+    copy region, and if needed, how to deallocate memory (allocated by the
+    initialization region) in the dealloc region.
+
+    Examples:
+
+    * `local(x)` would not need any regions because no initialization is
+      required by the standard for i32 variables and this is not local_init.
+    ```
+    fir.local {type = local} @x.localizer : i32
+    ```
+
+    * `local_init(x)` would be emitted as:
+    ```
+    fir.local {type = local_init} @x.localizer : i32 copy {
+    ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>):
+    // %arg0 is the original host variable.
+    // %arg1 represents the memory allocated for this private variable.
+    ... copy from host to the localized clone ....
+    fir.yield(%arg1 : !fir.ref<i32>)
+    }
+    ```
+
+    * `local(x)` for "allocatables" would be emitted as:
+    ```
+    fir.local {type = local} @x.localizer : !some.type init {
+    ^bb0(%arg0: !fir.ref<!some.type>, %arg1: !fir.ref<!some.type>):
+    // initialize %arg1, using %arg0 as a mold for allocations.
+    // For example if %arg0 is a heap allocated array with a runtime determined
+    // length and !some.type is a runtime type descriptor, the init region
+    // will read the array length from %arg0, and heap allocate an array of the
+    // right length and initialize %arg1 to contain the array allocation and
+    // length.
+    fir.yield(%arg1 : !fir.ref<!some.type>)
+    } dealloc {
+    ^bb0(%arg0: !fir.ref<!some.type>):
+    // ... deallocate memory allocated by the init region...
+    // In the example above, this will free the heap allocated array data.
+    fir.yield
+    }
+    ```
+
+    There are no restrictions on the body except for:
+    - The `dealloc` regions has a single argument.
+    - The `init` & `copy` regions have 2 arguments.
+    - All three regions are terminated by `fir.yield` ops.
+    The above restrictions and other obvious restrictions (e.g. verifying the
+    type of yielded values) are verified by the custom op verifier. The actual
+    contents of the blocks inside all regions are not verified.
+
+    Instances of this op would then be used by ops that model directives that
+    accept data-sharing attribute clauses.
+
+    The `sym_name` attribute provides a symbol by which the privatizer op can be
+    referenced by other dialect ops.
+
+    The `type` attribute is the type of the value being localized. This type
+    will be implicitly allocated in MLIR->LLVMIR conversion and passed as the
+    second argument to the init region. Therefore the type of arguments to
+    the regions should be a type which represents a pointer to `type`.
+
+    The `locality_specifier_type` attribute specifies whether the localized
+    corresponds to a `local` or a `local_init` specifier.
+  }];
+
+  let arguments = (ins SymbolNameAttr:$sym_name,
+                       TypeAttrOf<AnyType>:$type,
+                       LocalitySpecifierTypeAttr:$locality_specifier_type);
+
+  let regions = (region AnyRegion:$init_region,
+                        AnyRegion:$copy_region,
+                        AnyRegion:$dealloc_region);
+
+  let assemblyFormat = [{
+    $locality_specifier_type $sym_name `:` $type
+      (`init` $init_region^)?
+      (`copy` $copy_region^)?
+      (`dealloc` $dealloc_region^)?
+      attr-dict
+  }];
+
+  let builders = [
+    OpBuilder<(ins CArg<"mlir::TypeRange">:$result,
+                   CArg<"mlir::StringAttr">:$sym_name,
+                   CArg<"mlir::TypeAttr">:$type)>
+  ];
+
+  let extraClassDeclaration = [{
+    /// Get the type for arguments to nested regions. This should
+    /// generally be either the same as getType() or some pointer
+    /// type (pointing to the type allocated by this op).
+    /// This method will return Type{nullptr} if there are no nested
+    /// regions.
+    mlir::Type getArgType() {
+      for (mlir::Region *region : getRegions())
+        for (mlir::Type ty : region->getArgumentTypes())
+          return ty;
+      return nullptr;
+    }
+  }];
+
+  let hasRegionVerifier = 1;
+}
+
 def fir_DoConcurrentOp : fir_Op<"do_concurrent",
     [SingleBlock, AutomaticAllocationScope]> {
   let summary = "do concurrent loop wrapper";
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index c0cf90c..a3721bc 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -483,6 +483,11 @@ public:
   NODE(parser, OldParameterStmt)
   NODE(parser, OmpTypeSpecifier)
   NODE(parser, OmpTypeNameList)
+  NODE(parser, OmpAdjustArgsClause)
+  NODE(OmpAdjustArgsClause, OmpAdjustOp)
+  NODE_ENUM(OmpAdjustArgsClause::OmpAdjustOp, Value)
+  NODE(parser, OmpAppendArgsClause)
+  NODE(OmpAppendArgsClause, OmpAppendOp)
   NODE(parser, OmpLocator)
   NODE(parser, OmpLocatorList)
   NODE(parser, OmpReductionSpecifier)
@@ -703,6 +708,7 @@ public:
   NODE(parser, OpenMPCriticalConstruct)
   NODE(parser, OpenMPDeclarativeAllocate)
   NODE(parser, OpenMPDeclarativeConstruct)
+  NODE(parser, OmpDeclareVariantDirective)
   NODE(parser, OpenMPDeclareReductionConstruct)
   NODE(parser, OpenMPDeclareSimdConstruct)
   NODE(parser, OpenMPDeclareTargetConstruct)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index e39ecc1..a0d7a79 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4013,6 +4013,15 @@ struct OmpAbsentClause {
   WRAPPER_CLASS_BOILERPLATE(OmpAbsentClause, OmpDirectiveList);
 };
 
+struct OmpAdjustArgsClause {
+  TUPLE_CLASS_BOILERPLATE(OmpAdjustArgsClause);
+  struct OmpAdjustOp {
+    ENUM_CLASS(Value, Nothing, Need_Device_Ptr)
+    WRAPPER_CLASS_BOILERPLATE(OmpAdjustOp, Value);
+  };
+  std::tuple<OmpAdjustOp, OmpObjectList> t;
+};
+
 // Ref: [5.0:135-140], [5.1:161-166], [5.2:264-265]
 //
 // affinity-clause ->
@@ -4056,6 +4065,13 @@ struct OmpAllocateClause {
   std::tuple<MODIFIERS(), OmpObjectList> t;
 };
 
+struct OmpAppendArgsClause {
+  struct OmpAppendOp {
+    WRAPPER_CLASS_BOILERPLATE(OmpAppendOp, std::list<OmpInteropType>);
+  };
+  WRAPPER_CLASS_BOILERPLATE(OmpAppendArgsClause, std::list<OmpAppendOp>);
+};
+
 // Ref: [5.2:216-217 (sort of, as it's only mentioned in passing)
 // AT(compilation|execution)
 struct OmpAtClause {
@@ -4698,6 +4714,12 @@ struct OmpBlockDirective {
   CharBlock source;
 };
 
+struct OmpDeclareVariantDirective {
+  TUPLE_CLASS_BOILERPLATE(OmpDeclareVariantDirective);
+  CharBlock source;
+  std::tuple<Verbatim, std::optional<Name>, Name, OmpClauseList> t;
+};
+
 // 2.10.6 declare-target -> DECLARE TARGET (extended-list) |
 //                          DECLARE TARGET [declare-target-clause[ [,]
 //                                          declare-target-clause]...]
@@ -4776,8 +4798,8 @@ struct OpenMPDeclarativeConstruct {
   std::variant<OpenMPDeclarativeAllocate, OpenMPDeclarativeAssumes,
       OpenMPDeclareMapperConstruct, OpenMPDeclareReductionConstruct,
       OpenMPDeclareSimdConstruct, OpenMPDeclareTargetConstruct,
-      OpenMPThreadprivate, OpenMPRequiresConstruct, OpenMPUtilityConstruct,
-      OmpMetadirectiveDirective>
+      OmpDeclareVariantDirective, OpenMPThreadprivate, OpenMPRequiresConstruct,
+      OpenMPUtilityConstruct, OmpMetadirectiveDirective>
       u;
 };
 
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 36d926a..1d997ab 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -754,12 +754,12 @@ public:
       // OpenMP data-copying attribute
       OmpCopyIn, OmpCopyPrivate,
       // OpenMP miscellaneous flags
-      OmpCommonBlock, OmpReduction, OmpAligned, OmpNontemporal, OmpAllocate,
-      OmpDeclarativeAllocateDirective, OmpExecutableAllocateDirective,
-      OmpDeclareSimd, OmpDeclareTarget, OmpThreadprivate, OmpDeclareReduction,
-      OmpFlushed, OmpCriticalLock, OmpIfSpecified, OmpNone, OmpPreDetermined,
-      OmpImplicit, OmpDependObject, OmpInclusiveScan, OmpExclusiveScan,
-      OmpInScanReduction);
+      OmpCommonBlock, OmpReduction, OmpInReduction, OmpAligned, OmpNontemporal,
+      OmpAllocate, OmpDeclarativeAllocateDirective,
+      OmpExecutableAllocateDirective, OmpDeclareSimd, OmpDeclareTarget,
+      OmpThreadprivate, OmpDeclareReduction, OmpFlushed, OmpCriticalLock,
+      OmpIfSpecified, OmpNone, OmpPreDetermined, OmpImplicit, OmpDependObject,
+      OmpInclusiveScan, OmpExclusiveScan, OmpInScanReduction);
   using Flags = common::EnumSet<Flag, Flag_enumSize>;
 
   const Scope &owner() const { return *owner_; }
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 28f2f69..238079a 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1615,13 +1615,10 @@ void CompilerInvocation::setDefaultPredefinitions() {
   }
 
   llvm::Triple targetTriple{llvm::Triple(this->targetOpts.triple)};
-  if (targetTriple.isPPC()) {
-    // '__powerpc__' is a generic macro for any PowerPC cases. e.g. Max integer
-    // size.
-    fortranOptions.predefinitions.emplace_back("__powerpc__", "1");
-  }
   if (targetTriple.isOSLinux()) {
     fortranOptions.predefinitions.emplace_back("__linux__", "1");
+  } else if (targetTriple.isOSAIX()) {
+    fortranOptions.predefinitions.emplace_back("_AIX", "1");
   }
 
   switch (targetTriple.getArch()) {
@@ -1631,6 +1628,16 @@ void CompilerInvocation::setDefaultPredefinitions() {
     fortranOptions.predefinitions.emplace_back("__x86_64__", "1");
     fortranOptions.predefinitions.emplace_back("__x86_64", "1");
     break;
+  case llvm::Triple::ArchType::ppc:
+  case llvm::Triple::ArchType::ppc64:
+  case llvm::Triple::ArchType::ppcle:
+  case llvm::Triple::ArchType::ppc64le:
+    // '__powerpc__' is a generic macro for any PowerPC.
+    fortranOptions.predefinitions.emplace_back("__powerpc__", "1");
+    if (targetTriple.isOSAIX() && targetTriple.isArch64Bit()) {
+      fortranOptions.predefinitions.emplace_back("__64BIT__", "1");
+    }
+    break;
   }
 }
 
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 72c63e4..8da0525 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -94,10 +94,11 @@ struct IncrementLoopInfo {
   template <typename T>
   explicit IncrementLoopInfo(Fortran::semantics::Symbol &sym, const T &lower,
                              const T &upper, const std::optional<T> &step,
-                             bool isUnordered = false)
+                             bool isConcurrent = false)
       : loopVariableSym{&sym}, lowerExpr{Fortran::semantics::GetExpr(lower)},
         upperExpr{Fortran::semantics::GetExpr(upper)},
-        stepExpr{Fortran::semantics::GetExpr(step)}, isUnordered{isUnordered} {}
+        stepExpr{Fortran::semantics::GetExpr(step)},
+        isConcurrent{isConcurrent} {}
 
   IncrementLoopInfo(IncrementLoopInfo &&) = default;
   IncrementLoopInfo &operator=(IncrementLoopInfo &&x) = default;
@@ -120,7 +121,7 @@ struct IncrementLoopInfo {
   const Fortran::lower::SomeExpr *upperExpr;
   const Fortran::lower::SomeExpr *stepExpr;
   const Fortran::lower::SomeExpr *maskExpr = nullptr;
-  bool isUnordered; // do concurrent, forall
+  bool isConcurrent;
   llvm::SmallVector<const Fortran::semantics::Symbol *> localSymList;
   llvm::SmallVector<const Fortran::semantics::Symbol *> localInitSymList;
   llvm::SmallVector<
@@ -130,7 +131,7 @@ struct IncrementLoopInfo {
   mlir::Value loopVariable = nullptr;
 
   // Data members for structured loops.
-  fir::DoLoopOp doLoop = nullptr;
+  mlir::Operation *loopOp = nullptr;
 
   // Data members for unstructured loops.
   bool hasRealControl = false;
@@ -1981,7 +1982,7 @@ private:
     llvm_unreachable("illegal reduction operator");
   }
 
-  /// Collect DO CONCURRENT or FORALL loop control information.
+  /// Collect DO CONCURRENT loop control information.
   IncrementLoopNestInfo getConcurrentControl(
       const Fortran::parser::ConcurrentHeader &header,
       const std::list<Fortran::parser::LocalitySpec> &localityList = {}) {
@@ -2292,8 +2293,14 @@ private:
     mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get(
         builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua,
         /*unroll_and_jam*/ uja, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});
-    if (has_attrs)
-      info.doLoop.setLoopAnnotationAttr(la);
+    if (has_attrs) {
+      if (auto loopOp = mlir::dyn_cast<fir::DoLoopOp>(info.loopOp))
+        loopOp.setLoopAnnotationAttr(la);
+
+      if (auto doConcurrentOp =
+              mlir::dyn_cast<fir::DoConcurrentLoopOp>(info.loopOp))
+        doConcurrentOp.setLoopAnnotationAttr(la);
+    }
   }
 
   /// Generate FIR to begin a structured or unstructured increment loop nest.
@@ -2302,96 +2309,77 @@ private:
       llvm::SmallVectorImpl<const Fortran::parser::CompilerDirective *> &dirs) {
     assert(!incrementLoopNestInfo.empty() && "empty loop nest");
     mlir::Location loc = toLocation();
-    mlir::Operation *boundsAndStepIP = nullptr;
     mlir::arith::IntegerOverflowFlags iofBackup{};
 
+    llvm::SmallVector<mlir::Value> nestLBs;
+    llvm::SmallVector<mlir::Value> nestUBs;
+    llvm::SmallVector<mlir::Value> nestSts;
+    llvm::SmallVector<mlir::Value> nestReduceOperands;
+    llvm::SmallVector<mlir::Attribute> nestReduceAttrs;
+    bool genDoConcurrent = false;
+
     for (IncrementLoopInfo &info : incrementLoopNestInfo) {
-      mlir::Value lowerValue;
-      mlir::Value upperValue;
-      mlir::Value stepValue;
+      genDoConcurrent = info.isStructured() && info.isConcurrent;
 
-      {
-        mlir::OpBuilder::InsertionGuard guard(*builder);
+      if (!genDoConcurrent)
+        info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym,
+                                                   info.isConcurrent);
 
-        // Set the IP before the first loop in the nest so that all nest bounds
-        // and step values are created outside the nest.
-        if (boundsAndStepIP)
-          builder->setInsertionPointAfter(boundsAndStepIP);
+      if (!getLoweringOptions().getIntegerWrapAround()) {
+        iofBackup = builder->getIntegerOverflowFlags();
+        builder->setIntegerOverflowFlags(
+            mlir::arith::IntegerOverflowFlags::nsw);
+      }
 
-        info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym,
-                                                   info.isUnordered);
-        if (!getLoweringOptions().getIntegerWrapAround()) {
-          iofBackup = builder->getIntegerOverflowFlags();
-          builder->setIntegerOverflowFlags(
-              mlir::arith::IntegerOverflowFlags::nsw);
-        }
-        lowerValue = genControlValue(info.lowerExpr, info);
-        upperValue = genControlValue(info.upperExpr, info);
-        bool isConst = true;
-        stepValue = genControlValue(info.stepExpr, info,
-                                    info.isStructured() ? nullptr : &isConst);
-        if (!getLoweringOptions().getIntegerWrapAround())
-          builder->setIntegerOverflowFlags(iofBackup);
-        boundsAndStepIP = stepValue.getDefiningOp();
-
-        // Use a temp variable for unstructured loops with non-const step.
-        if (!isConst) {
-          info.stepVariable =
-              builder->createTemporary(loc, stepValue.getType());
-          boundsAndStepIP =
-              builder->create<fir::StoreOp>(loc, stepValue, info.stepVariable);
+      nestLBs.push_back(genControlValue(info.lowerExpr, info));
+      nestUBs.push_back(genControlValue(info.upperExpr, info));
+      bool isConst = true;
+      nestSts.push_back(genControlValue(
+          info.stepExpr, info, info.isStructured() ? nullptr : &isConst));
+
+      if (!getLoweringOptions().getIntegerWrapAround())
+        builder->setIntegerOverflowFlags(iofBackup);
+
+      // Use a temp variable for unstructured loops with non-const step.
+      if (!isConst) {
+        mlir::Value stepValue = nestSts.back();
+        info.stepVariable = builder->createTemporary(loc, stepValue.getType());
+        builder->create<fir::StoreOp>(loc, stepValue, info.stepVariable);
+      }
+
+      if (genDoConcurrent && nestReduceOperands.empty()) {
+        // Create DO CONCURRENT reduce operands and attributes
+        for (const auto &reduceSym : info.reduceSymList) {
+          const fir::ReduceOperationEnum reduceOperation = reduceSym.first;
+          const Fortran::semantics::Symbol *sym = reduceSym.second;
+          fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr);
+          nestReduceOperands.push_back(fir::getBase(exv));
+          auto reduceAttr =
+              fir::ReduceAttr::get(builder->getContext(), reduceOperation);
+          nestReduceAttrs.push_back(reduceAttr);
         }
       }
+    }
 
+    for (auto [info, lowerValue, upperValue, stepValue] :
+         llvm::zip_equal(incrementLoopNestInfo, nestLBs, nestUBs, nestSts)) {
       // Structured loop - generate fir.do_loop.
       if (info.isStructured()) {
+        if (genDoConcurrent)
+          continue;
+
+        // The loop variable is a doLoop op argument.
         mlir::Type loopVarType = info.getLoopVariableType();
-        mlir::Value loopValue;
-        if (info.isUnordered) {
-          llvm::SmallVector<mlir::Value> reduceOperands;
-          llvm::SmallVector<mlir::Attribute> reduceAttrs;
-          // Create DO CONCURRENT reduce operands and attributes
-          for (const auto &reduceSym : info.reduceSymList) {
-            const fir::ReduceOperationEnum reduce_operation = reduceSym.first;
-            const Fortran::semantics::Symbol *sym = reduceSym.second;
-            fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr);
-            reduceOperands.push_back(fir::getBase(exv));
-            auto reduce_attr =
-                fir::ReduceAttr::get(builder->getContext(), reduce_operation);
-            reduceAttrs.push_back(reduce_attr);
-          }
-          // The loop variable value is explicitly updated.
-          info.doLoop = builder->create<fir::DoLoopOp>(
-              loc, lowerValue, upperValue, stepValue, /*unordered=*/true,
-              /*finalCountValue=*/false, /*iterArgs=*/std::nullopt,
-              llvm::ArrayRef<mlir::Value>(reduceOperands), reduceAttrs);
-          builder->setInsertionPointToStart(info.doLoop.getBody());
-          loopValue = builder->createConvert(loc, loopVarType,
-                                             info.doLoop.getInductionVar());
-        } else {
-          // The loop variable is a doLoop op argument.
-          info.doLoop = builder->create<fir::DoLoopOp>(
-              loc, lowerValue, upperValue, stepValue, /*unordered=*/false,
-              /*finalCountValue=*/true,
-              builder->createConvert(loc, loopVarType, lowerValue));
-          builder->setInsertionPointToStart(info.doLoop.getBody());
-          loopValue = info.doLoop.getRegionIterArgs()[0];
-        }
+        auto loopOp = builder->create<fir::DoLoopOp>(
+            loc, lowerValue, upperValue, stepValue, /*unordered=*/false,
+            /*finalCountValue=*/true,
+            builder->createConvert(loc, loopVarType, lowerValue));
+        info.loopOp = loopOp;
+        builder->setInsertionPointToStart(loopOp.getBody());
+        mlir::Value loopValue = loopOp.getRegionIterArgs()[0];
+
         // Update the loop variable value in case it has non-index references.
         builder->create<fir::StoreOp>(loc, loopValue, info.loopVariable);
-        if (info.maskExpr) {
-          Fortran::lower::StatementContext stmtCtx;
-          mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx);
-          stmtCtx.finalizeAndReset();
-          mlir::Value maskCondCast =
-              builder->createConvert(loc, builder->getI1Type(), maskCond);
-          auto ifOp = builder->create<fir::IfOp>(loc, maskCondCast,
-                                                 /*withElseRegion=*/false);
-          builder->setInsertionPointToStart(&ifOp.getThenRegion().front());
-        }
-        if (info.hasLocalitySpecs())
-          handleLocalitySpecs(info);
-
         addLoopAnnotationAttr(info, dirs);
         continue;
       }
@@ -2455,6 +2443,60 @@ private:
         builder->restoreInsertionPoint(insertPt);
       }
     }
+
+    if (genDoConcurrent) {
+      auto loopWrapperOp = builder->create<fir::DoConcurrentOp>(loc);
+      builder->setInsertionPointToStart(
+          builder->createBlock(&loopWrapperOp.getRegion()));
+
+      for (IncrementLoopInfo &info : llvm::reverse(incrementLoopNestInfo)) {
+        info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym,
+                                                   info.isConcurrent);
+      }
+
+      builder->setInsertionPointToEnd(loopWrapperOp.getBody());
+      auto loopOp = builder->create<fir::DoConcurrentLoopOp>(
+          loc, nestLBs, nestUBs, nestSts, nestReduceOperands,
+          nestReduceAttrs.empty()
+              ? nullptr
+              : mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs),
+          nullptr);
+
+      llvm::SmallVector<mlir::Type> loopBlockArgTypes(
+          incrementLoopNestInfo.size(), builder->getIndexType());
+      llvm::SmallVector<mlir::Location> loopBlockArgLocs(
+          incrementLoopNestInfo.size(), loc);
+      mlir::Region &loopRegion = loopOp.getRegion();
+      mlir::Block *loopBlock = builder->createBlock(
+          &loopRegion, loopRegion.begin(), loopBlockArgTypes, loopBlockArgLocs);
+      builder->setInsertionPointToStart(loopBlock);
+
+      for (auto [info, blockArg] :
+           llvm::zip_equal(incrementLoopNestInfo, loopBlock->getArguments())) {
+        info.loopOp = loopOp;
+        mlir::Value loopValue =
+            builder->createConvert(loc, info.getLoopVariableType(), blockArg);
+        builder->create<fir::StoreOp>(loc, loopValue, info.loopVariable);
+
+        if (info.maskExpr) {
+          Fortran::lower::StatementContext stmtCtx;
+          mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx);
+          stmtCtx.finalizeAndReset();
+          mlir::Value maskCondCast =
+              builder->createConvert(loc, builder->getI1Type(), maskCond);
+          auto ifOp = builder->create<fir::IfOp>(loc, maskCondCast,
+                                                 /*withElseRegion=*/false);
+          builder->setInsertionPointToStart(&ifOp.getThenRegion().front());
+        }
+      }
+
+      IncrementLoopInfo &innermostInfo = incrementLoopNestInfo.back();
+
+      if (innermostInfo.hasLocalitySpecs())
+        handleLocalitySpecs(innermostInfo);
+
+      addLoopAnnotationAttr(innermostInfo, dirs);
+    }
   }
 
   /// Generate FIR to end a structured or unstructured increment loop nest.
@@ -2471,29 +2513,31 @@ private:
          it != rend; ++it) {
       IncrementLoopInfo &info = *it;
       if (info.isStructured()) {
-        // End fir.do_loop.
-        if (info.isUnordered) {
-          builder->setInsertionPointAfter(info.doLoop);
+        // End fir.do_concurent.loop.
+        if (info.isConcurrent) {
+          builder->setInsertionPointAfter(info.loopOp->getParentOp());
           continue;
         }
+
+        // End fir.do_loop.
         // Decrement tripVariable.
-        builder->setInsertionPointToEnd(info.doLoop.getBody());
+        auto doLoopOp = mlir::cast<fir::DoLoopOp>(info.loopOp);
+        builder->setInsertionPointToEnd(doLoopOp.getBody());
         llvm::SmallVector<mlir::Value, 2> results;
         results.push_back(builder->create<mlir::arith::AddIOp>(
-            loc, info.doLoop.getInductionVar(), info.doLoop.getStep(),
-            iofAttr));
+            loc, doLoopOp.getInductionVar(), doLoopOp.getStep(), iofAttr));
         // Step loopVariable to help optimizations such as vectorization.
         // Induction variable elimination will clean up as necessary.
         mlir::Value step = builder->createConvert(
-            loc, info.getLoopVariableType(), info.doLoop.getStep());
+            loc, info.getLoopVariableType(), doLoopOp.getStep());
         mlir::Value loopVar =
             builder->create<fir::LoadOp>(loc, info.loopVariable);
         results.push_back(
             builder->create<mlir::arith::AddIOp>(loc, loopVar, step, iofAttr));
         builder->create<fir::ResultOp>(loc, results);
-        builder->setInsertionPointAfter(info.doLoop);
+        builder->setInsertionPointAfter(doLoopOp);
         // The loop control variable may be used after the loop.
-        builder->create<fir::StoreOp>(loc, info.doLoop.getResult(1),
+        builder->create<fir::StoreOp>(loc, doLoopOp.getResult(1),
                                       info.loopVariable);
         continue;
       }
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index b277c0d..372c71b6 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -145,11 +145,10 @@ static bool isConstant(const Fortran::semantics::Symbol &sym) {
          sym.test(Fortran::semantics::Symbol::Flag::ReadOnly);
 }
 
-static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
-                                  const Fortran::lower::pft::Variable &var,
-                                  llvm::StringRef globalName,
-                                  mlir::StringAttr linkage,
-                                  cuf::DataAttributeAttr dataAttr = {});
+/// Call \p genInit to generate code inside \p global initializer region.
+static void
+createGlobalInitialization(fir::FirOpBuilder &builder, fir::GlobalOp global,
+                           std::function<void(fir::FirOpBuilder &)> genInit);
 
 static mlir::Location genLocation(Fortran::lower::AbstractConverter &converter,
                                   const Fortran::semantics::Symbol &sym) {
@@ -467,9 +466,9 @@ static bool globalIsInitialized(fir::GlobalOp global) {
 }
 
 /// Call \p genInit to generate code inside \p global initializer region.
-void Fortran::lower::createGlobalInitialization(
-    fir::FirOpBuilder &builder, fir::GlobalOp global,
-    std::function<void(fir::FirOpBuilder &)> genInit) {
+static void
+createGlobalInitialization(fir::FirOpBuilder &builder, fir::GlobalOp global,
+                           std::function<void(fir::FirOpBuilder &)> genInit) {
   mlir::Region &region = global.getRegion();
   region.push_back(new mlir::Block);
   mlir::Block &block = region.back();
@@ -479,7 +478,7 @@ void Fortran::lower::createGlobalInitialization(
   builder.restoreInsertionPoint(insertPt);
 }
 
-static unsigned getAllocatorIdx(cuf::DataAttributeAttr dataAttr) {
+static unsigned getAllocatorIdxFromDataAttr(cuf::DataAttributeAttr dataAttr) {
   if (dataAttr) {
     if (dataAttr.getValue() == cuf::DataAttribute::Pinned)
       return kPinnedAllocatorPos;
@@ -494,11 +493,10 @@ static unsigned getAllocatorIdx(cuf::DataAttributeAttr dataAttr) {
 }
 
 /// Create the global op and its init if it has one
-static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
-                                  const Fortran::lower::pft::Variable &var,
-                                  llvm::StringRef globalName,
-                                  mlir::StringAttr linkage,
-                                  cuf::DataAttributeAttr dataAttr) {
+fir::GlobalOp Fortran::lower::defineGlobal(
+    Fortran::lower::AbstractConverter &converter,
+    const Fortran::lower::pft::Variable &var, llvm::StringRef globalName,
+    mlir::StringAttr linkage, cuf::DataAttributeAttr dataAttr) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   const Fortran::semantics::Symbol &sym = var.getSymbol();
   mlir::Location loc = genLocation(converter, sym);
@@ -545,27 +543,25 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
         sym.detailsIf<Fortran::semantics::ObjectEntityDetails>();
     if (details && details->init()) {
       auto expr = *details->init();
-      Fortran::lower::createGlobalInitialization(
-          builder, global, [&](fir::FirOpBuilder &b) {
-            mlir::Value box = Fortran::lower::genInitialDataTarget(
-                converter, loc, symTy, expr);
-            b.create<fir::HasValueOp>(loc, box);
-          });
+      createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) {
+        mlir::Value box =
+            Fortran::lower::genInitialDataTarget(converter, loc, symTy, expr);
+        b.create<fir::HasValueOp>(loc, box);
+      });
     } else {
       // Create unallocated/disassociated descriptor if no explicit init
-      Fortran::lower::createGlobalInitialization(
-          builder, global, [&](fir::FirOpBuilder &b) {
-            mlir::Value box = fir::factory::createUnallocatedBox(
-                b, loc, symTy,
-                /*nonDeferredParams=*/std::nullopt,
-                /*typeSourceBox=*/{}, getAllocatorIdx(dataAttr));
-            b.create<fir::HasValueOp>(loc, box);
-          });
+      createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) {
+        mlir::Value box = fir::factory::createUnallocatedBox(
+            b, loc, symTy,
+            /*nonDeferredParams=*/std::nullopt,
+            /*typeSourceBox=*/{}, getAllocatorIdxFromDataAttr(dataAttr));
+        b.create<fir::HasValueOp>(loc, box);
+      });
     }
   } else if (const auto *details =
                  sym.detailsIf<Fortran::semantics::ObjectEntityDetails>()) {
     if (details->init()) {
-      Fortran::lower::createGlobalInitialization(
+      createGlobalInitialization(
           builder, global, [&](fir::FirOpBuilder &builder) {
             Fortran::lower::StatementContext stmtCtx(
                 /*cleanupProhibited=*/true);
@@ -576,7 +572,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
             builder.create<fir::HasValueOp>(loc, castTo);
           });
     } else if (Fortran::lower::hasDefaultInitialization(sym)) {
-      Fortran::lower::createGlobalInitialization(
+      createGlobalInitialization(
           builder, global, [&](fir::FirOpBuilder &builder) {
             Fortran::lower::StatementContext stmtCtx(
                 /*cleanupProhibited=*/true);
@@ -591,7 +587,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
     if (details && details->init()) {
       auto sym{*details->init()};
       if (sym) // Has a procedure target.
-        Fortran::lower::createGlobalInitialization(
+        createGlobalInitialization(
             builder, global, [&](fir::FirOpBuilder &b) {
               Fortran::lower::StatementContext stmtCtx(
                   /*cleanupProhibited=*/true);
@@ -601,19 +597,17 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
               b.create<fir::HasValueOp>(loc, castTo);
             });
       else { // Has NULL() target.
-        Fortran::lower::createGlobalInitialization(
-            builder, global, [&](fir::FirOpBuilder &b) {
-              auto box{fir::factory::createNullBoxProc(b, loc, symTy)};
-              b.create<fir::HasValueOp>(loc, box);
-            });
+        createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) {
+          auto box{fir::factory::createNullBoxProc(b, loc, symTy)};
+          b.create<fir::HasValueOp>(loc, box);
+        });
       }
     } else {
       // No initialization.
-      Fortran::lower::createGlobalInitialization(
-          builder, global, [&](fir::FirOpBuilder &b) {
-            auto box{fir::factory::createNullBoxProc(b, loc, symTy)};
-            b.create<fir::HasValueOp>(loc, box);
-          });
+      createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) {
+        auto box{fir::factory::createNullBoxProc(b, loc, symTy)};
+        b.create<fir::HasValueOp>(loc, box);
+      });
     }
   } else if (sym.has<Fortran::semantics::CommonBlockDetails>()) {
     mlir::emitError(loc, "COMMON symbol processed elsewhere");
@@ -634,7 +628,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
     // file.
     if (sym.attrs().test(Fortran::semantics::Attr::BIND_C))
       global.setLinkName(builder.createCommonLinkage());
-    Fortran::lower::createGlobalInitialization(
+    createGlobalInitialization(
         builder, global, [&](fir::FirOpBuilder &builder) {
           mlir::Value initValue;
           if (converter.getLoweringOptions().getInitGlobalZero())
@@ -826,7 +820,7 @@ void Fortran::lower::defaultInitializeAtRuntime(
                                       /*isConst=*/true,
                                       /*isTarget=*/false,
                                       /*dataAttr=*/{});
-        Fortran::lower::createGlobalInitialization(
+        createGlobalInitialization(
             builder, global, [&](fir::FirOpBuilder &builder) {
               Fortran::lower::StatementContext stmtCtx(
                   /*cleanupProhibited=*/true);
@@ -842,7 +836,7 @@ void Fortran::lower::defaultInitializeAtRuntime(
                                       /*isConst=*/true,
                                       /*isTarget=*/false,
                                       /*dataAttr=*/{});
-        Fortran::lower::createGlobalInitialization(
+        createGlobalInitialization(
             builder, global, [&](fir::FirOpBuilder &builder) {
               Fortran::lower::StatementContext stmtCtx(
                   /*cleanupProhibited=*/true);
@@ -1207,7 +1201,7 @@ static fir::GlobalOp defineGlobalAggregateStore(
     if (const auto *objectDetails =
             initSym->detailsIf<Fortran::semantics::ObjectEntityDetails>())
       if (objectDetails->init()) {
-        Fortran::lower::createGlobalInitialization(
+        createGlobalInitialization(
             builder, global, [&](fir::FirOpBuilder &builder) {
               Fortran::lower::StatementContext stmtCtx;
               mlir::Value initVal = fir::getBase(genInitializerExprValue(
@@ -1219,12 +1213,11 @@ static fir::GlobalOp defineGlobalAggregateStore(
   // Equivalence has no Fortran initial value. Create an undefined FIR initial
   // value to ensure this is consider an object definition in the IR regardless
   // of the linkage.
-  Fortran::lower::createGlobalInitialization(
-      builder, global, [&](fir::FirOpBuilder &builder) {
-        Fortran::lower::StatementContext stmtCtx;
-        mlir::Value initVal = builder.create<fir::ZeroOp>(loc, aggTy);
-        builder.create<fir::HasValueOp>(loc, initVal);
-      });
+  createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &builder) {
+    Fortran::lower::StatementContext stmtCtx;
+    mlir::Value initVal = builder.create<fir::ZeroOp>(loc, aggTy);
+    builder.create<fir::HasValueOp>(loc, initVal);
+  });
   return global;
 }
 
@@ -1543,7 +1536,7 @@ static void finalizeCommonBlockDefinition(
     LLVM_DEBUG(llvm::dbgs() << "}\n");
     builder.create<fir::HasValueOp>(loc, cb);
   };
-  Fortran::lower::createGlobalInitialization(builder, global, initFunc);
+  createGlobalInitialization(builder, global, initFunc);
 }
 
 void Fortran::lower::defineCommonBlocks(
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 77b4622..318455f 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -983,6 +983,29 @@ bool ClauseProcessor::processIf(
   });
   return found;
 }
+bool ClauseProcessor::processInReduction(
+    mlir::Location currentLocation, mlir::omp::InReductionClauseOps &result,
+    llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const {
+  return findRepeatableClause<omp::clause::InReduction>(
+      [&](const omp::clause::InReduction &clause, const parser::CharBlock &) {
+        llvm::SmallVector<mlir::Value> inReductionVars;
+        llvm::SmallVector<bool> inReduceVarByRef;
+        llvm::SmallVector<mlir::Attribute> inReductionDeclSymbols;
+        llvm::SmallVector<const semantics::Symbol *> inReductionSyms;
+        ReductionProcessor rp;
+        rp.processReductionArguments<omp::clause::InReduction>(
+            currentLocation, converter, clause, inReductionVars,
+            inReduceVarByRef, inReductionDeclSymbols, inReductionSyms);
+
+        // Copy local lists into the output.
+        llvm::copy(inReductionVars, std::back_inserter(result.inReductionVars));
+        llvm::copy(inReduceVarByRef,
+                   std::back_inserter(result.inReductionByref));
+        llvm::copy(inReductionDeclSymbols,
+                   std::back_inserter(result.inReductionSyms));
+        llvm::copy(inReductionSyms, std::back_inserter(outReductionSyms));
+      });
+}
 
 bool ClauseProcessor::processIsDevicePtr(
     mlir::omp::IsDevicePtrClauseOps &result,
@@ -1257,9 +1280,9 @@ bool ClauseProcessor::processReduction(
         llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
         llvm::SmallVector<const semantics::Symbol *> reductionSyms;
         ReductionProcessor rp;
-        rp.processReductionArguments(
+        rp.processReductionArguments<omp::clause::Reduction>(
             currentLocation, converter, clause, reductionVars, reduceVarByRef,
-            reductionDeclSymbols, reductionSyms, result.reductionMod);
+            reductionDeclSymbols, reductionSyms, &result.reductionMod);
         // Copy local lists into the output.
         llvm::copy(reductionVars, std::back_inserter(result.reductionVars));
         llvm::copy(reduceVarByRef, std::back_inserter(result.reductionByref));
@@ -1269,6 +1292,30 @@ bool ClauseProcessor::processReduction(
       });
 }
 
+bool ClauseProcessor::processTaskReduction(
+    mlir::Location currentLocation, mlir::omp::TaskReductionClauseOps &result,
+    llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const {
+  return findRepeatableClause<omp::clause::TaskReduction>(
+      [&](const omp::clause::TaskReduction &clause, const parser::CharBlock &) {
+        llvm::SmallVector<mlir::Value> taskReductionVars;
+        llvm::SmallVector<bool> TaskReduceVarByRef;
+        llvm::SmallVector<mlir::Attribute> TaskReductionDeclSymbols;
+        llvm::SmallVector<const semantics::Symbol *> TaskReductionSyms;
+        ReductionProcessor rp;
+        rp.processReductionArguments<omp::clause::TaskReduction>(
+            currentLocation, converter, clause, taskReductionVars,
+            TaskReduceVarByRef, TaskReductionDeclSymbols, TaskReductionSyms);
+        // Copy local lists into the output.
+        llvm::copy(taskReductionVars,
+                   std::back_inserter(result.taskReductionVars));
+        llvm::copy(TaskReduceVarByRef,
+                   std::back_inserter(result.taskReductionByref));
+        llvm::copy(TaskReductionDeclSymbols,
+                   std::back_inserter(result.taskReductionSyms));
+        llvm::copy(TaskReductionSyms, std::back_inserter(outReductionSyms));
+      });
+}
+
 bool ClauseProcessor::processTo(
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const {
   return findRepeatableClause<omp::clause::To>(
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index bdddeb1..3d3f26f 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -112,6 +112,9 @@ public:
   processEnter(llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const;
   bool processIf(omp::clause::If::DirectiveNameModifier directiveName,
                  mlir::omp::IfClauseOps &result) const;
+  bool processInReduction(
+      mlir::Location currentLocation, mlir::omp::InReductionClauseOps &result,
+      llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const;
   bool processIsDevicePtr(
       mlir::omp::IsDevicePtrClauseOps &result,
       llvm::SmallVectorImpl<const semantics::Symbol *> &isDeviceSyms) const;
@@ -133,6 +136,9 @@ public:
   bool processReduction(
       mlir::Location currentLocation, mlir::omp::ReductionClauseOps &result,
       llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) const;
+  bool processTaskReduction(
+      mlir::Location currentLocation, mlir::omp::TaskReductionClauseOps &result,
+      llvm::SmallVectorImpl<const semantics::Symbol *> &outReductionSyms) const;
   bool processTo(llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const;
   bool processUseDeviceAddr(
       lower::StatementContext &stmtCtx,
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index fcd3de9..cc793c6 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -662,32 +662,9 @@ static fir::GlobalOp globalInitialization(lower::AbstractConverter &converter,
                                           const semantics::Symbol &sym,
                                           const lower::pft::Variable &var,
                                           mlir::Location currentLocation) {
-  mlir::Type ty = converter.genType(sym);
   std::string globalName = converter.mangleName(sym);
   mlir::StringAttr linkage = firOpBuilder.createInternalLinkage();
-  fir::GlobalOp global =
-      firOpBuilder.createGlobal(currentLocation, ty, globalName, linkage);
-
-  // Create default initialization for non-character scalar.
-  if (semantics::IsAllocatableOrObjectPointer(&sym)) {
-    mlir::Type baseAddrType = mlir::dyn_cast<fir::BoxType>(ty).getEleTy();
-    lower::createGlobalInitialization(
-        firOpBuilder, global, [&](fir::FirOpBuilder &b) {
-          mlir::Value nullAddr =
-              b.createNullConstant(currentLocation, baseAddrType);
-          mlir::Value box =
-              b.create<fir::EmboxOp>(currentLocation, ty, nullAddr);
-          b.create<fir::HasValueOp>(currentLocation, box);
-        });
-  } else {
-    lower::createGlobalInitialization(
-        firOpBuilder, global, [&](fir::FirOpBuilder &b) {
-          mlir::Value undef = b.create<fir::UndefOp>(currentLocation, ty);
-          b.create<fir::HasValueOp>(currentLocation, undef);
-        });
-  }
-
-  return global;
+  return Fortran::lower::defineGlobal(converter, var, globalName, linkage);
 }
 
 // Get the extended value for \p val by extracting additional variable
@@ -1774,34 +1751,34 @@ static void genTargetEnterExitUpdateDataClauses(
   cp.processNowait(clauseOps);
 }
 
-static void genTaskClauses(lower::AbstractConverter &converter,
-                           semantics::SemanticsContext &semaCtx,
-                           lower::SymMap &symTable,
-                           lower::StatementContext &stmtCtx,
-                           const List<Clause> &clauses, mlir::Location loc,
-                           mlir::omp::TaskOperands &clauseOps) {
+static void genTaskClauses(
+    lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
+    lower::SymMap &symTable, lower::StatementContext &stmtCtx,
+    const List<Clause> &clauses, mlir::Location loc,
+    mlir::omp::TaskOperands &clauseOps,
+    llvm::SmallVectorImpl<const semantics::Symbol *> &inReductionSyms) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
   cp.processDepend(symTable, stmtCtx, clauseOps);
   cp.processFinal(stmtCtx, clauseOps);
   cp.processIf(llvm::omp::Directive::OMPD_task, clauseOps);
+  cp.processInReduction(loc, clauseOps, inReductionSyms);
   cp.processMergeable(clauseOps);
   cp.processPriority(stmtCtx, clauseOps);
   cp.processUntied(clauseOps);
   cp.processDetach(clauseOps);
 
-  cp.processTODO<clause::Affinity, clause::InReduction>(
-      loc, llvm::omp::Directive::OMPD_task);
+  cp.processTODO<clause::Affinity>(loc, llvm::omp::Directive::OMPD_task);
 }
 
-static void genTaskgroupClauses(lower::AbstractConverter &converter,
-                                semantics::SemanticsContext &semaCtx,
-                                const List<Clause> &clauses, mlir::Location loc,
-                                mlir::omp::TaskgroupOperands &clauseOps) {
+static void genTaskgroupClauses(
+    lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
+    const List<Clause> &clauses, mlir::Location loc,
+    mlir::omp::TaskgroupOperands &clauseOps,
+    llvm::SmallVectorImpl<const semantics::Symbol *> &taskReductionSyms) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
-  cp.processTODO<clause::TaskReduction>(loc,
-                                        llvm::omp::Directive::OMPD_taskgroup);
+  cp.processTaskReduction(loc, clauseOps, taskReductionSyms);
 }
 
 static void genTaskloopClauses(lower::AbstractConverter &converter,
@@ -2496,8 +2473,9 @@ genTaskOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
           mlir::Location loc, const ConstructQueue &queue,
           ConstructQueue::const_iterator item) {
   mlir::omp::TaskOperands clauseOps;
+  llvm::SmallVector<const semantics::Symbol *> inReductionSyms;
   genTaskClauses(converter, semaCtx, symTable, stmtCtx, item->clauses, loc,
-                 clauseOps);
+                 clauseOps, inReductionSyms);
 
   if (!enableDelayedPrivatization)
     return genOpWithBody<mlir::omp::TaskOp>(
@@ -2514,6 +2492,8 @@ genTaskOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   EntryBlockArgs taskArgs;
   taskArgs.priv.syms = dsp.getDelayedPrivSymbols();
   taskArgs.priv.vars = clauseOps.privateVars;
+  taskArgs.inReduction.syms = inReductionSyms;
+  taskArgs.inReduction.vars = clauseOps.inReductionVars;
 
   return genOpWithBody<mlir::omp::TaskOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
@@ -2531,12 +2511,19 @@ genTaskgroupOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
                const ConstructQueue &queue,
                ConstructQueue::const_iterator item) {
   mlir::omp::TaskgroupOperands clauseOps;
-  genTaskgroupClauses(converter, semaCtx, item->clauses, loc, clauseOps);
+  llvm::SmallVector<const semantics::Symbol *> taskReductionSyms;
+  genTaskgroupClauses(converter, semaCtx, item->clauses, loc, clauseOps,
+                      taskReductionSyms);
+
+  EntryBlockArgs taskgroupArgs;
+  taskgroupArgs.taskReduction.syms = taskReductionSyms;
+  taskgroupArgs.taskReduction.vars = clauseOps.taskReductionVars;
 
   return genOpWithBody<mlir::omp::TaskgroupOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_taskgroup)
-          .setClauses(&item->clauses),
+          .setClauses(&item->clauses)
+          .setEntryBlockArgs(&taskgroupArgs),
       queue, item, clauseOps);
 }
 
@@ -2848,9 +2835,23 @@ static void genAtomicUpdateStatement(
     lower::StatementContext atomicStmtCtx;
     mlir::Value rhsExpr = fir::getBase(converter.genExprValue(
         *semantics::GetExpr(assignmentStmtExpr), atomicStmtCtx));
-    mlir::Value convertResult =
-        firOpBuilder.createConvert(currentLocation, varType, rhsExpr);
-    firOpBuilder.create<mlir::omp::YieldOp>(currentLocation, convertResult);
+    mlir::Type exprType = fir::unwrapRefType(rhsExpr.getType());
+    if (fir::isa_complex(exprType) && !fir::isa_complex(varType)) {
+      // Emit an additional `ExtractValueOp` if the expression is of complex
+      // type
+      auto extract = firOpBuilder.create<fir::ExtractValueOp>(
+          currentLocation,
+          mlir::cast<mlir::ComplexType>(exprType).getElementType(), rhsExpr,
+          firOpBuilder.getArrayAttr(
+              firOpBuilder.getIntegerAttr(firOpBuilder.getIndexType(), 0)));
+      mlir::Value convertResult = firOpBuilder.create<fir::ConvertOp>(
+          currentLocation, varType, extract);
+      firOpBuilder.create<mlir::omp::YieldOp>(currentLocation, convertResult);
+    } else {
+      mlir::Value convertResult =
+          firOpBuilder.createConvert(currentLocation, varType, rhsExpr);
+      firOpBuilder.create<mlir::omp::YieldOp>(currentLocation, convertResult);
+    }
     converter.resetExprOverrides();
   }
   firOpBuilder.setInsertionPointAfter(atomicUpdateOp);
@@ -3792,6 +3793,13 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
   TODO(converter.getCurrentLocation(), "OpenMP ASSUMES declaration");
 }
 
+static void
+genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
+       semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
+       const parser::OmpDeclareVariantDirective &declareVariantDirective) {
+  TODO(converter.getCurrentLocation(), "OmpDeclareVariantDirective");
+}
+
 static void genOMP(
     lower::AbstractConverter &converter, lower::SymMap &symTable,
     semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index 729bd36..b8aa0de 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -25,6 +25,7 @@
 #include "flang/Parser/tools.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "llvm/Support/CommandLine.h"
+#include <type_traits>
 
 static llvm::cl::opt<bool> forceByrefReduction(
     "force-byref-reduction",
@@ -38,6 +39,37 @@ namespace Fortran {
 namespace lower {
 namespace omp {
 
+// explicit template declarations
+template void
+ReductionProcessor::processReductionArguments<omp::clause::Reduction>(
+    mlir::Location currentLocation, lower::AbstractConverter &converter,
+    const omp::clause::Reduction &reduction,
+    llvm::SmallVectorImpl<mlir::Value> &reductionVars,
+    llvm::SmallVectorImpl<bool> &reduceVarByRef,
+    llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
+    llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+    mlir::omp::ReductionModifierAttr *reductionMod);
+
+template void
+ReductionProcessor::processReductionArguments<omp::clause::TaskReduction>(
+    mlir::Location currentLocation, lower::AbstractConverter &converter,
+    const omp::clause::TaskReduction &reduction,
+    llvm::SmallVectorImpl<mlir::Value> &reductionVars,
+    llvm::SmallVectorImpl<bool> &reduceVarByRef,
+    llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
+    llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+    mlir::omp::ReductionModifierAttr *reductionMod);
+
+template void
+ReductionProcessor::processReductionArguments<omp::clause::InReduction>(
+    mlir::Location currentLocation, lower::AbstractConverter &converter,
+    const omp::clause::InReduction &reduction,
+    llvm::SmallVectorImpl<mlir::Value> &reductionVars,
+    llvm::SmallVectorImpl<bool> &reduceVarByRef,
+    llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
+    llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
+    mlir::omp::ReductionModifierAttr *reductionMod);
+
 ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType(
     const omp::clause::ProcedureDesignator &pd) {
   auto redType = llvm::StringSwitch<std::optional<ReductionIdentifier>>(
@@ -538,28 +570,30 @@ mlir::omp::ReductionModifier translateReductionModifier(ReductionModifier mod) {
   return mlir::omp::ReductionModifier::defaultmod;
 }
 
+template <class T>
 void ReductionProcessor::processReductionArguments(
     mlir::Location currentLocation, lower::AbstractConverter &converter,
-    const omp::clause::Reduction &reduction,
-    llvm::SmallVectorImpl<mlir::Value> &reductionVars,
+    const T &reduction, llvm::SmallVectorImpl<mlir::Value> &reductionVars,
     llvm::SmallVectorImpl<bool> &reduceVarByRef,
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
     llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
-    mlir::omp::ReductionModifierAttr &reductionMod) {
+    mlir::omp::ReductionModifierAttr *reductionMod) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  auto mod = std::get<std::optional<ReductionModifier>>(reduction.t);
-  if (mod.has_value()) {
-    if (mod.value() == ReductionModifier::Task)
-      TODO(currentLocation, "Reduction modifier `task` is not supported");
-    else
-      reductionMod = mlir::omp::ReductionModifierAttr::get(
-          firOpBuilder.getContext(), translateReductionModifier(mod.value()));
+  if constexpr (std::is_same_v<T, omp::clause::Reduction>) {
+    auto mod = std::get<std::optional<ReductionModifier>>(reduction.t);
+    if (mod.has_value()) {
+      if (mod.value() == ReductionModifier::Task)
+        TODO(currentLocation, "Reduction modifier `task` is not supported");
+      else
+        *reductionMod = mlir::omp::ReductionModifierAttr::get(
+            firOpBuilder.getContext(), translateReductionModifier(mod.value()));
+    }
   }
 
   mlir::omp::DeclareReductionOp decl;
   const auto &redOperatorList{
-      std::get<omp::clause::Reduction::ReductionIdentifiers>(reduction.t)};
+      std::get<typename T::ReductionIdentifiers>(reduction.t)};
   assert(redOperatorList.size() == 1 && "Expecting single operator");
   const auto &redOperator = redOperatorList.front();
   const auto &objectList{std::get<omp::ObjectList>(reduction.t)};
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/lib/Lower/OpenMP/ReductionProcessor.h
index 11baa83..a7198b4 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.h
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.h
@@ -121,14 +121,14 @@ public:
 
   /// Creates a reduction declaration and associates it with an OpenMP block
   /// directive.
+  template <class T>
   static void processReductionArguments(
       mlir::Location currentLocation, lower::AbstractConverter &converter,
-      const omp::clause::Reduction &reduction,
-      llvm::SmallVectorImpl<mlir::Value> &reductionVars,
+      const T &reduction, llvm::SmallVectorImpl<mlir::Value> &reductionVars,
       llvm::SmallVectorImpl<bool> &reduceVarByRef,
       llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
       llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols,
-      mlir::omp::ReductionModifierAttr &reductionMod);
+      mlir::omp::ReductionModifierAttr *reductionMod = nullptr);
 };
 
 template <typename FloatOp, typename IntegerOp>
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 1d6e150..86166db3 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -280,6 +280,9 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
   if (auto cufKernelOp = getRegion().getParentOfType<cuf::KernelOp>())
     return &cufKernelOp.getRegion().front();
 
+  if (auto doConcurentOp = getRegion().getParentOfType<fir::DoConcurrentOp>())
+    return doConcurentOp.getBody();
+
   return getEntryBlock();
 }
 
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 05ef691..955acbe 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -4910,6 +4910,105 @@ void fir::BoxTotalElementsOp::getCanonicalizationPatterns(
 }
 
 //===----------------------------------------------------------------------===//
+// LocalitySpecifierOp
+//===----------------------------------------------------------------------===//
+
+llvm::LogicalResult fir::LocalitySpecifierOp::verifyRegions() {
+  mlir::Type argType = getArgType();
+  auto verifyTerminator = [&](mlir::Operation *terminator,
+                              bool yieldsValue) -> llvm::LogicalResult {
+    if (!terminator->getBlock()->getSuccessors().empty())
+      return llvm::success();
+
+    if (!llvm::isa<fir::YieldOp>(terminator))
+      return mlir::emitError(terminator->getLoc())
+             << "expected exit block terminator to be an `fir.yield` op.";
+
+    YieldOp yieldOp = llvm::cast<YieldOp>(terminator);
+    mlir::TypeRange yieldedTypes = yieldOp.getResults().getTypes();
+
+    if (!yieldsValue) {
+      if (yieldedTypes.empty())
+        return llvm::success();
+
+      return mlir::emitError(terminator->getLoc())
+             << "Did not expect any values to be yielded.";
+    }
+
+    if (yieldedTypes.size() == 1 && yieldedTypes.front() == argType)
+      return llvm::success();
+
+    auto error = mlir::emitError(yieldOp.getLoc())
+                 << "Invalid yielded value. Expected type: " << argType
+                 << ", got: ";
+
+    if (yieldedTypes.empty())
+      error << "None";
+    else
+      error << yieldedTypes;
+
+    return error;
+  };
+
+  auto verifyRegion = [&](mlir::Region &region, unsigned expectedNumArgs,
+                          llvm::StringRef regionName,
+                          bool yieldsValue) -> llvm::LogicalResult {
+    assert(!region.empty());
+
+    if (region.getNumArguments() != expectedNumArgs)
+      return mlir::emitError(region.getLoc())
+             << "`" << regionName << "`: "
+             << "expected " << expectedNumArgs
+             << " region arguments, got: " << region.getNumArguments();
+
+    for (mlir::Block &block : region) {
+      // MLIR will verify the absence of the terminator for us.
+      if (!block.mightHaveTerminator())
+        continue;
+
+      if (failed(verifyTerminator(block.getTerminator(), yieldsValue)))
+        return llvm::failure();
+    }
+
+    return llvm::success();
+  };
+
+  // Ensure all of the region arguments have the same type
+  for (mlir::Region *region : getRegions())
+    for (mlir::Type ty : region->getArgumentTypes())
+      if (ty != argType)
+        return emitError() << "Region argument type mismatch: got " << ty
+                           << " expected " << argType << ".";
+
+  mlir::Region &initRegion = getInitRegion();
+  if (!initRegion.empty() &&
+      failed(verifyRegion(getInitRegion(), /*expectedNumArgs=*/2, "init",
+                          /*yieldsValue=*/true)))
+    return llvm::failure();
+
+  LocalitySpecifierType dsType = getLocalitySpecifierType();
+
+  if (dsType == LocalitySpecifierType::Local && !getCopyRegion().empty())
+    return emitError("`local` specifiers do not require a `copy` region.");
+
+  if (dsType == LocalitySpecifierType::LocalInit && getCopyRegion().empty())
+    return emitError(
+        "`local_init` specifiers require at least a `copy` region.");
+
+  if (dsType == LocalitySpecifierType::LocalInit &&
+      failed(verifyRegion(getCopyRegion(), /*expectedNumArgs=*/2, "copy",
+                          /*yieldsValue=*/true)))
+    return llvm::failure();
+
+  if (!getDeallocRegion().empty() &&
+      failed(verifyRegion(getDeallocRegion(), /*expectedNumArgs=*/1, "dealloc",
+                          /*yieldsValue=*/false)))
+    return llvm::failure();
+
+  return llvm::success();
+}
+
+//===----------------------------------------------------------------------===//
 // DoConcurrentOp
 //===----------------------------------------------------------------------===//
 
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index bfca4e3..c4728e0 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -611,6 +611,14 @@ TYPE_PARSER(sourced(construct<OmpToClause::Modifier>(
 TYPE_PARSER(sourced(construct<OmpWhenClause::Modifier>( //
     Parser<OmpContextSelector>{})))
 
+TYPE_PARSER(construct<OmpAppendArgsClause::OmpAppendOp>(
+    "INTEROP" >> parenthesized(nonemptyList(Parser<OmpInteropType>{}))))
+
+TYPE_PARSER(construct<OmpAdjustArgsClause::OmpAdjustOp>(
+    "NOTHING" >> pure(OmpAdjustArgsClause::OmpAdjustOp::Value::Nothing) ||
+    "NEED_DEVICE_PTR" >>
+        pure(OmpAdjustArgsClause::OmpAdjustOp::Value::Need_Device_Ptr)))
+
 // --- Parsers for clauses --------------------------------------------
 
 /// `MOBClause` is a clause that has a
@@ -630,6 +638,10 @@ static inline MOBClause makeMobClause(
   }
 }
 
+TYPE_PARSER(construct<OmpAdjustArgsClause>(
+    (Parser<OmpAdjustArgsClause::OmpAdjustOp>{} / ":"),
+    Parser<OmpObjectList>{}))
+
 // [5.0] 2.10.1 affinity([aff-modifier:] locator-list)
 //              aff-modifier: interator-modifier
 TYPE_PARSER(construct<OmpAffinityClause>(
@@ -653,6 +665,9 @@ TYPE_PARSER(construct<OmpAtomicDefaultMemOrderClause>(
 TYPE_PARSER(construct<OmpCancellationConstructTypeClause>(
     OmpDirectiveNameParser{}, maybe(parenthesized(scalarLogicalExpr))))
 
+TYPE_PARSER(construct<OmpAppendArgsClause>(
+    nonemptyList(Parser<OmpAppendArgsClause::OmpAppendOp>{})))
+
 // 2.15.3.1 DEFAULT (PRIVATE | FIRSTPRIVATE | SHARED | NONE)
 TYPE_PARSER(construct<OmpDefaultClause::DataSharingAttribute>(
     "PRIVATE" >> pure(OmpDefaultClause::DataSharingAttribute::Private) ||
@@ -901,6 +916,8 @@ TYPE_PARSER( //
                     parenthesized(Parser<OmpAbsentClause>{}))) ||
     "ACQUIRE" >> construct<OmpClause>(construct<OmpClause::Acquire>()) ||
     "ACQ_REL" >> construct<OmpClause>(construct<OmpClause::AcqRel>()) ||
+    "ADJUST_ARGS" >> construct<OmpClause>(construct<OmpClause::AdjustArgs>(
+                         parenthesized(Parser<OmpAdjustArgsClause>{}))) ||
     "AFFINITY" >> construct<OmpClause>(construct<OmpClause::Affinity>(
                       parenthesized(Parser<OmpAffinityClause>{}))) ||
     "ALIGN" >> construct<OmpClause>(construct<OmpClause::Align>(
@@ -909,6 +926,8 @@ TYPE_PARSER( //
                      parenthesized(Parser<OmpAlignedClause>{}))) ||
     "ALLOCATE" >> construct<OmpClause>(construct<OmpClause::Allocate>(
                       parenthesized(Parser<OmpAllocateClause>{}))) ||
+    "APPEND_ARGS" >> construct<OmpClause>(construct<OmpClause::AppendArgs>(
+                         parenthesized(Parser<OmpAppendArgsClause>{}))) ||
     "ALLOCATOR" >> construct<OmpClause>(construct<OmpClause::Allocator>(
                        parenthesized(scalarIntExpr))) ||
     "AT" >> construct<OmpClause>(construct<OmpClause::At>(
@@ -1348,6 +1367,11 @@ TYPE_PARSER(construct<OmpInitializerClause>(
     construct<OmpInitializerClause>(assignmentStmt) ||
     construct<OmpInitializerClause>(Parser<OmpInitializerProc>{})))
 
+// OpenMP 5.2: 7.5.4 Declare Variant directive
+TYPE_PARSER(sourced(
+    construct<OmpDeclareVariantDirective>(verbatim("DECLARE VARIANT"_tok),
+        "(" >> maybe(name / ":"), name / ")", Parser<OmpClauseList>{})))
+
 // 2.16 Declare Reduction Construct
 TYPE_PARSER(sourced(construct<OpenMPDeclareReductionConstruct>(
     verbatim("DECLARE REDUCTION"_tok),
@@ -1520,6 +1544,8 @@ TYPE_PARSER(
                             construct<OpenMPDeclarativeConstruct>(
                                 Parser<OpenMPDeclareTargetConstruct>{}) ||
                             construct<OpenMPDeclarativeConstruct>(
+                                Parser<OmpDeclareVariantDirective>{}) ||
+                            construct<OpenMPDeclarativeConstruct>(
                                 Parser<OpenMPDeclarativeAllocate>{}) ||
                             construct<OpenMPDeclarativeConstruct>(
                                 Parser<OpenMPRequiresConstruct>{}) ||
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 5ac5982..1ee9096 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2743,7 +2743,28 @@ public:
     Put("\n");
     EndOpenMP();
   }
-
+  void Unparse(const OmpAppendArgsClause::OmpAppendOp &x) {
+    Put("INTEROP(");
+    Walk(x.v, ",");
+    Put(")");
+  }
+  void Unparse(const OmpAppendArgsClause &x) { Walk(x.v, ","); }
+  void Unparse(const OmpAdjustArgsClause &x) {
+    Walk(std::get<OmpAdjustArgsClause::OmpAdjustOp>(x.t).v);
+    Put(":");
+    Walk(std::get<parser::OmpObjectList>(x.t));
+  }
+  void Unparse(const OmpDeclareVariantDirective &x) {
+    BeginOpenMP();
+    Word("!$OMP DECLARE VARIANT ");
+    Put("(");
+    Walk(std::get<std::optional<Name>>(x.t), ":");
+    Walk(std::get<Name>(x.t));
+    Put(")");
+    Walk(std::get<OmpClauseList>(x.t));
+    Put("\n");
+    EndOpenMP();
+  }
   void Unparse(const OpenMPInteropConstruct &x) {
     BeginOpenMP();
     Word("!$OMP INTEROP");
@@ -3042,6 +3063,7 @@ public:
   WALK_NESTED_ENUM(InquireSpec::LogVar, Kind)
   WALK_NESTED_ENUM(ProcedureStmt, Kind) // R1506
   WALK_NESTED_ENUM(UseStmt, ModuleNature) // R1410
+  WALK_NESTED_ENUM(OmpAdjustArgsClause::OmpAdjustOp, Value) // OMP adjustop
   WALK_NESTED_ENUM(OmpAtClause, ActionTime) // OMP at
   WALK_NESTED_ENUM(OmpBindClause, Binding) // OMP bind
   WALK_NESTED_ENUM(OmpProcBindClause, AffinityPolicy) // OMP proc_bind
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index dfaa0e0..1192886 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -1016,7 +1016,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
       }
     }
     if (dummyDataAttr == common::CUDADataAttr::Device &&
-        (dummyIsAssumedShape || dummyIsAssumedRank)) {
+        (dummyIsAssumedShape || dummyIsAssumedRank) &&
+        !dummy.ignoreTKR.test(common::IgnoreTKR::Contiguous)) {
       if (auto contig{evaluate::IsContiguous(actual, foldingContext,
               /*namedConstantSectionsAreContiguous=*/true,
               /*firstDimensionStride1=*/true)}) {
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index f654fe6..f17de42 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1619,6 +1619,16 @@ void OmpStructureChecker::Leave(const parser::OpenMPDeclareSimdConstruct &) {
   dirContext_.pop_back();
 }
 
+void OmpStructureChecker::Enter(const parser::OmpDeclareVariantDirective &x) {
+  const auto &dir{std::get<parser::Verbatim>(x.t)};
+  PushContextAndClauseSets(
+      dir.source, llvm::omp::Directive::OMPD_declare_variant);
+}
+
+void OmpStructureChecker::Leave(const parser::OmpDeclareVariantDirective &) {
+  dirContext_.pop_back();
+}
+
 void OmpStructureChecker::Enter(const parser::OpenMPDepobjConstruct &x) {
   const auto &dirName{std::get<parser::OmpDirectiveName>(x.v.t)};
   PushContextAndClauseSets(dirName.source, llvm::omp::Directive::OMPD_depobj);
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index 5ea2039..911a6bb 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -98,6 +98,8 @@ public:
   void Enter(const parser::OmpEndSectionsDirective &);
   void Leave(const parser::OmpEndSectionsDirective &);
 
+  void Enter(const parser::OmpDeclareVariantDirective &);
+  void Leave(const parser::OmpDeclareVariantDirective &);
   void Enter(const parser::OpenMPDeclareSimdConstruct &);
   void Leave(const parser::OpenMPDeclareSimdConstruct &);
   void Enter(const parser::OpenMPDeclarativeAllocate &);
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 620a37c..8b1caca 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -530,6 +530,12 @@ public:
     return false;
   }
 
+  bool Pre(const parser::OmpInReductionClause &x) {
+    auto &objects{std::get<parser::OmpObjectList>(x.t)};
+    ResolveOmpObjectList(objects, Symbol::Flag::OmpInReduction);
+    return false;
+  }
+
   bool Pre(const parser::OmpClause::Reduction &x) {
     const auto &objList{std::get<parser::OmpObjectList>(x.v.t)};
     ResolveOmpObjectList(objList, Symbol::Flag::OmpReduction);
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index e0550b3..b297969 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1511,6 +1511,25 @@ public:
     return true;
   }
 
+  bool Pre(const parser::OmpDeclareVariantDirective &x) {
+    AddOmpSourceRange(x.source);
+    auto FindSymbolOrError = [&](const parser::Name &procName) {
+      auto *symbol{FindSymbol(NonDerivedTypeScope(), procName)};
+      if (!symbol) {
+        context().Say(procName.source,
+            "Implicit subroutine declaration '%s' in !$OMP DECLARE VARIANT"_err_en_US,
+            procName.source);
+      }
+    };
+    auto &baseProcName = std::get<std::optional<parser::Name>>(x.t);
+    if (baseProcName) {
+      FindSymbolOrError(*baseProcName);
+    }
+    auto &varProcName = std::get<parser::Name>(x.t);
+    FindSymbolOrError(varProcName);
+    return true;
+  }
+
   bool Pre(const parser::OpenMPDeclareReductionConstruct &x) {
     AddOmpSourceRange(x.source);
     ProcessReductionSpecifier(
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 9bd90bc..f8a30da 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -17,9 +17,8 @@ implicit none
 
   ! Synchronization Functions
 
-  interface
-    attributes(device) subroutine syncthreads()
-    end subroutine
+  interface syncthreads
+    procedure :: syncthreads
   end interface
 
   interface
@@ -1614,4 +1613,9 @@ implicit none
     end function
   end interface
 
+contains
+
+  attributes(device) subroutine syncthreads()
+  end subroutine
+
 end module
diff --git a/flang/test/Driver/mcmodel.f90 b/flang/test/Driver/mcmodel.f90
index 12d90ec..8a03b17 100644
--- a/flang/test/Driver/mcmodel.f90
+++ b/flang/test/Driver/mcmodel.f90
@@ -1,5 +1,4 @@
 ! RUN: not %flang -### -c --target=i686 -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=ERR-MEDIUM %s
-! RUN: %flang --target=x86_64 -### -c -mcmodel=tiny %s 2>&1 | FileCheck --check-prefix=TINY %s
 ! RUN: %flang --target=x86_64 -### -c -mcmodel=small %s 2>&1 | FileCheck --check-prefix=SMALL %s
 ! RUN: %flang --target=x86_64 -### -S -mcmodel=kernel %s 2>&1 | FileCheck --check-prefix=KERNEL %s
 ! RUN: %flang --target=x86_64 -### -c -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
@@ -41,4 +40,3 @@
 
 ! AARCH64-PIC-LARGE: error: invalid argument '-mcmodel=large' only allowed with '-fno-pic'
 ! ERR-AARCH64_32: error: unsupported argument 'small' to option '-mcmodel=' for target 'aarch64_32-unknown-linux'
-
diff --git a/flang/test/Driver/predefined-macros-powerpc2.f90 b/flang/test/Driver/predefined-macros-powerpc2.f90
index 6e10235..6d235af 100644
--- a/flang/test/Driver/predefined-macros-powerpc2.f90
+++ b/flang/test/Driver/predefined-macros-powerpc2.f90
@@ -1,13 +1,25 @@
 ! Test predefined macro for PowerPC architecture
 
-! RUN: %flang_fc1 -triple ppc64le-unknown-linux -cpp -E %s | FileCheck %s
+! RUN: %flang_fc1 -triple ppc64le-unknown-linux -cpp -E %s | FileCheck %s -check-prefix=CHECK-LINUX
+! RUN: %flang_fc1 -triple powerpc-unknown-aix -cpp -E %s | FileCheck %s -check-prefix=CHECK-AIX32
+! RUN: %flang_fc1 -triple powerpc64-unknown-aix -cpp -E %s | FileCheck %s -check-prefix=CHECK-AIX64
 ! REQUIRES: target=powerpc{{.*}}
 
-! CHECK: integer :: var1 = 1
-! CHECK: integer :: var2 = 1
+! CHECK-LINUX: integer :: var1 = 1
+! CHECK-LINUX: integer :: var2 = 1
+! CHECK-AIX32: integer :: var1 = 1
+! CHECK-AIX32: integer :: var2 = 1
+! CHECK-AIX32: integer :: var3 = __64BIT__
+! CHECK-AIX64: integer :: var1 = 1
+! CHECK-AIX64: integer :: var2 = 1
+! CHECK-AIX64: integer :: var3 = 1
 
 #if defined(__linux__) && defined(__powerpc__)
   integer :: var1 = __powerpc__
   integer :: var2 = __linux__
+#elif defined(_AIX) && defined(__powerpc__)
+  integer :: var1 = __powerpc__
+  integer :: var2 = _AIX
+  integer :: var3 = __64BIT__
 #endif
 end program
diff --git a/flang/test/Fir/do_concurrent.fir b/flang/test/Fir/do_concurrent.fir
index 8e80ffb..4e55777 100644
--- a/flang/test/Fir/do_concurrent.fir
+++ b/flang/test/Fir/do_concurrent.fir
@@ -90,3 +90,22 @@ func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index,
 // CHECK:             fir.store %[[J_IV_CVT]] to %[[J]] : !fir.ref<i32>
 // CHECK:           }
 // CHECK:         }
+
+
+fir.local {type = local} @local_privatizer : i32
+
+// CHECK:   fir.local {type = local} @[[LOCAL_PRIV_SYM:local_privatizer]] : i32
+
+fir.local {type = local_init} @local_init_privatizer : i32 copy {
+^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>):
+    %0 = fir.load %arg0 : !fir.ref<i32>
+    fir.store %0 to %arg1 : !fir.ref<i32>
+    fir.yield(%arg1 : !fir.ref<i32>)
+}
+
+// CHECK:   fir.local {type = local_init} @[[LOCAL_INIT_PRIV_SYM:local_init_privatizer]] : i32
+// CHECK:   ^bb0(%[[ORIG_VAL:.*]]: !fir.ref<i32>, %[[LOCAL_VAL:.*]]: !fir.ref<i32>):
+// CHECK:      %[[ORIG_VAL_LD:.*]] = fir.load %[[ORIG_VAL]]
+// CHECK:      fir.store %[[ORIG_VAL_LD]] to %[[LOCAL_VAL]] : !fir.ref<i32>
+// CHECK:      fir.yield(%[[LOCAL_VAL]] : !fir.ref<i32>)
+// CHECK:   }
diff --git a/flang/test/Fir/invalid.fir b/flang/test/Fir/invalid.fir
index f9f5e26..7332273 100644
--- a/flang/test/Fir/invalid.fir
+++ b/flang/test/Fir/invalid.fir
@@ -1,5 +1,3 @@
-
-
 // RUN: fir-opt -split-input-file -verify-diagnostics --strict-fir-volatile-verifier %s
 
 // expected-error@+1{{custom op 'fir.string_lit' must have character type}}
@@ -1311,3 +1309,79 @@ func.func @bad_convert_volatile6(%arg0: !fir.ref<i32>) -> !fir.ref<i64> {
   %0 = fir.volatile_cast %arg0 : (!fir.ref<i32>) -> !fir.ref<i64>
   return %0 : !fir.ref<i64>
 }
+
+// -----
+
+fir.local {type = local} @x.localizer : i32 init {
+^bb0(%arg0: i32, %arg1: i32):
+  %0 = arith.constant 0.0 : f32
+  // expected-error @below {{Invalid yielded value. Expected type: 'i32', got: 'f32'}}
+  fir.yield(%0 : f32)
+}
+
+// -----
+
+// expected-error @below {{Region argument type mismatch: got 'f32' expected 'i32'.}}
+fir.local {type = local} @x.localizer : i32 init {
+^bb0(%arg0: i32, %arg1: f32):
+  fir.yield
+}
+
+// -----
+
+fir.local {type = local} @x.localizer : f32 init {
+^bb0(%arg0: f32, %arg1: f32):
+  fir.yield(%arg0: f32)
+} dealloc {
+^bb0(%arg0: f32):
+  // expected-error @below {{Did not expect any values to be yielded.}}
+  fir.yield(%arg0 : f32)
+}
+
+// -----
+
+fir.local {type = local} @x.localizer : i32 init {
+^bb0(%arg0: i32, %arg1: i32):
+  // expected-error @below {{expected exit block terminator to be an `fir.yield` op.}}
+  fir.unreachable
+}
+
+// -----
+
+// expected-error @below {{`init`: expected 2 region arguments, got: 1}}
+fir.local {type = local} @x.localizer : f32 init {
+^bb0(%arg0: f32):
+  fir.yield(%arg0 : f32)
+}
+
+// -----
+
+// expected-error @below {{`copy`: expected 2 region arguments, got: 1}}
+fir.local {type = local_init} @x.privatizer : f32 copy {
+^bb0(%arg0: f32):
+  fir.yield(%arg0 : f32)
+}
+
+// -----
+
+// expected-error @below {{`dealloc`: expected 1 region arguments, got: 2}}
+fir.local {type = local} @x.localizer : f32 dealloc {
+^bb0(%arg0: f32, %arg1: f32):
+  fir.yield
+}
+
+// -----
+
+// expected-error @below {{`local` specifiers do not require a `copy` region.}}
+fir.local {type = local} @x.localizer : f32 copy {
+^bb0(%arg0: f32, %arg1 : f32):
+  fir.yield(%arg0 : f32)
+}
+
+// -----
+
+// expected-error @below {{`local_init` specifiers require at least a `copy` region.}}
+fir.local {type = local_init} @x.localizer : f32 init {
+^bb0(%arg0: f32, %arg1: f32):
+  fir.yield(%arg0 : f32)
+}
diff --git a/flang/test/Lower/OpenMP/Todo/declare-variant.f90 b/flang/test/Lower/OpenMP/Todo/declare-variant.f90
new file mode 100644
index 0000000..5719ef3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/declare-variant.f90
@@ -0,0 +1,17 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: OmpDeclareVariantDirective
+
+subroutine sb1
+  integer :: x
+  x = 1
+  call sub(x)
+contains
+  subroutine vsub (v1)
+    integer, value :: v1
+  end
+  subroutine sub (v1)
+    !$omp declare variant(vsub), match(construct={dispatch})
+    integer, value :: v1
+  end
+end subroutine
diff --git a/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/task-inreduction.f90
deleted file mode 100644
index aeed680..0000000
--- a/flang/test/Lower/OpenMP/Todo/task-inreduction.f90
+++ /dev/null
@@ -1,15 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!===============================================================================
-! `mergeable` clause
-!===============================================================================
-
-! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TASK construct
-subroutine omp_task_in_reduction()
-  integer i
-  i = 0
-  !$omp task in_reduction(+:i)
-  i = i + 1
-  !$omp end task
-end subroutine omp_task_in_reduction
diff --git a/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 b/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90
deleted file mode 100644
index 1cb471d..0000000
--- a/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90
+++ /dev/null
@@ -1,10 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s
-
-! CHECK: not yet implemented: Unhandled clause TASK_REDUCTION in TASKGROUP construct
-subroutine omp_taskgroup_task_reduction
-  integer :: res
-  !$omp taskgroup task_reduction(+:res)
-  res = res + 1
-  !$omp end taskgroup
-end subroutine omp_taskgroup_task_reduction
diff --git a/flang/test/Lower/OpenMP/atomic-update.f90 b/flang/test/Lower/OpenMP/atomic-update.f90
index 31bf447..257ae8f 100644
--- a/flang/test/Lower/OpenMP/atomic-update.f90
+++ b/flang/test/Lower/OpenMP/atomic-update.f90
@@ -20,6 +20,8 @@ program OmpAtomicUpdate
 !CHECK: %[[VAL_C_DECLARE:.*]]:2 = hlfir.declare %[[VAL_C_ADDRESS]] {{.*}}
 !CHECK: %[[VAL_D_ADDRESS:.*]] = fir.address_of(@_QFEd) : !fir.ref<i32>
 !CHECK: %[[VAL_D_DECLARE:.*]]:2 = hlfir.declare %[[VAL_D_ADDRESS]] {{.}}
+!CHECK: %[[VAL_G_ADDRESS:.*]] = fir.alloca complex<f32> {bindc_name = "g", uniq_name = "_QFEg"}
+!CHECK: %[[VAL_G_DECLARE:.*]]:2 = hlfir.declare %[[VAL_G_ADDRESS]] {uniq_name = "_QFEg"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
 !CHECK: %[[VAL_i1_ALLOCA:.*]] = fir.alloca i8 {bindc_name = "i1", uniq_name = "_QFEi1"}
 !CHECK: %[[VAL_i1_DECLARE:.*]]:2 = hlfir.declare %[[VAL_i1_ALLOCA]] {{.*}}
 !CHECK: %[[VAL_c5:.*]] = arith.constant 5 : index
@@ -40,6 +42,7 @@ program OmpAtomicUpdate
     integer, target :: c, d
     integer(1) :: i1
     integer, dimension(5) :: k
+    complex :: g
 
 !CHECK: %[[EMBOX:.*]] = fir.embox %[[VAL_C_DECLARE]]#0 : (!fir.ref<i32>) -> !fir.box<!fir.ptr<i32>>
 !CHECK: fir.store %[[EMBOX]] to %[[VAL_A_DECLARE]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
@@ -200,4 +203,19 @@ program OmpAtomicUpdate
 !CHECK:  }
   !$omp atomic update
     x = x + sum([ (y+2, y=1, z) ])
+
+!CHECK: %[[LOAD:.*]] = fir.load %[[VAL_G_DECLARE]]#0 : !fir.ref<complex<f32>>
+!CHECK: omp.atomic.update %[[VAL_W_DECLARE]]#0 : !fir.ref<i32> {
+!CHECK: ^bb0(%[[ARG:.*]]: i32):
+!CHECK: %[[CVT:.*]] = fir.convert %[[ARG]] : (i32) -> f32
+!CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK: %[[UNDEF:.*]] = fir.undefined complex<f32>
+!CHECK: %[[IDX1:.*]] = fir.insert_value %[[UNDEF]], %[[CVT]], [0 : index] : (complex<f32>, f32) -> complex<f32>
+!CHECK: %[[IDX2:.*]] = fir.insert_value %[[IDX1]], %[[CST]], [1 : index] : (complex<f32>, f32) -> complex<f32>
+!CHECK: %[[ADD:.*]] = fir.addc %[[IDX2]], %[[LOAD]] {fastmath = #arith.fastmath<contract>} : complex<f32>
+!CHECK: %[[EXT:.*]] = fir.extract_value %[[ADD]], [0 : index] : (complex<f32>) -> f32
+!CHECK: %[[RESULT:.*]] = fir.convert %[[EXT]] : (f32) -> i32
+!CHECK: omp.yield(%[[RESULT]] : i32)
+  !$omp atomic update
+    w = w + g  
 end program OmpAtomicUpdate
diff --git a/flang/test/Lower/OpenMP/omp-declare-target-program-var.f90 b/flang/test/Lower/OpenMP/omp-declare-target-program-var.f90
index 20538ff..d18f42a 100644
--- a/flang/test/Lower/OpenMP/omp-declare-target-program-var.f90
+++ b/flang/test/Lower/OpenMP/omp-declare-target-program-var.f90
@@ -6,7 +6,7 @@ PROGRAM main
     ! HOST-DAG: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFEi"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
     REAL :: I
     ! ALL-DAG: fir.global internal @_QFEi {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32 {
-    ! ALL-DAG: %[[UNDEF:.*]] = fir.undefined f32
+    ! ALL-DAG: %[[UNDEF:.*]] = fir.zero_bits f32
     ! ALL-DAG: fir.has_value %[[UNDEF]] : f32
     ! ALL-DAG: }
     !$omp declare target(I)
diff --git a/flang/test/Lower/OpenMP/task-inreduction.f90 b/flang/test/Lower/OpenMP/task-inreduction.f90
new file mode 100644
index 0000000..41657d3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/task-inreduction.f90
@@ -0,0 +1,35 @@
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: omp.declare_reduction
+!CHECK-SAME:  @[[RED_I32_NAME:.*]] : i32 init {
+!CHECK:       ^bb0(%{{.*}}: i32):
+!CHECK:         %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK:         omp.yield(%[[C0_1]] : i32)
+!CHECK:       } combiner {
+!CHECK:       ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
+!CHECK:          %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+!CHECK:          omp.yield(%[[RES]] : i32)
+!CHECK:         }
+
+!CHECK-LABEL:  func.func @_QPomp_task_in_reduction() {
+!                [...]
+!CHECK:          omp.task in_reduction(@[[RED_I32_NAME]] %[[VAL_1:.*]]#0  -> %[[ARG0]] : !fir.ref<i32>) {
+!CHECK:            %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]]
+!CHECK-SAME:       {uniq_name = "_QFomp_task_in_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:            %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
+!CHECK:            %[[VAL_6:.*]] = arith.constant 1 : i32
+!CHECK:            %[[VAL_7:.*]] = arith.addi %[[VAL_5]], %[[VAL_6]] : i32
+!CHECK:            hlfir.assign %[[VAL_7]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+!CHECK:            omp.terminator
+!CHECK:           }
+!CHECK:           return
+!CHECK:          }
+
+subroutine omp_task_in_reduction()
+   integer i
+   i = 0
+   !$omp task in_reduction(+:i)
+   i = i + 1
+   !$omp end task
+end subroutine omp_task_in_reduction
diff --git a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
new file mode 100644
index 0000000..18d4521
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
@@ -0,0 +1,49 @@
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+! CHECK-LABEL:  omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc {
+!                 [...]
+! CHECK:          omp.yield
+! CHECK-LABEL:  } init {
+!                 [...]
+! CHECK:          omp.yield
+! CHECK-LABEL:  } combiner {
+!                 [...]
+! CHECK:          omp.yield
+! CHECK-LABEL:  }  cleanup {
+!                  [...]
+! CHECK:           omp.yield
+! CHECK:  }
+
+! CHECK-LABEL:  func.func @_QPtask_reduction
+! CHECK-SAME:  (%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
+! CHECK:          %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:          %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]]
+! CHECK-SAME:      {uniq_name = "_QFtask_reductionEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:          omp.parallel {
+! CHECK:            %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
+! CHECK:            fir.store %[[VAL_2]]#1 to %[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+! CHECK:            omp.taskgroup task_reduction(byref @add_reduction_byref_box_Uxf32 %[[VAL_3]] -> %[[VAL_4:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>) {
+! CHECK:              %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] 
+! CHECK-SAME:         {uniq_name = "_QFtask_reductionEx"} : (!fir.ref<!fir.box<!fir.array<?xf32>>>) -> (!fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.ref<!fir.box<!fir.array<?xf32>>>)
+! CHECK:              omp.task in_reduction(byref @add_reduction_byref_box_Uxf32 %[[VAL_5]]#0 -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<?xf32>>>) {
+!                       [...]
+! CHECK:                omp.terminator
+! CHECK:               }
+! CHECK:               omp.terminator
+! CHECK:              }
+! CHECK:              omp.terminator
+! CHECK:             }
+! CHECK:             return
+! CHECK:           }
+
+subroutine task_reduction(x)
+   real, dimension(:) :: x
+   !$omp parallel
+   !$omp taskgroup task_reduction(+:x)
+   !$omp task in_reduction(+:x)
+   x = x + 1
+   !$omp end task
+   !$omp end taskgroup
+   !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/taskgroup-task_reduction01.f90 b/flang/test/Lower/OpenMP/taskgroup-task_reduction01.f90
new file mode 100644
index 0000000..be4d319
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskgroup-task_reduction01.f90
@@ -0,0 +1,36 @@
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: omp.declare_reduction
+!CHECK-SAME:  @[[RED_I32_NAME:.*]] : i32 init {
+!CHECK:       ^bb0(%{{.*}}: i32):
+!CHECK:         %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK:         omp.yield(%[[C0_1]] : i32)
+!CHECK:       } combiner {
+!CHECK:      ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
+!CHECK:        %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+!CHECK:        omp.yield(%[[RES]] : i32)
+!CHECK:       }
+
+!CHECK-LABEL: func.func @_QPomp_taskgroup_task_reduction() {
+!CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_taskgroup_task_reductionEres"}
+!CHECK:         %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFomp_taskgroup_task_reductionEres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:         omp.taskgroup task_reduction(@[[RED_I32_NAME]]  %[[VAL_1]]#0 -> %[[VAL_2:.*]] : !fir.ref<i32>) {
+!CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] 
+!CHECK-SAME:      {uniq_name = "_QFomp_taskgroup_task_reductionEres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
+!CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
+!CHECK:           %[[VAL_6:.*]] = arith.addi %[[VAL_4]], %[[VAL_5]] : i32
+!CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+!CHECK:           omp.terminator
+!CHECK:          }
+!CHECK:        return
+!CHECK:       }
+
+
+subroutine omp_taskgroup_task_reduction()
+   integer :: res
+   !$omp taskgroup task_reduction(+:res)
+   res = res + 1
+   !$omp end taskgroup
+end subroutine
diff --git a/flang/test/Lower/OpenMP/taskgroup-task_reduction02.f90 b/flang/test/Lower/OpenMP/taskgroup-task_reduction02.f90
new file mode 100644
index 0000000..ed91e58
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskgroup-task_reduction02.f90
@@ -0,0 +1,37 @@
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: omp.declare_reduction
+!CHECK-SAME:  @[[RED_I32_NAME:.*]] : i32 init {
+!CHECK:       ^bb0(%{{.*}}: i32):
+!CHECK:         %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK:         omp.yield(%[[C0_1]] : i32)
+!CHECK:       } combiner {
+!CHECK:       ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
+!CHECK:         %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+!CHECK:         omp.yield(%[[RES]] : i32)
+!CHECK:        }
+
+!CHECK-LABEL:  func.func @_QPin_reduction() {
+!                [...]
+!CHECK:          omp.taskgroup task_reduction(@[[RED_I32_NAME]] %[[VAL_1:.*]]#0 -> %[[VAL_3:.*]] : !fir.ref<i32>) {
+!CHECK:             %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFin_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:             omp.task in_reduction(@[[RED_I32_NAME]] %[[VAL_4]]#0 -> %[[VAL_5:.*]] : !fir.ref<i32>) {
+!CHECK:               %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFin_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!                     [...]
+!CHECK:               omp.terminator
+!CHECK:             }
+!CHECK:             omp.terminator
+!CHECK:          }
+!CHECK:          return
+!CHECK:         }
+
+subroutine in_reduction()
+   integer :: x
+   x = 0
+   !$omp taskgroup task_reduction(+:x)
+   !$omp task in_reduction(+:x)
+   x = x + 1
+   !$omp end task
+   !$omp end taskgroup
+end subroutine
diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90
index 546d492..5e54cef 100644
--- a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90
@@ -27,7 +27,7 @@
 !CHECK:   return
 !CHECK: }
 !CHECK: fir.global internal @_QFEa : i32 {
-!CHECK:   %[[A:.*]] = fir.undefined i32
+!CHECK:   %[[A:.*]] = fir.zero_bits i32
 !CHECK:   fir.has_value %[[A]] : i32
 !CHECK: }
 
diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90
index 22ee51f..21547b4 100644
--- a/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90
@@ -27,7 +27,7 @@
 !CHECK:   return
 !CHECK: }
 !CHECK: fir.global internal @_QFEa : i32 {
-!CHECK:   %[[A:.*]] = fir.undefined i32
+!CHECK:   %[[A:.*]] = fir.zero_bits i32
 !CHECK:   fir.has_value %[[A]] : i32
 !CHECK: }
 
diff --git a/flang/test/Lower/OpenMP/threadprivate-lenparams.f90 b/flang/test/Lower/OpenMP/threadprivate-lenparams.f90
new file mode 100644
index 0000000..a220db2
--- /dev/null
+++ b/flang/test/Lower/OpenMP/threadprivate-lenparams.f90
@@ -0,0 +1,22 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! Regression test for https://github.com/llvm/llvm-project/issues/108136
+
+character(:), pointer :: c
+character(2), pointer :: c2
+!$omp threadprivate(c, c2)
+end
+
+! CHECK-LABEL:   fir.global internal @_QFEc : !fir.box<!fir.ptr<!fir.char<1,?>>> {
+! CHECK:           %[[VAL_0:.*]] = fir.zero_bits !fir.ptr<!fir.char<1,?>>
+! CHECK:           %[[VAL_1:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_2:.*]] = fir.embox %[[VAL_0]] typeparams %[[VAL_1]] : (!fir.ptr<!fir.char<1,?>>, index) -> !fir.box<!fir.ptr<!fir.char<1,?>>>
+! CHECK:           fir.has_value %[[VAL_2]] : !fir.box<!fir.ptr<!fir.char<1,?>>>
+! CHECK:         }
+
+! CHECK-LABEL:   fir.global internal @_QFEc2 : !fir.box<!fir.ptr<!fir.char<1,2>>> {
+! CHECK:           %[[VAL_0:.*]] = fir.zero_bits !fir.ptr<!fir.char<1,2>>
+! CHECK:           %[[VAL_1:.*]] = fir.embox %[[VAL_0]] : (!fir.ptr<!fir.char<1,2>>) -> !fir.box<!fir.ptr<!fir.char<1,2>>>
+! CHECK:           fir.has_value %[[VAL_1]] : !fir.box<!fir.ptr<!fir.char<1,2>>>
+! CHECK:         }
+
diff --git a/flang/test/Lower/OpenMP/threadprivate-non-global.f90 b/flang/test/Lower/OpenMP/threadprivate-non-global.f90
index 0b9abd1..508a67de 100644
--- a/flang/test/Lower/OpenMP/threadprivate-non-global.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-non-global.f90
@@ -85,19 +85,19 @@ program test
 !CHECK-DAG:   fir.has_value [[E1]] : !fir.box<!fir.heap<f32>>
 !CHECK-DAG: }
 !CHECK-DAG: fir.global internal @_QFEw : complex<f32> {
-!CHECK-DAG:   [[Z2:%.*]] = fir.undefined complex<f32>
+!CHECK-DAG:   [[Z2:%.*]] = fir.zero_bits complex<f32>
 !CHECK-DAG:   fir.has_value [[Z2]] : complex<f32>
 !CHECK-DAG: }
 !CHECK-DAG: fir.global internal @_QFEx : i32 {
-!CHECK-DAG:   [[Z3:%.*]] = fir.undefined i32
+!CHECK-DAG:   [[Z3:%.*]] = fir.zero_bits i32
 !CHECK-DAG:   fir.has_value [[Z3]] : i32
 !CHECK-DAG: }
 !CHECK-DAG: fir.global internal @_QFEy : f32 {
-!CHECK-DAG:   [[Z4:%.*]] = fir.undefined f32
+!CHECK-DAG:   [[Z4:%.*]] = fir.zero_bits f32
 !CHECK-DAG:   fir.has_value [[Z4]] : f32
 !CHECK-DAG: }
 !CHECK-DAG: fir.global internal @_QFEz : !fir.logical<4> {
-!CHECK-DAG:   [[Z5:%.*]] = fir.undefined !fir.logical<4>
+!CHECK-DAG:   [[Z5:%.*]] = fir.zero_bits !fir.logical<4>
 !CHECK-DAG:   fir.has_value [[Z5]] : !fir.logical<4>
 !CHECK-DAG: }
 end
diff --git a/flang/test/Lower/do_concurrent.f90 b/flang/test/Lower/do_concurrent.f90
index ef93d2d..cc113f5 100644
--- a/flang/test/Lower/do_concurrent.f90
+++ b/flang/test/Lower/do_concurrent.f90
@@ -14,6 +14,9 @@ subroutine sub1(n)
    implicit none
    integer :: n, m, i, j, k
    integer, dimension(n) :: a
+!CHECK: %[[N_DECL:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{.*}} {uniq_name = "_QFsub1En"}
+!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub1Ea"}
+
 !CHECK: %[[LB1:.*]] = arith.constant 1 : i32
 !CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
 !CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref<i32>
@@ -29,10 +32,30 @@ subroutine sub1(n)
 !CHECK: %[[UB3:.*]] = arith.constant 10 : i32
 !CHECK: %[[UB3_CVT:.*]] = fir.convert %[[UB3]] : (i32) -> index
 
-!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
-!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
-!CHECK: fir.do_loop %{{.*}} = %[[LB3_CVT]] to %[[UB3_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_concurrent
+!CHECK:   %[[I:.*]] = fir.alloca i32 {bindc_name = "i"}
+!CHECK:   %[[I_DECL:.*]]:2 = hlfir.declare %[[I]]
+!CHECK:   %[[J:.*]] = fir.alloca i32 {bindc_name = "j"}
+!CHECK:   %[[J_DECL:.*]]:2 = hlfir.declare %[[J]]
+!CHECK:   %[[K:.*]] = fir.alloca i32 {bindc_name = "k"}
+!CHECK:   %[[K_DECL:.*]]:2 = hlfir.declare %[[K]]
+
+!CHECK:   fir.do_concurrent.loop (%[[I_IV:.*]], %[[J_IV:.*]], %[[K_IV:.*]]) =
+!CHECK-SAME:                     (%[[LB1_CVT]], %[[LB2_CVT]], %[[LB3_CVT]]) to
+!CHECK-SAME:                     (%[[UB1_CVT]], %[[UB2_CVT]], %[[UB3_CVT]]) step
+!CHECK-SAME:                     (%{{.*}}, %{{.*}}, %{{.*}}) {
+!CHECK:       %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
+!CHECK:       fir.store %[[I_IV_CVT]] to %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK:       %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+!CHECK:       fir.store %[[J_IV_CVT]] to %[[J_DECL]]#0 : !fir.ref<i32>
+!CHECK:       %[[K_IV_CVT:.*]] = fir.convert %[[K_IV]] : (index) -> i32
+!CHECK:       fir.store %[[K_IV_CVT]] to %[[K_DECL]]#0 : !fir.ref<i32>
 
+!CHECK:       %[[N_VAL:.*]] = fir.load %[[N_DECL]]#0 : !fir.ref<i32>
+!CHECK:       %[[I_VAL:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK:       %[[I_VAL_CVT:.*]] = fir.convert %[[I_VAL]] : (i32) -> i64
+!CHECK:       %[[A_ELEM:.*]] = hlfir.designate %[[A_DECL]]#0 (%[[I_VAL_CVT]])
+!CHECK:       hlfir.assign %[[N_VAL]] to %[[A_ELEM]] : i32, !fir.ref<i32>
    do concurrent(i=1:n, j=1:bar(n*m, n/m), k=5:10)
       a(i) = n
    end do
@@ -45,14 +68,17 @@ subroutine sub2(n)
    integer, dimension(n) :: a
 !CHECK: %[[LB1:.*]] = arith.constant 1 : i32
 !CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
-!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref<i32>
+!CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref<i32>
 !CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index
-!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_concurrent
+!CHECK:   fir.do_concurrent.loop (%{{.*}}) = (%[[LB1_CVT]]) to (%[[UB1_CVT]]) step (%{{.*}})
+
 !CHECK: %[[LB2:.*]] = arith.constant 1 : i32
 !CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index
 !CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
 !CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index
-!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_concurrent
+!CHECK:   fir.do_concurrent.loop (%{{.*}}) = (%[[LB2_CVT]]) to (%[[UB2_CVT]]) step (%{{.*}})
    do concurrent(i=1:n)
       do concurrent(j=1:bar(n*m, n/m))
          a(i) = n
@@ -60,7 +86,6 @@ subroutine sub2(n)
    end do
 end subroutine
 
-
 !CHECK-LABEL: unstructured
 subroutine unstructured(inner_step)
   integer(4) :: i, j, inner_step
diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index 7652e4f..207704a 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -29,7 +29,7 @@ end subroutine
 ! CHECK-SAME:                           %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>>> {fir.bindc_name = "p"}) {
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>>>
 ! CHECK:           %[[VAL_7:.*]] = fir.box_elesize %[[VAL_6]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>>) -> index
-! CHECK:           fir.do_loop
+! CHECK:           fir.do_concurrent.loop
 ! CHECK:             %[[VAL_16:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>> {bindc_name = "p", pinned, uniq_name = "_QFtest_ptrEp"}
 ! CHECK:             %[[VAL_17:.*]] = fir.zero_bits !fir.ptr<!fir.array<?x!fir.char<1,?>>>
 ! CHECK:             %[[VAL_18:.*]] = arith.constant 0 : index
@@ -43,7 +43,7 @@ end subroutine
 ! CHECK:         }
 
 ! CHECK-LABEL:   func.func @_QPtest_default_init(
-! CHECK:           fir.do_loop
+! CHECK:           fir.do_concurrent.loop
 ! CHECK:             %[[VAL_26:.*]] = fir.alloca !fir.type<_QFtest_default_initTt{i:i32}> {bindc_name = "a", pinned, uniq_name = "_QFtest_default_initEa"}
 ! CHECK:             %[[VAL_27:.*]] = fir.embox %[[VAL_26]] : (!fir.ref<!fir.type<_QFtest_default_initTt{i:i32}>>) -> !fir.box<!fir.type<_QFtest_default_initTt{i:i32}>>
 ! CHECK:             %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.box<!fir.type<_QFtest_default_initTt{i:i32}>>) -> !fir.box<none>
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index ea65ba3..60df27a 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -2,15 +2,6 @@
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
-  ! CHECK: %[[VAL_2:.*]] = fir.alloca i16 {bindc_name = "i"}
-  ! CHECK: %[[VAL_3:.*]] = fir.alloca i16 {bindc_name = "i"}
-  ! CHECK: %[[VAL_4:.*]] = fir.alloca i16 {bindc_name = "i"}
-  ! CHECK: %[[VAL_5:.*]] = fir.alloca i8 {bindc_name = "k"}
-  ! CHECK: %[[VAL_6:.*]] = fir.alloca i8 {bindc_name = "j"}
-  ! CHECK: %[[VAL_7:.*]] = fir.alloca i8 {bindc_name = "i"}
-  ! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "k"}
-  ! CHECK: %[[VAL_9:.*]] = fir.alloca i32 {bindc_name = "j"}
-  ! CHECK: %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i"}
   ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.array<5x5x5xi32> {bindc_name = "a", uniq_name = "_QFloop_testEa"}
   ! CHECK: %[[VAL_12:.*]] = fir.alloca i32 {bindc_name = "asum", uniq_name = "_QFloop_testEasum"}
   ! CHECK: %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFloop_testEi"}
@@ -25,7 +16,7 @@ subroutine loop_test
   j = 200
   k = 300
 
-  ! CHECK-COUNT-3: fir.do_loop {{.*}} unordered
+  ! CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}, %{{.*}}) = {{.*}}
   do concurrent (i=1:5, j=1:5, k=1:5) ! shared(a)
     ! CHECK: fir.coordinate_of
     a(i,j,k) = 0
@@ -33,7 +24,7 @@ subroutine loop_test
   ! CHECK: fir.call @_FortranAioBeginExternalListOutput
   print*, 'A:', i, j, k
 
-  ! CHECK-COUNT-3: fir.do_loop {{.*}} unordered
+  ! CHECK: fir.do_concurrent.loop (%{{.*}}, %{{.*}}, %{{.*}}) = {{.*}}
   ! CHECK: fir.if
   do concurrent (integer(1)::i=1:5, j=1:5, k=1:5, i.ne.j .and. k.ne.3) shared(a)
     ! CHECK-COUNT-2: fir.coordinate_of
@@ -53,7 +44,7 @@ subroutine loop_test
   ! CHECK: fir.call @_FortranAioBeginExternalListOutput
   print*, 'B:', i, j, k, '-', asum
 
-  ! CHECK: fir.do_loop {{.*}} unordered
+  ! CHECK: fir.do_concurrent.loop (%{{.*}}) = {{.*}}
   ! CHECK-COUNT-2: fir.if
   do concurrent (integer(2)::i=1:5, i.ne.3)
     if (i.eq.2 .or. i.eq.4) goto 5 ! fir.if
@@ -62,7 +53,7 @@ subroutine loop_test
   5 continue
   enddo
 
-  ! CHECK: fir.do_loop {{.*}} unordered
+  ! CHECK: fir.do_concurrent.loop (%{{.*}}) = {{.*}}
   ! CHECK-COUNT-2: fir.if
   do concurrent (integer(2)::i=1:5, i.ne.3)
     if (i.eq.2 .or. i.eq.4) then ! fir.if
@@ -93,10 +84,6 @@ end subroutine loop_test
 
 ! CHECK-LABEL: c.func @_QPlis
 subroutine lis(n)
-  ! CHECK-DAG: fir.alloca i32 {bindc_name = "m"}
-  ! CHECK-DAG: fir.alloca i32 {bindc_name = "j"}
-  ! CHECK-DAG: fir.alloca i32 {bindc_name = "i"}
-  ! CHECK-DAG: fir.alloca i8 {bindc_name = "i"}
   ! CHECK-DAG: fir.alloca i32 {bindc_name = "j", uniq_name = "_QFlisEj"}
   ! CHECK-DAG: fir.alloca i32 {bindc_name = "k", uniq_name = "_QFlisEk"}
   ! CHECK-DAG: fir.alloca !fir.box<!fir.ptr<!fir.array<?x?x?xi32>>> {bindc_name = "p", uniq_name = "_QFlisEp"}
@@ -117,8 +104,8 @@ subroutine lis(n)
   ! CHECK:     }
   r = 0
 
-  ! CHECK:     fir.do_loop %arg1 = %{{.*}} to %{{.*}} step %{{.*}} unordered {
-  ! CHECK:       fir.do_loop %arg2 = %{{.*}} to %{{.*}} step %c1{{.*}} iter_args(%arg3 = %{{.*}}) -> (index, i32) {
+  ! CHECK:     fir.do_concurrent {
+  ! CHECK:       fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) {
   ! CHECK:       }
   ! CHECK:     }
   do concurrent (integer(kind=1)::i=n:1:-1)
@@ -128,16 +115,18 @@ subroutine lis(n)
     enddo
   enddo
 
-  ! CHECK:     fir.do_loop %arg1 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered {
-  ! CHECK:       fir.do_loop %arg2 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered {
+  ! CHECK:       fir.do_concurrent.loop (%{{.*}}, %{{.*}}) = (%{{.*}}, %{{.*}}) to (%{{.*}}, %{{.*}}) step (%{{.*}}, %{{.*}}) {
   ! CHECK:         fir.if %{{.*}} {
   ! CHECK:           %[[V_95:[0-9]+]] = fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "t", pinned, uniq_name = "_QFlisEt"}
   ! CHECK:           %[[V_96:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?x?xi32>>> {bindc_name = "p", pinned, uniq_name = "_QFlisEp"}
   ! CHECK:           fir.store %{{.*}} to %[[V_96]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>
   ! CHECK:           fir.do_loop %arg3 = %{{.*}} to %{{.*}} step %c1{{.*}} iter_args(%arg4 = %{{.*}}) -> (index, i32) {
-  ! CHECK:             fir.do_loop %arg5 = %{{.*}} to %{{.*}} step %c1{{.*}} unordered {
-  ! CHECK:               fir.load %[[V_96]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>
-  ! CHECK:               fir.convert %[[V_95]] : (!fir.ref<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+  ! CHECK:             fir.do_concurrent {
+  ! CHECK:               fir.alloca i32 {bindc_name = "m"}
+  ! CHECK:               fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) {
+  ! CHECK:                 fir.load %[[V_96]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>
+  ! CHECK:                 fir.convert %[[V_95]] : (!fir.ref<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+  ! CHECK:               }
   ! CHECK:             }
   ! CHECK:           }
   ! CHECK:           fir.convert %[[V_95]] : (!fir.ref<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?xi32>>
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 78f39e1..84db197 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -12,9 +12,7 @@ subroutine loop_test
 
   ! CHECK: %[[VAL_0:.*]] = fir.alloca f32 {bindc_name = "m", uniq_name = "_QFloop_testEm"}
   ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFloop_testEsum) : !fir.ref<i32>
-  ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr<add> -> %[[VAL_1:.*]] : !fir.ref<i32>, #fir.reduce_attr<max> -> %[[VAL_0:.*]] : !fir.ref<f32>) {
-  ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr<add> -> %[[VAL_1:.*]] : !fir.ref<i32>, #fir.reduce_attr<max> -> %[[VAL_0:.*]] : !fir.ref<f32>) {
-  ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr<add> -> %[[VAL_1:.*]] : !fir.ref<i32>, #fir.reduce_attr<max> -> %[[VAL_0:.*]] : !fir.ref<f32>) {
+  ! CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) reduce(#fir.reduce_attr<add> -> %[[VAL_1:.*]] : !fir.ref<i32>, #fir.reduce_attr<max> -> %[[VAL_0:.*]] : !fir.ref<f32>) {
   do concurrent (i=1:5, j=1:5, k=1:5) local(tmp) reduce(+:sum) reduce(max:m)
     tmp = i + j + k
     sum = tmp + sum
diff --git a/flang/test/Lower/nsw.f90 b/flang/test/Lower/nsw.f90
index 4ee9e5d..2ec1efb 100644
--- a/flang/test/Lower/nsw.f90
+++ b/flang/test/Lower/nsw.f90
@@ -139,7 +139,6 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPloop_params3(
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 2 : i32
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_9:.*]] = fir.declare %{{.*}}i"} : (!fir.ref<i32>) -> !fir.ref<i32>
 ! CHECK:           %[[VAL_11:.*]] = fir.declare %{{.*}}lb"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
 ! CHECK:           %[[VAL_12:.*]] = fir.declare %{{.*}}ub"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
 ! CHECK:           %[[VAL_14:.*]] = fir.declare %{{.*}}i"} : (!fir.ref<i32>) -> !fir.ref<i32>
@@ -153,4 +152,6 @@ end subroutine
 ! CHECK:           %[[VAL_31:.*]] = fir.load %[[VAL_15]] : !fir.ref<i32>
 ! CHECK:           %[[VAL_32:.*]] = arith.muli %[[VAL_31]], %[[VAL_4]] overflow<nsw> : i32
 ! CHECK:           %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> index
-! CHECK:           fir.do_loop %[[VAL_34:.*]] = %[[VAL_28]] to %[[VAL_30]] step %[[VAL_33]] unordered {
+! CHECK:           fir.do_concurrent {
+! CHECK:             %[[VAL_9:.*]] = fir.declare %{{.*}}i"} : (!fir.ref<i32>) -> !fir.ref<i32>
+! CHECK:             fir.do_concurrent.loop (%[[VAL_34:.*]]) = (%[[VAL_28]]) to (%[[VAL_30]]) step (%[[VAL_33]]) {
diff --git a/flang/test/Parser/OpenMP/declare-variant.f90 b/flang/test/Parser/OpenMP/declare-variant.f90
new file mode 100644
index 0000000..1b97733
--- /dev/null
+++ b/flang/test/Parser/OpenMP/declare-variant.f90
@@ -0,0 +1,104 @@
+! RUN: %flang_fc1 -fdebug-unparse-no-sema -fopenmp %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 -fdebug-dump-parse-tree-no-sema -fopenmp %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+subroutine sub0
+!CHECK: !$OMP DECLARE VARIANT (sub:vsub) MATCH(CONSTRUCT={PARALLEL})
+!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpDeclareVariantDirective
+!PARSE-TREE: | Verbatim
+!PARSE-TREE: | Name = 'sub'
+!PARSE-TREE: | Name = 'vsub'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Match -> OmpMatchClause -> OmpContextSelectorSpecification -> OmpTraitSetSelector
+!PARSE-TREE: | | OmpTraitSetSelectorName -> Value = Construct
+!PARSE-TREE: | | OmpTraitSelector
+!PARSE-TREE: | | | OmpTraitSelectorName -> llvm::omp::Directive = parallel
+  !$omp declare variant (sub:vsub) match (construct={parallel})
+contains
+  subroutine vsub
+  end subroutine
+
+  subroutine sub ()
+  end subroutine
+end subroutine
+
+subroutine sb1
+  integer :: x
+  x = 1
+  !$omp dispatch device(1)
+    call sub(x)
+contains
+  subroutine vsub (v1)
+    integer, value :: v1
+  end
+  subroutine sub (v1)
+!CHECK: !$OMP DECLARE VARIANT (vsub) MATCH(CONSTRUCT={DISPATCH}
+!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpDeclareVariantDirective
+!PARSE-TREE: | Verbatim
+!PARSE-TREE: | Name = 'vsub'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Match -> OmpMatchClause -> OmpContextSelectorSpecification -> OmpTraitSetSelector
+!PARSE-TREE: | | OmpTraitSetSelectorName -> Value = Construct
+!PARSE-TREE: | | OmpTraitSelector
+!PARSE-TREE: | | | OmpTraitSelectorName -> llvm::omp::Directive = dispatch
+    !$omp declare variant(vsub), match(construct={dispatch})
+    integer, value :: v1
+  end
+end subroutine
+
+subroutine sb2 (x1, x2)
+  use omp_lib, only: omp_interop_kind
+  integer :: x
+  x = 1
+  !$omp dispatch device(1)
+    call sub(x)
+contains
+  subroutine vsub (v1, a1, a2)
+    integer, value :: v1
+    integer(omp_interop_kind) :: a1
+    integer(omp_interop_kind), value :: a2
+  end
+  subroutine sub (v1)
+!CHECK: !$OMP DECLARE VARIANT (vsub) MATCH(CONSTRUCT={DISPATCH}) APPEND_ARGS(INTEROP(T&
+!CHECK: !$OMP&ARGET),INTEROP(TARGET))
+!PARSE-TREE: OpenMPDeclarativeConstruct -> OmpDeclareVariantDirective
+!PARSE-TREE: | Verbatim
+!PARSE-TREE: | Name = 'vsub'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Match -> OmpMatchClause -> OmpContextSelectorSpecification -> OmpTraitSetSelector
+!PARSE-TREE: | | OmpTraitSetSelectorName -> Value = Construct
+!PARSE-TREE: | | OmpTraitSelector
+!PARSE-TREE: | | | OmpTraitSelectorName -> llvm::omp::Directive = dispatch
+!PARSE-TREE: | OmpClause -> AppendArgs -> OmpAppendArgsClause -> OmpAppendOp -> OmpInteropType -> Value = Target
+!PARSE-TREE: | OmpAppendOp -> OmpInteropType -> Value = Target
+    !$omp declare variant(vsub), match(construct={dispatch}), append_args (interop(target), interop(target))
+    integer, value :: v1
+  end
+end subroutine
+
+subroutine sb3 (x1, x2)
+  use iso_c_binding, only: c_ptr
+  type(c_ptr), value :: x1, x2
+
+  !$omp dispatch device(1)
+  call sub(x1, x2)
+contains
+  subroutine sub (v1, v2)
+    type(c_ptr), value :: v1, v2
+!CHECK: !$OMP DECLARE VARIANT (vsub) MATCH(CONSTRUCT={DISPATCH}) ADJUST_ARGS(NOTHING:v&
+!CHECK: !$OMP&1) ADJUST_ARGS(NEED_DEVICE_PTR:v2)
+!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpDeclareVariantDirective
+!PARSE-TREE: | Verbatim
+!PARSE-TREE: | Name = 'vsub'
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Match -> OmpMatchClause -> OmpContextSelectorSpecification -> OmpTraitSetSelector
+!PARSE-TREE: | | OmpTraitSetSelectorName -> Value = Construct
+!PARSE-TREE: | | OmpTraitSelector
+!PARSE-TREE: | | | OmpTraitSelectorName -> llvm::omp::Directive = dispatch
+!PARSE-TREE: | OmpClause -> AdjustArgs -> OmpAdjustArgsClause
+!PARSE-TREE: | | OmpAdjustOp -> Value = Nothing
+!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'v1'
+!PARSE-TREE: | OmpClause -> AdjustArgs -> OmpAdjustArgsClause
+!PARSE-TREE: | | OmpAdjustOp -> Value = Need_Device_Ptr
+!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'v2'
+    !$omp declare variant(vsub) match ( construct = { dispatch } ) adjust_args(nothing : v1 ) adjust_args(need_device_ptr : v2)
+  end
+  subroutine vsub(v1, v2)
+    type(c_ptr), value :: v1, v2
+  end
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/declare-variant.f90 b/flang/test/Semantics/OpenMP/declare-variant.f90
new file mode 100644
index 0000000..84a0cdc
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-variant.f90
@@ -0,0 +1,14 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51
+
+subroutine sub0
+!ERROR: Implicit subroutine declaration 'vsub1' in !$OMP DECLARE VARIANT
+  !$omp declare variant (sub:vsub1) match (construct={parallel})
+!ERROR: Implicit subroutine declaration 'sub1' in !$OMP DECLARE VARIANT
+  !$omp declare variant (sub1:vsub) match (construct={parallel})
+contains
+  subroutine vsub
+  end subroutine
+
+  subroutine sub ()
+  end subroutine
+end subroutine
diff --git a/flang/test/Semantics/cuf20.cuf b/flang/test/Semantics/cuf20.cuf
new file mode 100644
index 0000000..222ff2a
--- /dev/null
+++ b/flang/test/Semantics/cuf20.cuf
@@ -0,0 +1,42 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+
+! Test case 1: Device arrays with ignore_tkr(c)
+subroutine test_device_arrays()
+  interface bar
+    subroutine bar1(a)
+!dir$ ignore_tkr(c) a
+      real :: a(..)
+!@cuf attributes(device) :: a
+    end subroutine
+  end interface
+
+  integer :: n = 10, k = 2
+  real, device :: a(10), b(10), c(10)
+  
+  call bar(a(1:n))     ! Should not warn about contiguity
+  call bar(b(1:n:k))   ! Should not warn about contiguity
+  call bar(c(1:n:2))   ! Should not warn about contiguity
+end subroutine
+
+! Test case 2: Managed arrays with ignore_tkr(c)
+subroutine test_managed_arrays()
+  interface bar
+    subroutine bar1(a)
+!dir$ ignore_tkr(c) a
+      real :: a(..)
+!@cuf attributes(device) :: a
+    end subroutine
+  end interface
+
+  integer :: n = 10, k = 2
+  real, managed :: a(10), b(10), c(10)
+  
+  call bar(a(1:n))     ! Should not warn about contiguity
+  call bar(b(1:n:k))   ! Should not warn about contiguity
+  call bar(c(1:n:2))   ! Should not warn about contiguity
+end subroutine
+
+program main
+  call test_device_arrays()
+  call test_managed_arrays()
+end program 
+\ No newline at end of file
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90
index 12f6303..b84d448 100644
--- a/flang/test/Transforms/DoConcurrent/basic_host.f90
+++ b/flang/test/Transforms/DoConcurrent/basic_host.f90
@@ -1,3 +1,6 @@
+! Fails until we update the pass to use the `fir.do_concurrent` op.
+! XFAIL: *
+
 ! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
diff --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
index f826966..4e13c09 100644
--- a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
+++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
@@ -1,3 +1,6 @@
+! Fails until we update the pass to use the `fir.do_concurrent` op.
+! XFAIL: *
+
 ! Tests that "loop-local values" are properly handled by localizing them to the
 ! body of the loop nest. See `collectLoopLocalValues` and `localizeLoopLocalValue`
 ! for a definition of "loop-local values" and how they are handled.
diff --git a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
index 32bed61..adc4a48 100644
--- a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
+++ b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
@@ -1,3 +1,6 @@
+! Fails until we update the pass to use the `fir.do_concurrent` op.
+! XFAIL: *
+
 ! Tests loop-nest detection algorithm for do-concurrent mapping.
 
 ! REQUIRES: asserts
diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
index d021072..2680067 100644
--- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
+++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
@@ -1,3 +1,6 @@
+! Fails until we update the pass to use the `fir.do_concurrent` op.
+! XFAIL: *
+
 ! Tests mapping of a `do concurrent` loop with multiple iteration ranges.
 
 ! RUN: split-file %s %t
diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
index cd1bd4f..23a3aae 100644
--- a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
+++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90
@@ -1,3 +1,6 @@
+! Fails until we update the pass to use the `fir.do_concurrent` op.
+! XFAIL: *
+
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
 ! RUN:   | FileCheck %s
 
diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
index 184fdfe..d1c0210 100644
--- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
+++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
@@ -1,3 +1,6 @@
+! Fails until we update the pass to use the `fir.do_concurrent` op.
+! XFAIL: *
+
 ! Tests that if `do concurrent` is not perfectly nested in its parent loop, that
 ! we skip converting the not-perfectly nested `do concurrent` loop.
 
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index d2fc128..30d9d00 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -451,7 +451,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.exp
     libc.src.math.exp10
     libc.src.math.exp10f
-    # libc.src.math.exp10m1f
+    libc.src.math.exp10m1f
     libc.src.math.exp2
     libc.src.math.exp2f
     libc.src.math.exp2m1f
diff --git a/libc/test/src/math/exp10m1f_test.cpp b/libc/test/src/math/exp10m1f_test.cpp
index cc96032..aee2733 100644
--- a/libc/test/src/math/exp10m1f_test.cpp
+++ b/libc/test/src/math/exp10m1f_test.cpp
@@ -80,7 +80,7 @@ TEST_F(LlvmLibcExp10m1fTest, InFloatRange) {
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = FPBits(v).get_val();
-    if (isnan(x) || isinf(x))
+    if (FPBits(v).is_inf_or_nan())
       continue;
     LIBC_NAMESPACE::libc_errno = 0;
     float result = LIBC_NAMESPACE::exp10m1f(x);
@@ -89,7 +89,7 @@ TEST_F(LlvmLibcExp10m1fTest, InFloatRange) {
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_inf_or_nan() || LIBC_NAMESPACE::libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
                                    LIBC_NAMESPACE::exp10m1f(x), 0.5);
diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index a7382c5..c571dd6 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -80,16 +80,18 @@ Deprecations and Removals
 - The ``_LIBCPP_VERBOSE_ABORT_NOT_NOEXCEPT`` has been removed, making ``std::__libcpp_verbose_abort``
   unconditionally ``noexcept``.
 
+- TODO: The non-conforming extension ``packaged_task::result_type`` has been removed in LLVM 21.
+
 Potentially breaking changes
 ----------------------------
 
 - The implementation of ``num_put::do_put`` has been replaced to improve the performance, which can lead to different
   output when printing pointers.
 
-Upcoming Deprecations and Removals
-----------------------------------
+Announcements About Future Releases
+-----------------------------------
 
-LLVM 21
+LLVM 22
 ~~~~~~~
 
 - The status of the C++03 implementation will be frozen after the LLVM 21 release. This means that starting in LLVM 22,
@@ -101,13 +103,6 @@ LLVM 21
   If you are using C++03 in your project, you should consider moving to a newer version of the Standard to get the most
   out of libc++.
 
-- Non-conforming extension ``packaged_task::result_type`` will be removed in LLVM 21.
-
-LLVM 22
-~~~~~~~
-
-- TODO
-
 
 ABI Affecting Changes
 ---------------------
diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index 0ff1e25..3809446 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -76,9 +76,9 @@
 "`P2422R1 <https://wg21.link/P2422R1>`__","Remove ``nodiscard`` annotations from the standard library specification","2024-06 (St. Louis)","|Complete|","19","``nodiscard`` attributes were kept as a conforming extension"
 "`P2300R10 <https://wg21.link/P2300R10>`__","``std::execution``","2024-06 (St. Louis)","","",""
 "","","","","",""
-"`P3136R1 <https://wg21.link/P3136R1>`__","Retiring niebloids","2024-11 (Wrocław)","","",""
+"`P3136R1 <https://wg21.link/P3136R1>`__","Retiring niebloids","2024-11 (Wrocław)","|Complete|","14",""
 "`P3138R5 <https://wg21.link/P3138R5>`__","``views::cache_latest``","2024-11 (Wrocław)","","",""
-"`P3379R0 <https://wg21.link/P3379R0>`__","Constrain ``std::expected`` equality operators","2024-11 (Wrocław)","","",""
+"`P3379R0 <https://wg21.link/P3379R0>`__","Constrain ``std::expected`` equality operators","2024-11 (Wrocław)","|Complete|","21",""
 "`P2862R1 <https://wg21.link/P2862R1>`__","``text_encoding::name()`` should never return null values","2024-11 (Wrocław)","","",""
 "`P2897R7 <https://wg21.link/P2897R7>`__","``aligned_accessor``: An ``mdspan`` accessor expressing pointer over-alignment","2024-11 (Wrocław)","|Complete|","21",""
 "`P3355R1 <https://wg21.link/P3355R1>`__","Fix ``submdspan`` for C++26","2024-11 (Wrocław)","","",""
diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h
index 03bbd16..6b3d335 100644
--- a/libcxx/include/__expected/expected.h
+++ b/libcxx/include/__expected/expected.h
@@ -25,6 +25,7 @@
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
+#include <__type_traits/is_core_convertible.h>
 #include <__type_traits/is_function.h>
 #include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
@@ -1139,8 +1140,15 @@ public:
 
   // [expected.object.eq], equality operators
   template <class _T2, class _E2>
+  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y)
     requires(!is_void_v<_T2>)
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y) {
+#  if _LIBCPP_STD_VER >= 26
+            && requires {
+                 { *__x == *__y } -> __core_convertible_to<bool>;
+                 { __x.error() == __y.error() } -> __core_convertible_to<bool>;
+               }
+#  endif
+  {
     if (__x.__has_val() != __y.__has_val()) {
       return false;
     } else {
@@ -1153,12 +1161,24 @@ public:
   }
 
   template <class _T2>
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const _T2& __v) {
+  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const _T2& __v)
+#  if _LIBCPP_STD_VER >= 26
+    requires(!__is_std_expected<_T2>::value) && requires {
+      { *__x == __v } -> __core_convertible_to<bool>;
+    }
+#  endif
+  {
     return __x.__has_val() && static_cast<bool>(__x.__val() == __v);
   }
 
   template <class _E2>
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const unexpected<_E2>& __e) {
+  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const unexpected<_E2>& __e)
+#  if _LIBCPP_STD_VER >= 26
+    requires requires {
+      { __x.error() == __e.error() } -> __core_convertible_to<bool>;
+    }
+#  endif
+  {
     return !__x.__has_val() && static_cast<bool>(__x.__unex() == __e.error());
   }
 };
@@ -1851,7 +1871,13 @@ public:
   // [expected.void.eq], equality operators
   template <class _T2, class _E2>
     requires is_void_v<_T2>
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y) {
+  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y)
+#  if _LIBCPP_STD_VER >= 26
+    requires requires {
+      { __x.error() == __y.error() } -> __core_convertible_to<bool>;
+    }
+#  endif
+  {
     if (__x.__has_val() != __y.__has_val()) {
       return false;
     } else {
@@ -1860,7 +1886,13 @@ public:
   }
 
   template <class _E2>
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const unexpected<_E2>& __y) {
+  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const unexpected<_E2>& __y)
+#  if _LIBCPP_STD_VER >= 26
+    requires requires {
+      { __x.error() == __y.error() } -> __core_convertible_to<bool>;
+    }
+#  endif
+  {
     return !__x.__has_val() && static_cast<bool>(__x.__unex() == __y.error());
   }
 };
diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h
index f5abfd0..f5e9756 100644
--- a/libcxx/include/__flat_map/flat_map.h
+++ b/libcxx/include/__flat_map/flat_map.h
@@ -114,7 +114,7 @@ public:
 
   class value_compare {
   private:
-    key_compare __comp_;
+    _LIBCPP_NO_UNIQUE_ADDRESS key_compare __comp_;
     _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : __comp_(__c) {}
     friend flat_map;
 
diff --git a/libcxx/include/__flat_map/flat_multimap.h b/libcxx/include/__flat_map/flat_multimap.h
index ea77fb5..15fcd79 100644
--- a/libcxx/include/__flat_map/flat_multimap.h
+++ b/libcxx/include/__flat_map/flat_multimap.h
@@ -115,7 +115,7 @@ public:
 
   class value_compare {
   private:
-    key_compare __comp_;
+    _LIBCPP_NO_UNIQUE_ADDRESS key_compare __comp_;
     _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : __comp_(__c) {}
     friend flat_multimap;
 
diff --git a/libcxx/include/__iterator/advance.h b/libcxx/include/__iterator/advance.h
index f1a8d28..c7d3c1f 100644
--- a/libcxx/include/__iterator/advance.h
+++ b/libcxx/include/__iterator/advance.h
@@ -66,7 +66,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void advance(_InputIter& __i
   typedef typename iterator_traits<_InputIter>::difference_type _Difference;
   _Difference __n = static_cast<_Difference>(std::__convert_to_integral(__orig_n));
   _LIBCPP_ASSERT_PEDANTIC(__has_bidirectional_iterator_category<_InputIter>::value || __n >= 0,
-                          "Attempt to advance(it, n) with negative n on a non-bidirectional iterator");
+                          "std::advance: Can only pass a negative `n` with a bidirectional_iterator.");
   std::__advance(__i, __n, typename iterator_traits<_InputIter>::iterator_category());
 }
 
diff --git a/libcxx/include/__string/char_traits.h b/libcxx/include/__string/char_traits.h
index 60ec186..86c9247 100644
--- a/libcxx/include/__string/char_traits.h
+++ b/libcxx/include/__string/char_traits.h
@@ -132,8 +132,6 @@ struct char_traits<char> {
 
   static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type*
   find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT {
-    if (__n == 0)
-      return nullptr;
     return std::__constexpr_memchr(__s, __a, __n);
   }
 
@@ -250,8 +248,6 @@ struct char_traits<wchar_t> : __char_traits_base<wchar_t, wint_t, static_cast<wi
 
   static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type*
   find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT {
-    if (__n == 0)
-      return nullptr;
     return std::__constexpr_wmemchr(__s, __a, __n);
   }
 };
@@ -352,7 +348,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t char_traits<char32_t>::length(const
 template <class _CharT, class _SizeT, class _Traits, _SizeT __npos>
 inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI
 __str_find(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) _NOEXCEPT {
-  if (__pos >= __sz)
+  if (__pos > __sz)
     return __npos;
   const _CharT* __r = _Traits::find(__p + __pos, __sz - __pos, __c);
   if (__r == nullptr)
diff --git a/libcxx/include/__type_traits/is_core_convertible.h b/libcxx/include/__type_traits/is_core_convertible.h
index 93e23d2..ca3a346 100644
--- a/libcxx/include/__type_traits/is_core_convertible.h
+++ b/libcxx/include/__type_traits/is_core_convertible.h
@@ -30,6 +30,13 @@ template <class _Tp, class _Up>
 struct __is_core_convertible<_Tp, _Up, decltype(static_cast<void (*)(_Up)>(0)(static_cast<_Tp (*)()>(0)()))>
     : true_type {};
 
+#if _LIBCPP_STD_VER >= 20
+
+template <class _Tp, class _Up>
+concept __core_convertible_to = __is_core_convertible<_Tp, _Up>::value;
+
+#endif // _LIBCPP_STD_VER >= 20
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_CORE_CONVERTIBLE_H
diff --git a/libcxx/test/libcxx/iterators/assert.advance.pass.cpp b/libcxx/test/libcxx/iterators/assert.advance.pass.cpp
index e9d2f27..a7e8878 100644
--- a/libcxx/test/libcxx/iterators/assert.advance.pass.cpp
+++ b/libcxx/test/libcxx/iterators/assert.advance.pass.cpp
@@ -31,7 +31,7 @@ int main(int, char**) {
     forward_iterator<int *> it(a+1);
     std::advance(it, 1);  // should work fine
     std::advance(it, 0);  // should work fine
-    TEST_LIBCPP_ASSERT_FAILURE(std::advance(it, -1), "Attempt to advance(it, n) with negative n on a non-bidirectional iterator");
+    TEST_LIBCPP_ASSERT_FAILURE(std::advance(it, -1), "std::advance: Can only pass a negative `n` with a bidirectional_iterator.");
 
     return 0;
 }
diff --git a/libcxx/test/libcxx/iterators/assert.next.pass.cpp b/libcxx/test/libcxx/iterators/assert.next.pass.cpp
index 1e86723..2e0296b 100644
--- a/libcxx/test/libcxx/iterators/assert.next.pass.cpp
+++ b/libcxx/test/libcxx/iterators/assert.next.pass.cpp
@@ -25,7 +25,7 @@ int main(int, char**) {
     forward_iterator<int *> it(a+1);
     (void)std::next(it, 1); // should work fine
     (void)std::next(it, 0); // should work fine
-    TEST_LIBCPP_ASSERT_FAILURE(std::next(it, -1), "Attempt to advance(it, n) with negative n on a non-bidirectional iterator");
+    TEST_LIBCPP_ASSERT_FAILURE(std::next(it, -1), "std::advance: Can only pass a negative `n` with a bidirectional_iterator.");
 
     return 0;
 }
diff --git a/libcxx/test/libcxx/iterators/assert.prev.pass.cpp b/libcxx/test/libcxx/iterators/assert.prev.pass.cpp
index 29b8d6e..deac1ed 100644
--- a/libcxx/test/libcxx/iterators/assert.prev.pass.cpp
+++ b/libcxx/test/libcxx/iterators/assert.prev.pass.cpp
@@ -31,7 +31,7 @@ int main(int, char**) {
     forward_iterator<int *> it(a+1);
     (void)std::prev(it, -1); // should work fine
     (void)std::prev(it, 0);  // should work fine
-    TEST_LIBCPP_ASSERT_FAILURE(std::prev(it, 1), "Attempt to advance(it, n) with negative n on a non-bidirectional iterator");
+    TEST_LIBCPP_ASSERT_FAILURE(std::prev(it, 1), "std::advance: Can only pass a negative `n` with a bidirectional_iterator.");
 
     return 0;
 }
diff --git a/libcxx/test/std/algorithms/ranges_robust_against_omitting_invoke.pass.cpp b/libcxx/test/std/algorithms/ranges_robust_against_omitting_invoke.pass.cpp
index ca87f6e..a975031 100644
--- a/libcxx/test/std/algorithms/ranges_robust_against_omitting_invoke.pass.cpp
+++ b/libcxx/test/std/algorithms/ranges_robust_against_omitting_invoke.pass.cpp
@@ -35,7 +35,7 @@ struct Bar {
   Bar create() const { return Bar(); }
 };
 
-// Invokes both the (iterator, sentinel, ...) and the (range, ...) overloads of the given niebloid.
+// Invokes both the (iterator, sentinel, ...) and the (range, ...) overloads of the given algorithm function object.
 
 // (in, ...)
 template <class Func, std::ranges::range Input, class... Args>
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.members/close.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.members/close.pass.cpp
index e0338e6..43233de 100644
--- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.members/close.pass.cpp
+++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.members/close.pass.cpp
@@ -10,11 +10,6 @@
 
 // basic_filebuf<charT,traits>* close();
 
-// This test closes an fd that belongs to a std::filebuf, and Bionic's fdsan
-// detects this and aborts the process, starting in Android R (API 30).
-// See D137129.
-// XFAIL: LIBCXX-ANDROID-FIXME && !android-device-api={{2[1-9]}}
-
 #include <fstream>
 #include <cassert>
 #if defined(__unix__)
@@ -37,7 +32,10 @@ int main(int, char**)
         assert(f.close() == nullptr);
         assert(!f.is_open());
     }
-#if defined(__unix__)
+    // Starting with Android API 30+, Bionic's fdsan aborts a process that calls
+    // close() on a file descriptor tagged as belonging to something else (such
+    // as a FILE*).
+#if defined(__unix__) && !defined(__BIONIC__)
     {
         std::filebuf f;
         assert(!f.is_open());
diff --git a/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp b/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp
index 402bc1c..dc9134f 100644
--- a/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp
+++ b/libcxx/test/std/library/description/conventions/customization.point.object/niebloid.compile.pass.cpp
@@ -9,6 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // REQUIRES: stdlib=libc++
 
+// [alg.func.obj]
 // [algorithms.requirements]/2
 // [range.iter.ops.general]/2
 
@@ -24,12 +25,14 @@
 
 #include "test_macros.h"
 
-// Niebloids, unlike CPOs, are *not* required to be semiregular or even to have
-// a declared type at all; they are specified as "magic" overload sets whose
-// names are not found by argument-dependent lookup and which inhibit
-// argument-dependent lookup if they are found via a `using`-declaration.
+// Before P3136R1, niebloids were pedantically not CPOs, and they were *not* required to be semiregular or
+// even to have a declared type at all; they were specified as "magic" overload sets
+// whose names are not found by argument-dependent lookup and
+// which inhibit argument-dependent lookup if they are found via a `using`-declaration.
 //
-// libc++ implements them using the same function-object technique we use for CPOs;
+// As of P3136R1, niebloids (formally known as algorithm function objects) are required to be CPOs.
+//
+// libc++ implements niebloids in the same way as CPOs since LLVM 14;
 // therefore this file should stay in sync with ./cpo.compile.pass.cpp.
 
 template <class CPO, class... Args>
diff --git a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp
index bc8b9de..25eb97a 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp
@@ -17,18 +17,19 @@
 #include <utility>
 
 #include "test_macros.h"
+#include "../../types.h"
 
-struct Data {
-  int i;
-  constexpr Data(int ii) : i(ii) {}
-
-  friend constexpr bool operator==(const Data& data, int ii) { return data.i == ii; }
-};
+#if TEST_STD_VER >= 26
+// https://wg21.link/P3379R0
+static_assert(CanCompare<std::expected<int, int>, int>);
+static_assert(CanCompare<std::expected<int, int>, EqualityComparable>);
+static_assert(!CanCompare<std::expected<int, int>, NonComparable>);
+#endif
 
 constexpr bool test() {
   // x.has_value()
   {
-    const std::expected<Data, int> e1(std::in_place, 5);
+    const std::expected<EqualityComparable, int> e1(std::in_place, 5);
     int i2 = 10;
     int i3 = 5;
     assert(e1 != i2);
@@ -37,7 +38,7 @@ constexpr bool test() {
 
   // !x.has_value()
   {
-    const std::expected<Data, int> e1(std::unexpect, 5);
+    const std::expected<EqualityComparable, int> e1(std::unexpect, 5);
     int i2 = 10;
     int i3 = 5;
     assert(e1 != i2);
diff --git a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp
index 9325c6c..f0f549b 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp
@@ -18,20 +18,26 @@
 #include <utility>
 
 #include "test_macros.h"
+#include "../../types.h"
 
 // Test constraint
-template <class T1, class T2>
-concept CanCompare = requires(T1 t1, T2 t2) { t1 == t2; };
-
-struct Foo{};
-static_assert(!CanCompare<Foo, Foo>);
+static_assert(!CanCompare<NonComparable, NonComparable>);
 
 static_assert(CanCompare<std::expected<int, int>, std::expected<int, int>>);
 static_assert(CanCompare<std::expected<int, int>, std::expected<short, short>>);
 
-// Note this is true because other overloads are unconstrained
-static_assert(CanCompare<std::expected<int, int>, std::expected<void, int>>);
-
+#if TEST_STD_VER >= 26
+// https://wg21.link/P3379R0
+static_assert(!CanCompare<std::expected<int, int>, std::expected<void, int>>);
+static_assert(CanCompare<std::expected<int, int>, std::expected<int, int>>);
+static_assert(!CanCompare<std::expected<NonComparable, int>, std::expected<NonComparable, int>>);
+static_assert(!CanCompare<std::expected<int, NonComparable>, std::expected<int, NonComparable>>);
+static_assert(!CanCompare<std::expected<NonComparable, int>, std::expected<int, NonComparable>>);
+static_assert(!CanCompare<std::expected<int, NonComparable>, std::expected<NonComparable, int>>);
+#else
+// Note this is true because other overloads in expected<non-void> are unconstrained
+static_assert(CanCompare<std::expected<void, int>, std::expected<int, int>>);
+#endif
 constexpr bool test() {
   // x.has_value() && y.has_value()
   {
diff --git a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp
index a8c469d..6c7d2f3 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp
@@ -17,18 +17,19 @@
 #include <utility>
 
 #include "test_macros.h"
+#include "../../types.h"
 
-struct Data {
-  int i;
-  constexpr Data(int ii) : i(ii) {}
-
-  friend constexpr bool operator==(const Data& data, int ii) { return data.i == ii; }
-};
+#if TEST_STD_VER >= 26
+// https://wg21.link/P3379R0
+static_assert(CanCompare<std::expected<EqualityComparable, EqualityComparable>, std::unexpected<int>>);
+static_assert(CanCompare<std::expected<EqualityComparable, int>, std::unexpected<EqualityComparable>>);
+static_assert(!CanCompare<std::expected<EqualityComparable, NonComparable>, std::unexpected<int>>);
+#endif
 
 constexpr bool test() {
   // x.has_value()
   {
-    const std::expected<Data, Data> e1(std::in_place, 5);
+    const std::expected<EqualityComparable, EqualityComparable> e1(std::in_place, 5);
     std::unexpected<int> un2(10);
     std::unexpected<int> un3(5);
     assert(e1 != un2);
@@ -37,7 +38,7 @@ constexpr bool test() {
 
   // !x.has_value()
   {
-    const std::expected<Data, Data> e1(std::unexpect, 5);
+    const std::expected<EqualityComparable, EqualityComparable> e1(std::unexpect, 5);
     std::unexpected<int> un2(10);
     std::unexpected<int> un3(5);
     assert(e1 != un2);
diff --git a/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp
index 8b24875..b6c3d8d 100644
--- a/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp
@@ -18,10 +18,7 @@
 #include <utility>
 
 #include "test_macros.h"
-
-// Test constraint
-template <class T1, class T2>
-concept CanCompare = requires(T1 t1, T2 t2) { t1 == t2; };
+#include "../../types.h"
 
 struct Foo{};
 static_assert(!CanCompare<Foo, Foo>);
@@ -29,8 +26,18 @@ static_assert(!CanCompare<Foo, Foo>);
 static_assert(CanCompare<std::expected<void, int>, std::expected<void, int>>);
 static_assert(CanCompare<std::expected<void, int>, std::expected<void, short>>);
 
+#if TEST_STD_VER >= 26
+// https://wg21.link/P3379R0
+static_assert(!CanCompare<std::expected<void, int>, std::expected<int, int>>);
+static_assert(CanCompare<std::expected<void, int>, std::expected<void, int>>);
+static_assert(CanCompare<std::expected<void, int>, std::expected<void, int>>);
+static_assert(!CanCompare<std::expected<void, NonComparable>, std::expected<void, NonComparable>>);
+static_assert(!CanCompare<std::expected<void, int>, std::expected<void, NonComparable>>);
+static_assert(!CanCompare<std::expected<void, NonComparable>, std::expected<void, int>>);
+#else
 // Note this is true because other overloads in expected<non-void> are unconstrained
 static_assert(CanCompare<std::expected<void, int>, std::expected<int, int>>);
+#endif
 
 constexpr bool test() {
   // x.has_value() && y.has_value()
diff --git a/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp
index 4500971..f37f38b 100644
--- a/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp
@@ -17,18 +17,19 @@
 #include <utility>
 
 #include "test_macros.h"
+#include "../../types.h"
 
-struct Data {
-  int i;
-  constexpr Data(int ii) : i(ii) {}
-
-  friend constexpr bool operator==(const Data& data, int ii) { return data.i == ii; }
-};
+#if TEST_STD_VER >= 26
+// https://wg21.link/P3379R0
+static_assert(CanCompare<std::expected<void, EqualityComparable>, std::unexpected<int>>);
+static_assert(CanCompare<std::expected<void, int>, std::unexpected<EqualityComparable>>);
+static_assert(!CanCompare<std::expected<void, NonComparable>, std::unexpected<int>>);
+#endif
 
 constexpr bool test() {
   // x.has_value()
   {
-    const std::expected<void, Data> e1;
+    const std::expected<void, EqualityComparable> e1;
     std::unexpected<int> un2(10);
     std::unexpected<int> un3(5);
     assert(e1 != un2);
@@ -37,7 +38,7 @@ constexpr bool test() {
 
   // !x.has_value()
   {
-    const std::expected<void, Data> e1(std::unexpect, 5);
+    const std::expected<void, EqualityComparable> e1(std::unexpect, 5);
     std::unexpected<int> un2(10);
     std::unexpected<int> un3(5);
     assert(e1 != un2);
diff --git a/libcxx/test/std/utilities/expected/types.h b/libcxx/test/std/utilities/expected/types.h
index df73ebd..11473ca 100644
--- a/libcxx/test/std/utilities/expected/types.h
+++ b/libcxx/test/std/utilities/expected/types.h
@@ -336,4 +336,17 @@ struct CheckForInvalidWrites : public CheckForInvalidWritesBase<WithPaddedExpect
   }
 };
 
+struct NonComparable {};
+
+struct EqualityComparable {
+  int i;
+  constexpr EqualityComparable(int ii) : i(ii) {}
+
+  friend constexpr bool operator==(const EqualityComparable& data, int ii) { return data.i == ii; }
+};
+
+// Test constraint
+template <class T1, class T2>
+concept CanCompare = requires(T1 t1, T2 t2) { t1 == t2; };
+
 #endif // TEST_STD_UTILITIES_EXPECTED_TYPES_H
diff --git a/lld/CMakeLists.txt b/lld/CMakeLists.txt
index 55d7599..9b202cc 100644
--- a/lld/CMakeLists.txt
+++ b/lld/CMakeLists.txt
@@ -63,6 +63,9 @@ if(LLD_BUILT_STANDALONE)
     if(EXISTS ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py)
       # Note: path not really used, except for checking if lit was found
       set(LLVM_EXTERNAL_LIT ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py)
+      if(EXISTS ${LLVM_MAIN_SRC_DIR}/utils/llvm-lit)
+        add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/llvm-lit utils/llvm-lit)
+      endif()
       if(NOT LLVM_UTILS_PROVIDED)
         add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/FileCheck utils/FileCheck)
         add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/not utils/not)
diff --git a/lld/test/wasm/lto/signature-mismatch.ll b/lld/test/wasm/lto/signature-mismatch.ll
index cf1a998..6580c8c 100644
--- a/lld/test/wasm/lto/signature-mismatch.ll
+++ b/lld/test/wasm/lto/signature-mismatch.ll
@@ -17,4 +17,4 @@ define void @_start() {
 
 ; CHECK: error: function signature mismatch: f
 ; CHECK: >>> defined as (i32) -> void in {{.*}}signature-mismatch.ll.tmp1.o
-; CHECK: >>> defined as () -> void in lto.tmp
+; CHECK: >>> defined as () -> void in {{.*}}signature-mismatch.ll.tmp.wasm.lto.o
diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp
index ab63281..a877f067 100644
--- a/lld/wasm/LTO.cpp
+++ b/lld/wasm/LTO.cpp
@@ -183,10 +183,11 @@ static void thinLTOCreateEmptyIndexFiles() {
 
 // Merge all the bitcode files we have seen, codegen the result
 // and return the resulting objects.
-std::vector<StringRef> BitcodeCompiler::compile() {
+SmallVector<InputFile *, 0> BitcodeCompiler::compile() {
   unsigned maxTasks = ltoObj->getMaxTasks();
   buf.resize(maxTasks);
   files.resize(maxTasks);
+  filenames.resize(maxTasks);
 
   // The --thinlto-cache-dir option specifies the path to a directory in which
   // to cache native object files for ThinLTO incremental builds. If a path was
@@ -233,15 +234,21 @@ std::vector<StringRef> BitcodeCompiler::compile() {
   if (!ctx.arg.thinLTOCacheDir.empty())
     pruneCache(ctx.arg.thinLTOCacheDir, ctx.arg.thinLTOCachePolicy, files);
 
-  std::vector<StringRef> ret;
+  SmallVector<InputFile *, 0> ret;
   for (unsigned i = 0; i != maxTasks; ++i) {
     StringRef objBuf = buf[i].second;
     StringRef bitcodeFilePath = buf[i].first;
+    if (files[i]) {
+      // When files[i] is not null, we get the native relocatable file from the
+      // cache. filenames[i] contains the original BitcodeFile's identifier.
+      objBuf = files[i]->getBuffer();
+      bitcodeFilePath = filenames[i];
+    } else {
+      objBuf = buf[i].second;
+      bitcodeFilePath = buf[i].first;
+    }
     if (objBuf.empty())
       continue;
-    ret.emplace_back(objBuf.data(), objBuf.size());
-    if (!ctx.arg.saveTemps)
-      continue;
 
     // If the input bitcode file is path/to/x.o and -o specifies a.out, the
     // corresponding native relocatable file path will look like:
@@ -266,7 +273,9 @@ std::vector<StringRef> BitcodeCompiler::compile() {
       sys::path::remove_dots(path, true);
       ltoObjName = saver().save(path.str());
     }
-    saveBuffer(objBuf, ltoObjName);
+    if (ctx.arg.saveTemps)
+      saveBuffer(objBuf, ltoObjName);
+    ret.emplace_back(createObjectFile(MemoryBufferRef(objBuf, ltoObjName)));
   }
 
   if (!ctx.arg.ltoObjPath.empty()) {
@@ -275,10 +284,6 @@ std::vector<StringRef> BitcodeCompiler::compile() {
       saveBuffer(buf[i].second, ctx.arg.ltoObjPath + Twine(i));
   }
 
-  for (std::unique_ptr<MemoryBuffer> &file : files)
-    if (file)
-      ret.push_back(file->getBuffer());
-
   return ret;
 }
 
diff --git a/lld/wasm/LTO.h b/lld/wasm/LTO.h
index 43c7672..21b1d59 100644
--- a/lld/wasm/LTO.h
+++ b/lld/wasm/LTO.h
@@ -45,13 +45,14 @@ public:
   ~BitcodeCompiler();
 
   void add(BitcodeFile &f);
-  std::vector<StringRef> compile();
+  SmallVector<InputFile *, 0> compile();
 
 private:
   std::unique_ptr<llvm::lto::LTO> ltoObj;
   // An array of (module name, native relocatable file content) pairs.
   SmallVector<std::pair<std::string, SmallString<0>>, 0> buf;
   std::vector<std::unique_ptr<MemoryBuffer>> files;
+  SmallVector<std::string, 0> filenames;
   std::unique_ptr<llvm::raw_fd_ostream> indexFile;
   llvm::DenseSet<StringRef> thinIndices;
 };
diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp
index bbe48b0..91677b3 100644
--- a/lld/wasm/SymbolTable.cpp
+++ b/lld/wasm/SymbolTable.cpp
@@ -87,8 +87,8 @@ void SymbolTable::compileBitcodeFiles() {
   for (BitcodeFile *f : ctx.bitcodeFiles)
     lto->add(*f);
 
-  for (StringRef filename : lto->compile()) {
-    auto *obj = make<ObjFile>(MemoryBufferRef(filename, "lto.tmp"), "");
+  for (auto &file : lto->compile()) {
+    auto *obj = cast<ObjFile>(file);
     obj->parse(true);
     ctx.objectFiles.push_back(obj);
   }
diff --git a/lldb/docs/resources/debugging.rst b/lldb/docs/resources/debugging.rst
index 990a95f54..ba23759 100644
--- a/lldb/docs/resources/debugging.rst
+++ b/lldb/docs/resources/debugging.rst
@@ -236,7 +236,7 @@ in. For example, to find a process that acted as a ``gdbserver`` instance::
 Remote Debugging
 ----------------
 
-If you want to debug part of LLDB running on a remote machine, the principals
+If you want to debug part of LLDB running on a remote machine, the principles
 are the same but we will have to start debug servers, then attach debuggers to
 those servers.
 
diff --git a/lldb/include/lldb/Symbol/Block.h b/lldb/include/lldb/Symbol/Block.h
index 501c912..6018956 100644
--- a/lldb/include/lldb/Symbol/Block.h
+++ b/lldb/include/lldb/Symbol/Block.h
@@ -354,7 +354,12 @@ protected:
   // Member variables.
   SymbolContextScope &m_parent_scope;
   collection m_children;
+
+  /// Address ranges of this block. They are relative to the function entry
+  /// point so one must add/subtract GetFunction().GetAddress().GetFileAddress()
+  /// when converting from/to to the AddressRange representation.
   RangeList m_ranges;
+
   lldb::InlineFunctionInfoSP m_inlineInfoSP; ///< Inlined function information.
   lldb::VariableListSP m_variable_list_sp; ///< The variable list for all local,
                                            ///static and parameter variables
diff --git a/lldb/include/lldb/Symbol/DWARFCallFrameInfo.h b/lldb/include/lldb/Symbol/DWARFCallFrameInfo.h
index 679f652..c214ed1 100644
--- a/lldb/include/lldb/Symbol/DWARFCallFrameInfo.h
+++ b/lldb/include/lldb/Symbol/DWARFCallFrameInfo.h
@@ -47,12 +47,15 @@ public:
   /// Return an UnwindPlan based on the call frame information encoded in the
   /// FDE of this DWARFCallFrameInfo section. The returned plan will be valid
   /// (at least) for the given address.
-  bool GetUnwindPlan(const Address &addr, UnwindPlan &unwind_plan);
+  std::unique_ptr<UnwindPlan> GetUnwindPlan(const Address &addr);
 
   /// Return an UnwindPlan based on the call frame information encoded in the
   /// FDE of this DWARFCallFrameInfo section. The returned plan will be valid
-  /// (at least) for some address in the given range.
-  bool GetUnwindPlan(const AddressRange &range, UnwindPlan &unwind_plan);
+  /// (at least) for some address in the given ranges. If no unwind information
+  /// is found, nullptr is returned. \a addr represents the entry point of the
+  /// function. It corresponds to the offset zero in the returned UnwindPlan.
+  std::unique_ptr<UnwindPlan> GetUnwindPlan(llvm::ArrayRef<AddressRange> ranges,
+                                            const Address &addr);
 
   typedef RangeVector<lldb::addr_t, uint32_t> FunctionAddressAndSizeVector;
 
diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h
index 73f27dc..0d4e11b 100644
--- a/lldb/include/lldb/Target/Target.h
+++ b/lldb/include/lldb/Target/Target.h
@@ -1158,6 +1158,11 @@ public:
                                      Status &error,
                                      bool force_live_memory = false);
 
+  int64_t ReadSignedIntegerFromMemory(const Address &addr,
+                                      size_t integer_byte_size,
+                                      int64_t fail_value, Status &error,
+                                      bool force_live_memory = false);
+
   uint64_t ReadUnsignedIntegerFromMemory(const Address &addr,
                                          size_t integer_byte_size,
                                          uint64_t fail_value, Status &error,
diff --git a/lldb/include/lldb/Utility/ProcessInfo.h b/lldb/include/lldb/Utility/ProcessInfo.h
index 78ade4b..24041faa 100644
--- a/lldb/include/lldb/Utility/ProcessInfo.h
+++ b/lldb/include/lldb/Utility/ProcessInfo.h
@@ -247,6 +247,11 @@ public:
 
   std::optional<bool> IsZombie() const { return m_zombie; }
 
+  // proc/../status specifies CoreDumping as the field
+  // so we match the case here.
+  void SetIsCoreDumping(bool is_coredumping) { m_coredumping = is_coredumping; }
+  std::optional<bool> IsCoreDumping() const { return m_coredumping; }
+
   void Dump(Stream &s, UserIDResolver &resolver) const;
 
   static void DumpTableHeader(Stream &s, bool show_args, bool verbose);
@@ -266,6 +271,7 @@ protected:
   struct timespec m_cumulative_system_time;
   std::optional<int8_t> m_priority_value = std::nullopt;
   std::optional<bool> m_zombie = std::nullopt;
+  std::optional<bool> m_coredumping = std::nullopt;
 };
 
 typedef std::vector<ProcessInstanceInfo> ProcessInstanceInfoList;
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index 958c726..c5a7eb7 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -349,6 +349,8 @@ class DAPTestCaseBase(TestBase):
         expectFailure=False,
         gdbRemotePort=None,
         gdbRemoteHostname=None,
+        sourceBreakpoints=None,
+        functionBreakpoints=None,
     ):
         """Build the default Makefile target, create the DAP debug adapter,
         and attach to the process.
@@ -366,6 +368,26 @@ class DAPTestCaseBase(TestBase):
         # Initialize and launch the program
         self.dap_server.request_initialize(sourceInitFile)
         self.dap_server.wait_for_event("initialized")
+
+        # Set source breakpoints as part of the launch sequence.
+        if sourceBreakpoints:
+            for source_path, lines in sourceBreakpoints:
+                response = self.dap_server.request_setBreakpoints(source_path, lines)
+                self.assertTrue(
+                    response["success"],
+                    "setBreakpoints failed (%s)" % (response),
+                )
+
+        # Set function breakpoints as part of the launch sequence.
+        if functionBreakpoints:
+            response = self.dap_server.request_setFunctionBreakpoints(
+                functionBreakpoints
+            )
+            self.assertTrue(
+                response["success"],
+                "setFunctionBreakpoint failed (%s)" % (response),
+            )
+
         self.dap_server.request_configurationDone()
         response = self.dap_server.request_attach(
             program=program,
@@ -423,6 +445,8 @@ class DAPTestCaseBase(TestBase):
         commandEscapePrefix=None,
         customFrameFormat=None,
         customThreadFormat=None,
+        sourceBreakpoints=None,
+        functionBreakpoints=None,
     ):
         """Sending launch request to dap"""
 
@@ -439,6 +463,26 @@ class DAPTestCaseBase(TestBase):
         # Initialize and launch the program
         self.dap_server.request_initialize(sourceInitFile)
         self.dap_server.wait_for_event("initialized")
+
+        # Set source breakpoints as part of the launch sequence.
+        if sourceBreakpoints:
+            for source_path, lines in sourceBreakpoints:
+                response = self.dap_server.request_setBreakpoints(source_path, lines)
+                self.assertTrue(
+                    response["success"],
+                    "setBreakpoints failed (%s)" % (response),
+                )
+
+        # Set function breakpoints as part of the launch sequence.
+        if functionBreakpoints:
+            response = self.dap_server.request_setFunctionBreakpoints(
+                functionBreakpoints
+            )
+            self.assertTrue(
+                response["success"],
+                "setFunctionBreakpoint failed (%s)" % (response),
+            )
+
         self.dap_server.request_configurationDone()
 
         response = self.dap_server.request_launch(
@@ -511,6 +555,8 @@ class DAPTestCaseBase(TestBase):
         customThreadFormat=None,
         launchCommands=None,
         expectFailure=False,
+        sourceBreakpoints=None,
+        functionBreakpoints=None,
     ):
         """Build the default Makefile target, create the DAP debug adapter,
         and launch the process.
@@ -547,6 +593,8 @@ class DAPTestCaseBase(TestBase):
             customThreadFormat=customThreadFormat,
             launchCommands=launchCommands,
             expectFailure=expectFailure,
+            sourceBreakpoints=sourceBreakpoints,
+            functionBreakpoints=functionBreakpoints,
         )
 
     def getBuiltinDebugServerTool(self):
diff --git a/lldb/source/Host/linux/Host.cpp b/lldb/source/Host/linux/Host.cpp
index 8b475a7..b5f0504 100644
--- a/lldb/source/Host/linux/Host.cpp
+++ b/lldb/source/Host/linux/Host.cpp
@@ -213,6 +213,11 @@ static bool GetStatusInfo(::pid_t Pid, ProcessInstanceInfo &ProcessInfo,
     } else if (Line.consume_front("Tgid:")) {
       Line = Line.ltrim();
       Line.consumeInteger(10, Tgid);
+    } else if (Line.consume_front("CoreDumping:")) {
+      uint32_t coredumping;
+      Line = Line.ltrim();
+      if (!Line.consumeInteger(2, coredumping))
+        ProcessInfo.SetIsCoreDumping(coredumping);
     }
   }
   return true;
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp
index c79d90d..eee6166 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp
@@ -325,7 +325,7 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module,
       auto file = HS.lookupModuleMapFile(*dir, is_framework);
       if (!file)
         return error();
-      if (!HS.loadModuleMapFile(*file, is_system))
+      if (!HS.parseAndLoadModuleMapFile(*file, is_system))
         return error();
     }
   }
diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp
index 8faf713..0d068ed 100644
--- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp
@@ -350,7 +350,7 @@ bool ItaniumABILanguageRuntime::GetDynamicTypeAndAddress(
   if (offset_to_top_location >= vtable_load_addr)
     return false;
   Status error;
-  const int64_t offset_to_top = m_process->ReadSignedIntegerFromMemory(
+  const int64_t offset_to_top = target.ReadSignedIntegerFromMemory(
       offset_to_top_location, addr_byte_size, INT64_MIN, error);
 
   if (offset_to_top == INT64_MIN)
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
index 38806df..d2ca5b2 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
@@ -75,6 +75,12 @@ Status MinidumpFileBuilder::AddHeaderAndCalculateDirectories() {
     }
   }
 
+  // Add a generous buffer of directories, these are quite small
+  // and forks may add new directories upstream LLDB hadn't accounted for
+  // when we started pre-calculating directory size, so this should account for
+  // that
+  m_expected_directories += 100;
+
   m_saved_data_size +=
       m_expected_directories * sizeof(llvm::minidump::Directory);
   Status error;
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 1a2b3d4..45f0447 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -6879,46 +6879,6 @@ size_t TypeSystemClang::GetIndexOfChildMemberWithName(
           name, omit_empty_base_classes, child_indexes);
     } break;
 
-    case clang::Type::ConstantArray: {
-      //                const clang::ConstantArrayType *array =
-      //                llvm::cast<clang::ConstantArrayType>(parent_qual_type.getTypePtr());
-      //                const uint64_t element_count =
-      //                array->getSize().getLimitedValue();
-      //
-      //                if (idx < element_count)
-      //                {
-      //                    std::pair<uint64_t, unsigned> field_type_info =
-      //                    ast->getTypeInfo(array->getElementType());
-      //
-      //                    char element_name[32];
-      //                    ::snprintf (element_name, sizeof (element_name),
-      //                    "%s[%u]", parent_name ? parent_name : "", idx);
-      //
-      //                    child_name.assign(element_name);
-      //                    assert(field_type_info.first % 8 == 0);
-      //                    child_byte_size = field_type_info.first / 8;
-      //                    child_byte_offset = idx * child_byte_size;
-      //                    return array->getElementType().getAsOpaquePtr();
-      //                }
-    } break;
-
-    //        case clang::Type::MemberPointerType:
-    //            {
-    //                MemberPointerType *mem_ptr_type =
-    //                llvm::cast<MemberPointerType>(qual_type.getTypePtr());
-    //                clang::QualType pointee_type =
-    //                mem_ptr_type->getPointeeType();
-    //
-    //                if (TypeSystemClang::IsAggregateType
-    //                (pointee_type.getAsOpaquePtr()))
-    //                {
-    //                    return GetIndexOfChildWithName (ast,
-    //                                                    mem_ptr_type->getPointeeType().getAsOpaquePtr(),
-    //                                                    name);
-    //                }
-    //            }
-    //            break;
-    //
     case clang::Type::LValueReference:
     case clang::Type::RValueReference: {
       const clang::ReferenceType *reference_type =
@@ -7058,46 +7018,6 @@ TypeSystemClang::GetIndexOfChildWithName(lldb::opaque_compiler_type_t type,
           name, omit_empty_base_classes);
     } break;
 
-    case clang::Type::ConstantArray: {
-      //                const clang::ConstantArrayType *array =
-      //                llvm::cast<clang::ConstantArrayType>(parent_qual_type.getTypePtr());
-      //                const uint64_t element_count =
-      //                array->getSize().getLimitedValue();
-      //
-      //                if (idx < element_count)
-      //                {
-      //                    std::pair<uint64_t, unsigned> field_type_info =
-      //                    ast->getTypeInfo(array->getElementType());
-      //
-      //                    char element_name[32];
-      //                    ::snprintf (element_name, sizeof (element_name),
-      //                    "%s[%u]", parent_name ? parent_name : "", idx);
-      //
-      //                    child_name.assign(element_name);
-      //                    assert(field_type_info.first % 8 == 0);
-      //                    child_byte_size = field_type_info.first / 8;
-      //                    child_byte_offset = idx * child_byte_size;
-      //                    return array->getElementType().getAsOpaquePtr();
-      //                }
-    } break;
-
-    //        case clang::Type::MemberPointerType:
-    //            {
-    //                MemberPointerType *mem_ptr_type =
-    //                llvm::cast<MemberPointerType>(qual_type.getTypePtr());
-    //                clang::QualType pointee_type =
-    //                mem_ptr_type->getPointeeType();
-    //
-    //                if (TypeSystemClang::IsAggregateType
-    //                (pointee_type.getAsOpaquePtr()))
-    //                {
-    //                    return GetIndexOfChildWithName (ast,
-    //                                                    mem_ptr_type->getPointeeType().getAsOpaquePtr(),
-    //                                                    name);
-    //                }
-    //            }
-    //            break;
-    //
     case clang::Type::LValueReference:
     case clang::Type::RValueReference: {
       const clang::ReferenceType *reference_type =
@@ -7118,23 +7038,6 @@ TypeSystemClang::GetIndexOfChildWithName(lldb::opaque_compiler_type_t type,
       if (pointee_type.IsAggregateType()) {
         return pointee_type.GetIndexOfChildWithName(name,
                                                     omit_empty_base_classes);
-      } else {
-        //                    if (parent_name)
-        //                    {
-        //                        child_name.assign(1, '*');
-        //                        child_name += parent_name;
-        //                    }
-        //
-        //                    // We have a pointer to an simple type
-        //                    if (idx == 0)
-        //                    {
-        //                        std::pair<uint64_t, unsigned> clang_type_info
-        //                        = ast->getTypeInfo(pointee_type);
-        //                        assert(clang_type_info.first % 8 == 0);
-        //                        child_byte_size = clang_type_info.first / 8;
-        //                        child_byte_offset = 0;
-        //                        return pointee_type.getAsOpaquePtr();
-        //                    }
       }
     } break;
 
diff --git a/lldb/source/Symbol/Block.cpp b/lldb/source/Symbol/Block.cpp
index 9d01293..3de3e5e 100644
--- a/lldb/source/Symbol/Block.cpp
+++ b/lldb/source/Symbol/Block.cpp
@@ -283,39 +283,43 @@ uint32_t Block::GetRangeIndexContainingAddress(const Address &addr) {
   return m_ranges.FindEntryIndexThatContains(file_addr - func_file_addr);
 }
 
+static AddressRange ToAddressRange(const Address &func_addr,
+                                   const Block::Range &block_range) {
+  assert(func_addr.GetModule());
+  return AddressRange(func_addr.GetFileAddress() + block_range.base,
+                      block_range.size,
+                      func_addr.GetModule()->GetSectionList());
+}
+
 bool Block::GetRangeAtIndex(uint32_t range_idx, AddressRange &range) {
   if (range_idx >= m_ranges.GetSize())
     return false;
 
-  Function &function = GetFunction();
-  const Range &vm_range = m_ranges.GetEntryRef(range_idx);
-  range.GetBaseAddress() = function.GetAddress();
-  range.GetBaseAddress().Slide(vm_range.GetRangeBase());
-  range.SetByteSize(vm_range.GetByteSize());
+  Address addr = GetFunction().GetAddress();
+  if (!addr.GetModule())
+    return false;
+
+  range = ToAddressRange(addr, m_ranges.GetEntryRef(range_idx));
   return true;
 }
 
 AddressRanges Block::GetRanges() {
+  Address addr = GetFunction().GetAddress();
+  if (!addr.GetModule())
+    return {};
+
   AddressRanges ranges;
-  Function &function = GetFunction();
-  for (size_t i = 0, e = m_ranges.GetSize(); i < e; ++i) {
-    ranges.emplace_back();
-    auto &range = ranges.back();
-    const Range &vm_range = m_ranges.GetEntryRef(i);
-    range.GetBaseAddress() = function.GetAddress();
-    range.GetBaseAddress().Slide(vm_range.GetRangeBase());
-    range.SetByteSize(vm_range.GetByteSize());
-  }
+  for (size_t i = 0, e = m_ranges.GetSize(); i < e; ++i)
+    ranges.push_back(ToAddressRange(addr, m_ranges.GetEntryRef(i)));
   return ranges;
 }
 
 bool Block::GetStartAddress(Address &addr) {
-  if (m_ranges.IsEmpty())
+  Address func_addr = GetFunction().GetAddress();
+  if (!func_addr.GetModule() || m_ranges.IsEmpty())
     return false;
 
-  Function &function = GetFunction();
-  addr = function.GetAddress();
-  addr.Slide(m_ranges.GetEntryRef(0).GetRangeBase());
+  addr = ToAddressRange(func_addr, m_ranges.GetEntryRef(0)).GetBaseAddress();
   return true;
 }
 
diff --git a/lldb/source/Symbol/DWARFCallFrameInfo.cpp b/lldb/source/Symbol/DWARFCallFrameInfo.cpp
index a763acb..cb8aa8a 100644
--- a/lldb/source/Symbol/DWARFCallFrameInfo.cpp
+++ b/lldb/source/Symbol/DWARFCallFrameInfo.cpp
@@ -151,53 +151,57 @@ DWARFCallFrameInfo::DWARFCallFrameInfo(ObjectFile &objfile,
                                        SectionSP &section_sp, Type type)
     : m_objfile(objfile), m_section_sp(section_sp), m_type(type) {}
 
-bool DWARFCallFrameInfo::GetUnwindPlan(const Address &addr,
-                                       UnwindPlan &unwind_plan) {
-  return GetUnwindPlan(AddressRange(addr, 1), unwind_plan);
+std::unique_ptr<UnwindPlan>
+DWARFCallFrameInfo::GetUnwindPlan(const Address &addr) {
+  return GetUnwindPlan({AddressRange(addr, 1)}, addr);
 }
 
-bool DWARFCallFrameInfo::GetUnwindPlan(const AddressRange &range,
-                                       UnwindPlan &unwind_plan) {
+std::unique_ptr<UnwindPlan>
+DWARFCallFrameInfo::GetUnwindPlan(llvm::ArrayRef<AddressRange> ranges,
+                                  const Address &addr) {
   FDEEntryMap::Entry fde_entry;
-  Address addr = range.GetBaseAddress();
 
   // Make sure that the Address we're searching for is the same object file as
   // this DWARFCallFrameInfo, we only store File offsets in m_fde_index.
   ModuleSP module_sp = addr.GetModule();
   if (module_sp.get() == nullptr || module_sp->GetObjectFile() == nullptr ||
       module_sp->GetObjectFile() != &m_objfile)
-    return false;
+    return nullptr;
 
-  std::optional<FDEEntryMap::Entry> entry = GetFirstFDEEntryInRange(range);
-  if (!entry)
-    return false;
+  std::vector<AddressRange> valid_ranges;
 
-  std::optional<FDE> fde = ParseFDE(entry->data, addr);
-  if (!fde)
-    return false;
-
-  unwind_plan.SetSourceName(m_type == EH ? "eh_frame CFI" : "DWARF CFI");
+  auto result = std::make_unique<UnwindPlan>(GetRegisterKind());
+  result->SetSourceName(m_type == EH ? "eh_frame CFI" : "DWARF CFI");
   // In theory the debug_frame info should be valid at all call sites
   // ("asynchronous unwind info" as it is sometimes called) but in practice
   // gcc et al all emit call frame info for the prologue and call sites, but
   // not for the epilogue or all the other locations during the function
   // reliably.
-  unwind_plan.SetUnwindPlanValidAtAllInstructions(eLazyBoolNo);
-  unwind_plan.SetSourcedFromCompiler(eLazyBoolYes);
-  unwind_plan.SetRegisterKind(GetRegisterKind());
-
-  unwind_plan.SetPlanValidAddressRanges({fde->range});
-  unwind_plan.SetUnwindPlanForSignalTrap(fde->for_signal_trap ? eLazyBoolYes
-                                                              : eLazyBoolNo);
-  unwind_plan.SetReturnAddressRegister(fde->return_addr_reg_num);
-  int64_t slide =
-      fde->range.GetBaseAddress().GetFileAddress() - addr.GetFileAddress();
-  for (UnwindPlan::Row &row : fde->rows) {
-    row.SlideOffset(slide);
-    unwind_plan.AppendRow(std::move(row));
+  result->SetUnwindPlanValidAtAllInstructions(eLazyBoolNo);
+  result->SetSourcedFromCompiler(eLazyBoolYes);
+  result->SetUnwindPlanForSignalTrap(eLazyBoolNo);
+  for (const AddressRange &range : ranges) {
+    std::optional<FDEEntryMap::Entry> entry = GetFirstFDEEntryInRange(range);
+    if (!entry)
+      continue;
+    std::optional<FDE> fde = ParseFDE(entry->data, addr);
+    if (!fde)
+      continue;
+    int64_t slide =
+        fde->range.GetBaseAddress().GetFileAddress() - addr.GetFileAddress();
+    valid_ranges.push_back(std::move(fde->range));
+    if (fde->for_signal_trap)
+      result->SetUnwindPlanForSignalTrap(eLazyBoolYes);
+    result->SetReturnAddressRegister(fde->return_addr_reg_num);
+    for (UnwindPlan::Row &row : fde->rows) {
+      row.SlideOffset(slide);
+      result->AppendRow(std::move(row));
+    }
   }
-
-  return true;
+  result->SetPlanValidAddressRanges(std::move(valid_ranges));
+  if (result->GetRowCount() == 0)
+    return nullptr;
+  return result;
 }
 
 bool DWARFCallFrameInfo::GetAddressRange(Address addr, AddressRange &range) {
diff --git a/lldb/source/Symbol/FuncUnwinders.cpp b/lldb/source/Symbol/FuncUnwinders.cpp
index 1160082..faec24c 100644
--- a/lldb/source/Symbol/FuncUnwinders.cpp
+++ b/lldb/source/Symbol/FuncUnwinders.cpp
@@ -149,13 +149,9 @@ FuncUnwinders::GetEHFrameUnwindPlan(Target &target) {
     return m_unwind_plan_eh_frame_sp;
 
   m_tried_unwind_plan_eh_frame = true;
-  if (m_range.GetBaseAddress().IsValid()) {
-    DWARFCallFrameInfo *eh_frame = m_unwind_table.GetEHFrameInfo();
-    if (eh_frame) {
-      auto plan_sp = std::make_shared<UnwindPlan>(lldb::eRegisterKindGeneric);
-      if (eh_frame->GetUnwindPlan(m_range, *plan_sp))
-        m_unwind_plan_eh_frame_sp = std::move(plan_sp);
-    }
+  if (m_addr.IsValid()) {
+    if (DWARFCallFrameInfo *eh_frame = m_unwind_table.GetEHFrameInfo())
+      m_unwind_plan_eh_frame_sp = eh_frame->GetUnwindPlan(m_ranges, m_addr);
   }
   return m_unwind_plan_eh_frame_sp;
 }
@@ -167,13 +163,10 @@ FuncUnwinders::GetDebugFrameUnwindPlan(Target &target) {
     return m_unwind_plan_debug_frame_sp;
 
   m_tried_unwind_plan_debug_frame = true;
-  if (m_range.GetBaseAddress().IsValid()) {
-    DWARFCallFrameInfo *debug_frame = m_unwind_table.GetDebugFrameInfo();
-    if (debug_frame) {
-      auto plan_sp = std::make_shared<UnwindPlan>(lldb::eRegisterKindGeneric);
-      if (debug_frame->GetUnwindPlan(m_range, *plan_sp))
-        m_unwind_plan_debug_frame_sp = std::move(plan_sp);
-    }
+  if (!m_ranges.empty()) {
+    if (DWARFCallFrameInfo *debug_frame = m_unwind_table.GetDebugFrameInfo())
+      m_unwind_plan_debug_frame_sp =
+          debug_frame->GetUnwindPlan(m_ranges, m_addr);
   }
   return m_unwind_plan_debug_frame_sp;
 }
diff --git a/lldb/source/Symbol/UnwindTable.cpp b/lldb/source/Symbol/UnwindTable.cpp
index 21ecd43..3aca495 100644
--- a/lldb/source/Symbol/UnwindTable.cpp
+++ b/lldb/source/Symbol/UnwindTable.cpp
@@ -122,6 +122,13 @@ AddressRanges UnwindTable::GetAddressRanges(const Address &addr,
   return {};
 }
 
+static Address GetFunctionOrSymbolAddress(const Address &addr,
+                                          const SymbolContext &sc) {
+  if (Address result = sc.GetFunctionOrSymbolAddress(); result.IsValid())
+    return result;
+  return addr;
+}
+
 FuncUnwindersSP
 UnwindTable::GetFuncUnwindersContainingAddress(const Address &addr,
                                                const SymbolContext &sc) {
@@ -131,25 +138,20 @@ UnwindTable::GetFuncUnwindersContainingAddress(const Address &addr,
 
   // There is an UnwindTable per object file, so we can safely use file handles
   addr_t file_addr = addr.GetFileAddress();
-  iterator end = m_unwinds.end();
-  iterator insert_pos = end;
-  if (!m_unwinds.empty()) {
-    insert_pos = m_unwinds.lower_bound(file_addr);
-    iterator pos = insert_pos;
-    if ((pos == m_unwinds.end()) ||
-        (pos != m_unwinds.begin() &&
-         pos->second->GetFunctionStartAddress() != addr))
-      --pos;
-
+  iterator insert_pos = m_unwinds.upper_bound(file_addr);
+  if (insert_pos != m_unwinds.begin()) {
+    auto pos = std::prev(insert_pos);
     if (pos->second->ContainsAddress(addr))
       return pos->second;
   }
 
+  Address start_addr = GetFunctionOrSymbolAddress(addr, sc);
   AddressRanges ranges = GetAddressRanges(addr, sc);
   if (ranges.empty())
     return nullptr;
 
-  auto func_unwinder_sp = std::make_shared<FuncUnwinders>(*this, addr, ranges);
+  auto func_unwinder_sp =
+      std::make_shared<FuncUnwinders>(*this, start_addr, ranges);
   for (const AddressRange &range : ranges)
     m_unwinds.emplace_hint(insert_pos, range.GetBaseAddress().GetFileAddress(),
                            func_unwinder_sp);
@@ -164,11 +166,12 @@ FuncUnwindersSP UnwindTable::GetUncachedFuncUnwindersContainingAddress(
     const Address &addr, const SymbolContext &sc) {
   Initialize();
 
+  Address start_addr = GetFunctionOrSymbolAddress(addr, sc);
   AddressRanges ranges = GetAddressRanges(addr, sc);
   if (ranges.empty())
     return nullptr;
 
-  return std::make_shared<FuncUnwinders>(*this, addr, std::move(ranges));
+  return std::make_shared<FuncUnwinders>(*this, start_addr, std::move(ranges));
 }
 
 void UnwindTable::Dump(Stream &s) {
diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp
index 3ed49e1..4c760b8 100644
--- a/lldb/source/Target/RegisterContextUnwind.cpp
+++ b/lldb/source/Target/RegisterContextUnwind.cpp
@@ -868,13 +868,11 @@ RegisterContextUnwind::GetFullUnwindPlanForFrame() {
 
     // Even with -fomit-frame-pointer, we can try eh_frame to get back on
     // track.
-    DWARFCallFrameInfo *eh_frame =
-        pc_module_sp->GetUnwindTable().GetEHFrameInfo();
-    if (eh_frame) {
-      auto unwind_plan_sp =
-          std::make_shared<UnwindPlan>(lldb::eRegisterKindGeneric);
-      if (eh_frame->GetUnwindPlan(m_current_pc, *unwind_plan_sp))
-        return unwind_plan_sp;
+    if (DWARFCallFrameInfo *eh_frame =
+            pc_module_sp->GetUnwindTable().GetEHFrameInfo()) {
+      if (std::unique_ptr<UnwindPlan> plan_up =
+              eh_frame->GetUnwindPlan(m_current_pc))
+        return plan_up;
     }
 
     ArmUnwindInfo *arm_exidx =
@@ -1345,9 +1343,9 @@ RegisterContextUnwind::SavedLocationForRegister(
         // value instead of the Return Address register.
         // If $pc is not available, fall back to the RA reg.
         UnwindPlan::Row::AbstractRegisterLocation scratch;
-        if (m_frame_type == eTrapHandlerFrame &&
-            active_row->GetRegisterInfo
-              (pc_regnum.GetAsKind (unwindplan_registerkind), scratch)) {
+        if (m_frame_type == eTrapHandlerFrame && active_row &&
+            active_row->GetRegisterInfo(
+                pc_regnum.GetAsKind(unwindplan_registerkind), scratch)) {
           UnwindLogMsg("Providing pc register instead of rewriting to "
                        "RA reg because this is a trap handler and there is "
                        "a location for the saved pc register value.");
@@ -1377,7 +1375,7 @@ RegisterContextUnwind::SavedLocationForRegister(
         }
       }
 
-      if (regnum.IsValid() &&
+      if (regnum.IsValid() && active_row &&
           active_row->GetRegisterInfo(regnum.GetAsKind(unwindplan_registerkind),
                                       unwindplan_regloc)) {
         have_unwindplan_regloc = true;
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index e90e748..7f61f86 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -2270,6 +2270,17 @@ size_t Target::ReadScalarIntegerFromMemory(const Address &addr, uint32_t byte_si
   return 0;
 }
 
+int64_t Target::ReadSignedIntegerFromMemory(const Address &addr,
+                                            size_t integer_byte_size,
+                                            int64_t fail_value, Status &error,
+                                            bool force_live_memory) {
+  Scalar scalar;
+  if (ReadScalarIntegerFromMemory(addr, integer_byte_size, false, scalar, error,
+                                  force_live_memory))
+    return scalar.SLongLong(fail_value);
+  return fail_value;
+}
+
 uint64_t Target::ReadUnsignedIntegerFromMemory(const Address &addr,
                                                size_t integer_byte_size,
                                                uint64_t fail_value, Status &error,
diff --git a/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py b/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py
index 634bd13..cd95a9f 100644
--- a/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py
+++ b/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py
@@ -279,3 +279,55 @@ class DynamicValueTestCase(TestBase):
             "frame var -d run-target --ptr-depth=1 --show-types a",
             substrs=["(B *) a", "m_b_value = 10"],
         )
+
+    @no_debug_info_test
+    @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24663")
+    @expectedFailureDarwin  # dynamic loader unloads modules
+    @expectedFailureAll(archs=["arm"]) # Minidump saving not implemented
+    def test_from_core_file(self):
+        """Test fetching C++ dynamic values from core files. Specifically, test
+        that we can determine the dynamic type of the value if the core file
+        does not contain the type vtable."""
+        self.build()
+        lldbutil.run_to_name_breakpoint(self, "take_A")
+
+        # Get the address of our object and its vtable
+        a = self.frame().FindVariable("a")
+        self.assertSuccess(a.GetError())
+        vtable = a.GetVTable()
+        self.assertSuccess(vtable.GetError())
+        a = a.GetValueAsAddress()
+        vtable = vtable.GetValueAsAddress()
+
+        # Create a core file which will only contain the memory region
+        # containing `a`. The object is on the stack, so this will automatically
+        # include the stack of the main thread.
+        core = self.getBuildArtifact("a.dmp")
+        options = lldb.SBSaveCoreOptions()
+        options.SetPluginName("minidump")
+        options.SetStyle(lldb.eSaveCoreCustomOnly)
+        options.SetOutputFile(lldb.SBFileSpec(core))
+        region = lldb.SBMemoryRegionInfo()
+        self.assertSuccess(self.process().GetMemoryRegionInfo(a, region))
+        self.assertSuccess(options.AddMemoryRegionToSave(region))
+
+        # Save the core file and load it.
+        self.assertSuccess(self.process().SaveCore(options))
+        self.process().Kill()
+        error = lldb.SBError()
+        self.target().LoadCore(core, error)
+        self.assertSuccess(error)
+
+        # Sanity check -- the process should be able to read the object but not
+        # its vtable..
+        self.process().ReadPointerFromMemory(a, error)
+        self.assertSuccess(error)
+        self.process().ReadPointerFromMemory(vtable, error)
+        self.assertTrue(error.Fail())
+
+        # .. but we should still be able to see the dynamic type by reading the
+        # vtable from the executable file.
+        self.expect(
+            "frame var -d run-target --ptr-depth=1 --show-types a",
+            substrs=["(B *) a", "m_b_value = 10"],
+        )
diff --git a/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py b/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py
index 455ac84..a94288c7 100644
--- a/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py
+++ b/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py
@@ -2,7 +2,6 @@
 Test lldb-dap completions request
 """
 
-
 import lldbdap_testcase
 import dap_server
 from lldbsuite.test import lldbutil
@@ -32,6 +31,7 @@ variable_var_completion = {
 variable_var1_completion = {"text": "var1", "label": "var1 -- int &"}
 variable_var2_completion = {"text": "var2", "label": "var2 -- int &"}
 
+
 # Older version of libcxx produce slightly different typename strings for
 # templates like vector.
 @skipIf(compiler="clang", compiler_version=["<", "16.0"])
@@ -43,16 +43,22 @@ class TestDAP_completions(lldbdap_testcase.DAPTestCaseBase):
         for not_expected_item in not_expected_list:
             self.assertNotIn(not_expected_item, actual_list)
 
-
     def setup_debugee(self, stopOnEntry=False):
         program = self.getBuildArtifact("a.out")
-        self.build_and_launch(program, stopOnEntry=stopOnEntry)
-
         source = "main.cpp"
-        breakpoint1_line = line_number(source, "// breakpoint 1")
-        breakpoint2_line = line_number(source, "// breakpoint 2")
-
-        self.set_source_breakpoints(source, [breakpoint1_line, breakpoint2_line])
+        self.build_and_launch(
+            program,
+            stopOnEntry=stopOnEntry,
+            sourceBreakpoints=[
+                (
+                    source,
+                    [
+                        line_number(source, "// breakpoint 1"),
+                        line_number(source, "// breakpoint 2"),
+                    ],
+                ),
+            ],
+        )
 
     def test_command_completions(self):
         """
diff --git a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
index 65a1bc0..8642e31 100644
--- a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
+++ b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
@@ -19,6 +19,7 @@ def get_subprocess(root_process, process_name):
 
     self.assertTrue(False, "No subprocess with name %s found" % process_name)
 
+
 class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
     def check_lldb_command(
         self, lldb_command, contains_string, assert_msg, command_escape_prefix="`"
@@ -52,7 +53,7 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
         character.
         """
         program = self.getBuildArtifact("a.out")
-        self.build_and_launch(program)
+        self.build_and_launch(program, stopOnEntry=True)
         source = "main.cpp"
         breakpoint1_line = line_number(source, "// breakpoint 1")
         lines = [breakpoint1_line]
@@ -81,7 +82,7 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
 
     def test_custom_escape_prefix(self):
         program = self.getBuildArtifact("a.out")
-        self.build_and_launch(program, commandEscapePrefix="::")
+        self.build_and_launch(program, stopOnEntry=True, commandEscapePrefix="::")
         source = "main.cpp"
         breakpoint1_line = line_number(source, "// breakpoint 1")
         breakpoint_ids = self.set_source_breakpoints(source, [breakpoint1_line])
@@ -96,7 +97,7 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
 
     def test_empty_escape_prefix(self):
         program = self.getBuildArtifact("a.out")
-        self.build_and_launch(program, commandEscapePrefix="")
+        self.build_and_launch(program, stopOnEntry=True, commandEscapePrefix="")
         source = "main.cpp"
         breakpoint1_line = line_number(source, "// breakpoint 1")
         breakpoint_ids = self.set_source_breakpoints(source, [breakpoint1_line])
@@ -113,7 +114,7 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
     def test_exit_status_message_sigterm(self):
         source = "main.cpp"
         program = self.getBuildArtifact("a.out")
-        self.build_and_launch(program, commandEscapePrefix="")
+        self.build_and_launch(program, stopOnEntry=True, commandEscapePrefix="")
         breakpoint1_line = line_number(source, "// breakpoint 1")
         breakpoint_ids = self.set_source_breakpoints(source, [breakpoint1_line])
         self.continue_to_breakpoints(breakpoint_ids)
diff --git a/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py b/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py
index e367c32..23500bd 100644
--- a/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py
+++ b/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py
@@ -16,7 +16,9 @@ class TestDAP_redirection_to_console(lldbdap_testcase.DAPTestCaseBase):
         """
         program = self.getBuildArtifact("a.out")
         self.build_and_launch(
-            program, lldbDAPEnv={"LLDB_DAP_TEST_STDOUT_STDERR_REDIRECTION": ""}
+            program,
+            stopOnEntry=True,
+            lldbDAPEnv={"LLDB_DAP_TEST_STDOUT_STDERR_REDIRECTION": ""},
         )
 
         source = "main.cpp"
diff --git a/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py b/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py
index 39d7373..ec7387d 100644
--- a/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py
+++ b/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py
@@ -2,7 +2,6 @@
 Test exception behavior in DAP with signal.
 """
 
-
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
 import lldbdap_testcase
@@ -17,7 +16,7 @@ class TestDAP_exception(lldbdap_testcase.DAPTestCaseBase):
         """
         program = self.getBuildArtifact("a.out")
         self.build_and_launch(program)
-        self.dap_server.request_continue()
+
         self.assertTrue(self.verify_stop_exception_info("signal SIGABRT"))
         exceptionInfo = self.get_exceptionInfo()
         self.assertEqual(exceptionInfo["breakMode"], "always")
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
index 604a4167..e8e9181 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
@@ -15,6 +15,7 @@ import re
 # Despite the test program printing correctly. See
 # https://github.com/llvm/llvm-project/issues/137599.
 
+
 class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
     @skipIfWindows
     def test_default(self):
@@ -357,6 +358,7 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
         terminateCommands = ["expr 4+2"]
         self.build_and_launch(
             program,
+            stopOnEntry=True,
             initCommands=initCommands,
             preRunCommands=preRunCommands,
             postRunCommands=postRunCommands,
@@ -530,6 +532,7 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
         terminateCommands = ["expr 4+2"]
         self.launch(
             program=program,
+            stopOnEntry=True,
             terminateCommands=terminateCommands,
             disconnectAutomatically=False,
         )
diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py
index ce262be..64cec70a 100644
--- a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py
+++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py
@@ -16,12 +16,14 @@ class TestDAP_sendEvent(lldbdap_testcase.DAPTestCaseBase):
         """
         program = self.getBuildArtifact("a.out")
         source = "main.c"
+        breakpoint_line = line_number(source, "// breakpoint")
         custom_event_body = {
             "key": 321,
             "arr": [True],
         }
         self.build_and_launch(
             program,
+            sourceBreakpoints=[(source, [breakpoint_line])],
             stopCommands=[
                 "lldb-dap send-event my-custom-event-no-body",
                 "lldb-dap send-event my-custom-event '{}'".format(
@@ -30,11 +32,6 @@ class TestDAP_sendEvent(lldbdap_testcase.DAPTestCaseBase):
             ],
         )
 
-        breakpoint_line = line_number(source, "// breakpoint")
-
-        self.set_source_breakpoints(source, [breakpoint_line])
-        self.continue_to_next_stop()
-
         custom_event = self.dap_server.wait_for_event(
             filter=["my-custom-event-no-body"]
         )
diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py b/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py
index 4e2a76c..edf4ada 100644
--- a/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py
+++ b/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py
@@ -61,7 +61,7 @@ class TestDAP_stackTrace(lldbdap_testcase.DAPTestCaseBase):
         Tests the 'stackTrace' packet and all its variants.
         """
         program = self.getBuildArtifact("a.out")
-        self.build_and_launch(program)
+        self.build_and_launch(program, stopOnEntry=True)
         source = "main.c"
         self.source_path = os.path.join(os.getcwd(), source)
         self.recurse_end = line_number(source, "recurse end")
diff --git a/lldb/test/API/tools/lldb-dap/stackTraceDisassemblyDisplay/TestDAP_stackTraceDisassemblyDisplay.py b/lldb/test/API/tools/lldb-dap/stackTraceDisassemblyDisplay/TestDAP_stackTraceDisassemblyDisplay.py
index 08c225b..963d711 100644
--- a/lldb/test/API/tools/lldb-dap/stackTraceDisassemblyDisplay/TestDAP_stackTraceDisassemblyDisplay.py
+++ b/lldb/test/API/tools/lldb-dap/stackTraceDisassemblyDisplay/TestDAP_stackTraceDisassemblyDisplay.py
@@ -37,7 +37,7 @@ class TestDAP_stackTraceMissingSourcePath(lldbdap_testcase.DAPTestCaseBase):
             breakpoint_line = line_number(other_source_file, "// Break here")
 
             program = self.getBuildArtifact("a.out")
-            self.build_and_launch(program, commandEscapePrefix="")
+            self.build_and_launch(program, stopOnEntry=True, commandEscapePrefix="")
 
             breakpoint_ids = self.set_source_breakpoints(
                 other_source_file, [breakpoint_line]
diff --git a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py
index fd452d9..e37cd36 100644
--- a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py
+++ b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py
@@ -2,7 +2,6 @@
 Test lldb-dap start-debugging reverse requests.
 """
 
-
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
 import lldbdap_testcase
@@ -16,7 +15,7 @@ class TestDAP_startDebugging(lldbdap_testcase.DAPTestCaseBase):
         """
         program = self.getBuildArtifact("a.out")
         source = "main.c"
-        self.build_and_launch(program)
+        self.build_and_launch(program, stopOnEntry=True)
 
         breakpoint_line = line_number(source, "// breakpoint")
 
diff --git a/lldb/test/API/tools/lldb-dap/variables/children/TestDAP_variables_children.py b/lldb/test/API/tools/lldb-dap/variables/children/TestDAP_variables_children.py
index a9371e5..eb09649 100644
--- a/lldb/test/API/tools/lldb-dap/variables/children/TestDAP_variables_children.py
+++ b/lldb/test/API/tools/lldb-dap/variables/children/TestDAP_variables_children.py
@@ -13,13 +13,13 @@ class TestDAP_variables_children(lldbdap_testcase.DAPTestCaseBase):
         program = self.getBuildArtifact("a.out")
         self.build_and_launch(
             program,
+            stopOnEntry=True,
             preRunCommands=[
                 "command script import '%s'" % self.getSourcePath("formatter.py")
             ],
         )
         source = "main.cpp"
         breakpoint1_line = line_number(source, "// break here")
-        lines = [breakpoint1_line]
 
         breakpoint_ids = self.set_source_breakpoints(
             source, [line_number(source, "// break here")]
@@ -47,7 +47,7 @@ class TestDAP_variables_children(lldbdap_testcase.DAPTestCaseBase):
         Test the stepping out of a function with return value show the children correctly
         """
         program = self.getBuildArtifact("a.out")
-        self.build_and_launch(program)
+        self.build_and_launch(program, stopOnEntry=True)
 
         function_name = "test_return_variable_with_children"
         breakpoint_ids = self.set_function_breakpoints([function_name])
diff --git a/lldb/test/Shell/Commands/command-disassemble-sections.s b/lldb/test/Shell/Commands/command-disassemble-sections.s
new file mode 100644
index 0000000..d278527
--- /dev/null
+++ b/lldb/test/Shell/Commands/command-disassemble-sections.s
@@ -0,0 +1,110 @@
+## Test disassembling of functions which are spread over multiple sections (ELF
+## segments are modelled as LLDB sections).
+
+
+# REQUIRES: x86, lld
+
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux %t/file.s -o %t/file.o
+# RUN: ld.lld %t/file.o -o %t/file.out -T %t/file.lds
+# RUN: %lldb %t/file.out -o "disassemble --name func1" -o exit | FileCheck %s
+
+# CHECK:      (lldb) disassemble --name func1
+# CHECK:      file.out`func1:
+# CHECK-NEXT: file.out[0x0] <+0>: int    $0x2a
+# CHECK:      file.out`func1:
+# CHECK-NEXT: file.out[0x1000] <+4096>: int    $0x2f
+
+
+#--- file.lds
+## Linker script placing the parts of the section into different segments
+## (typically one of these would be for the "hot" code).
+PHDRS {
+  text1 PT_LOAD;
+  text2 PT_LOAD;
+}
+SECTIONS {
+  . = 0;
+  .text.part1 : { *(.text.part1) } :text1
+  .text.part2 : { *(.text.part2) } :text2
+}
+
+#--- file.s
+## A very simple function consisting of two parts and DWARF describing the
+## function.
+        .section        .text.part1,"ax",@progbits
+        .p2align 12
+func1:
+        int $42
+.Lfunc1_end:
+
+        .section        .text.part2,"ax",@progbits
+        .p2align 12
+func1.__part.1:
+        int $47
+.Lfunc1.__part.1_end:
+
+
+
+        .section        .debug_abbrev,"",@progbits
+        .byte   1                               # Abbreviation Code
+        .byte   17                              # DW_TAG_compile_unit
+        .byte   1                               # DW_CHILDREN_yes
+        .byte   37                              # DW_AT_producer
+        .byte   8                               # DW_FORM_string
+        .byte   19                              # DW_AT_language
+        .byte   5                               # DW_FORM_data2
+        .byte   17                              # DW_AT_low_pc
+        .byte   1                               # DW_FORM_addr
+        .byte   85                              # DW_AT_ranges
+        .byte   23                              # DW_FORM_sec_offset
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   2                               # Abbreviation Code
+        .byte   46                              # DW_TAG_subprogram
+        .byte   0                               # DW_CHILDREN_no
+        .byte   85                              # DW_AT_ranges
+        .byte   23                              # DW_FORM_sec_offset
+        .byte   3                               # DW_AT_name
+        .byte   8                               # DW_FORM_string
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   0                               # EOM(3)
+
+        .section        .debug_info,"",@progbits
+.Lcu_begin0:
+        .long   .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+        .short  5                               # DWARF version number
+        .byte   1                               # DWARF Unit Type
+        .byte   8                               # Address Size (in bytes)
+        .long   .debug_abbrev                   # Offset Into Abbrev. Section
+        .byte   1                               # Abbrev DW_TAG_compile_unit
+        .asciz  "Hand-written DWARF"            # DW_AT_producer
+        .short  29                              # DW_AT_language
+        .quad   0                               # DW_AT_low_pc
+        .long   .Ldebug_ranges0                 # DW_AT_ranges
+        .byte   2                               # Abbrev DW_TAG_subprogram
+        .long   .Ldebug_ranges0                 # DW_AT_ranges
+        .asciz  "func1"                         # DW_AT_name
+        .byte   0                               # End Of Children Mark
+.Ldebug_info_end0:
+
+        .section        .debug_rnglists,"",@progbits
+        .long   .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length
+.Ldebug_list_header_start0:
+        .short  5                               # Version
+        .byte   8                               # Address size
+        .byte   0                               # Segment selector size
+        .long   1                               # Offset entry count
+.Lrnglists_table_base0:
+        .long   .Ldebug_ranges0-.Lrnglists_table_base0
+.Ldebug_ranges0:
+        .byte   6                               # DW_RLE_start_end
+        .quad func1
+        .quad .Lfunc1_end
+        .byte   6                               # DW_RLE_start_end
+        .quad func1.__part.1
+        .quad .Lfunc1.__part.1_end
+        .byte   0                               # DW_RLE_end_of_list
+.Ldebug_list_header_end0:
diff --git a/lldb/test/Shell/Unwind/Inputs/basic-block-sections-with-dwarf.s b/lldb/test/Shell/Unwind/Inputs/basic-block-sections-with-dwarf.s
index c405e51..ede04c88 100644
--- a/lldb/test/Shell/Unwind/Inputs/basic-block-sections-with-dwarf.s
+++ b/lldb/test/Shell/Unwind/Inputs/basic-block-sections-with-dwarf.s
@@ -4,7 +4,9 @@
 # int bar() { return foo(0); }
 # int foo(int flag) { return flag ? bar() : baz(); }
 # int main() { return foo(1); }
-# The function bar has been placed "in the middle" of foo.
+# The function bar has been placed "in the middle" of foo. The functions are not
+# using the frame pointer register and the are deliberately adjusting the stack
+# pointer to test that we're using the correct unwind row.
 
         .text
 
@@ -20,26 +22,29 @@ baz:
         .type   foo,@function
 foo:
         .cfi_startproc
-        pushq   %rbp
+        pushq   %rbx
         .cfi_def_cfa_offset 16
-        .cfi_offset %rbp, -16
-        movq    %rsp, %rbp
-        .cfi_def_cfa_register %rbp
-        subq    $16, %rsp
-        movl    %edi, -8(%rbp)
-        cmpl    $0, -8(%rbp)
+        .cfi_offset %rbx, -16
+        movl    %edi, %ebx
+        cmpl    $0, %ebx
         je      foo.__part.2
         jmp     foo.__part.1
         .cfi_endproc
 .Lfoo_end:
         .size   foo, .Lfoo_end-foo
 
+# NB: Deliberately inserting padding to separate the two parts of the function
+# as we're currently only parsing a single FDE entry from a (coalesced) address
+# range.
+        nop
+
 foo.__part.1:
         .cfi_startproc
-        .cfi_def_cfa %rbp, 16
-        .cfi_offset %rbp, -16
+        .cfi_def_cfa_offset 16
+        .cfi_offset %rbx, -16
+        subq    $16, %rsp
+        .cfi_def_cfa_offset 32
         callq   bar
-        movl    %eax, -4(%rbp)
         jmp     foo.__part.3
 .Lfoo.__part.1_end:
         .size   foo.__part.1, .Lfoo.__part.1_end-foo.__part.1
@@ -47,8 +52,6 @@ foo.__part.1:
 
 bar:
         .cfi_startproc
-# NB: Decrease the stack pointer to make the unwind info for this function
-# different from the surrounding foo function.
         subq    $24, %rsp
         .cfi_def_cfa_offset 32
         xorl    %edi, %edi
@@ -62,22 +65,26 @@ bar:
 
 foo.__part.2:
         .cfi_startproc
-        .cfi_def_cfa %rbp, 16
-        .cfi_offset %rbp, -16
+        .cfi_def_cfa_offset 16
+        .cfi_offset %rbx, -16
+        subq    $16, %rsp
+        .cfi_def_cfa_offset 32
         callq   baz
-        movl    %eax, -4(%rbp)
         jmp     foo.__part.3
 .Lfoo.__part.2_end:
         .size   foo.__part.2, .Lfoo.__part.2_end-foo.__part.2
         .cfi_endproc
 
+# NB: Deliberately inserting padding to separate the two parts of the function
+# as we're currently only parsing a single FDE entry from a (coalesced) address
+# range.
+        nop
+
 foo.__part.3:
         .cfi_startproc
-        .cfi_def_cfa %rbp, 16
-        .cfi_offset %rbp, -16
-        movl    -4(%rbp), %eax
-        addq    $16, %rsp
-        popq    %rbp
+        .cfi_def_cfa_offset 32
+        .cfi_offset %rbx, -16
+        addq    $24, %rsp
         .cfi_def_cfa %rsp, 8
         retq
 .Lfoo.__part.3_end:
@@ -186,9 +193,8 @@ main:
         .byte   86
         .asciz  "foo"                           # DW_AT_name
         .byte   4                               # Abbrev [4] DW_TAG_formal_parameter
-        .byte   2                               # DW_AT_location
-        .byte   145
-        .byte   120
+        .byte   1                               # DW_AT_location
+        .byte   0x53                            # DW_OP_reg3
         .asciz  "flag"                          # DW_AT_name
         .long   .Lint-.Lcu_begin0               # DW_AT_type
         .byte   0                               # End Of Children Mark
diff --git a/lldb/test/Shell/Unwind/basic-block-sections-with-dwarf-static.test b/lldb/test/Shell/Unwind/basic-block-sections-with-dwarf-static.test
index 9f94468..a4ed73e 100644
--- a/lldb/test/Shell/Unwind/basic-block-sections-with-dwarf-static.test
+++ b/lldb/test/Shell/Unwind/basic-block-sections-with-dwarf-static.test
@@ -17,23 +17,32 @@
 image show-unwind --cached true -n foo
 # CHECK: UNWIND PLANS for {{.*}}`foo
 #
-# CHECK:      Assembly language inspection UnwindPlan:
-# CHECK-NEXT: This UnwindPlan originally sourced from assembly insn profiling
-# CHECK-NEXT: This UnwindPlan is sourced from the compiler: no.
-# CHECK-NEXT: This UnwindPlan is valid at all instruction locations: yes.
+# CHECK:      eh_frame UnwindPlan:
+# CHECK-NEXT: This UnwindPlan originally sourced from eh_frame CFI
+# CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes.
+# CHECK-NEXT: This UnwindPlan is valid at all instruction locations: no.
 # CHECK-NEXT: This UnwindPlan is for a trap handler function: no.
-# TODO: This address range isn't correct right now. We're just checking that
-# it's a different range from the one in the next query.
-# CHECK-NEXT: Address range of this UnwindPlan: [{{.*}}.text + 6-0x0000000000000046)
+# CHECK-NEXT: Address range of this UnwindPlan: [{{.*}}.text + 6-0x0000000000000010)[{{.*}}.text + 17-0x000000000000001c)[{{.*}}.text + 44-0x0000000000000037)[{{.*}}.text + 56-0x000000000000003d)
+# CHECK-NEXT: row[0]:    0: CFA=rsp +8 => rip=[CFA-8]
+# CHECK-NEXT: row[1]:    1: CFA=rsp+16 => rbx=[CFA-16] rip=[CFA-8]
+# CHECK-NEXT: row[2]:   11: CFA=rsp+16 => rbx=[CFA-16] rip=[CFA-8]
+# CHECK-NEXT: row[3]:   15: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8]
+# CHECK-NEXT: row[4]:   38: CFA=rsp+16 => rbx=[CFA-16] rip=[CFA-8]
+# CHECK-NEXT: row[5]:   42: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8]
+# CHECK-NEXT: row[6]:   50: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8]
+# CHECK-NEXT: row[7]:   54: CFA=rsp +8 => rbx=[CFA-16] rip=[CFA-8]
+# CHECK-EMPTY:
 
 image show-unwind --cached true -n bar
 # CHECK: UNWIND PLANS for {{.*}}`bar
 
-# CHECK:      Assembly language inspection UnwindPlan:
-# CHECK-NEXT: This UnwindPlan originally sourced from assembly insn profiling
-# CHECK-NEXT: This UnwindPlan is sourced from the compiler: no.
-# CHECK-NEXT: This UnwindPlan is valid at all instruction locations: yes.
+# CHECK:      eh_frame UnwindPlan:
+# CHECK-NEXT: This UnwindPlan originally sourced from eh_frame CFI
+# CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes.
+# CHECK-NEXT: This UnwindPlan is valid at all instruction locations: no.
 # CHECK-NEXT: This UnwindPlan is for a trap handler function: no.
-# TODO: This address range isn't correct right now. We're just checking that
-# it's a different range from the one in the previous query.
-# CHECK-NEXT: Address range of this UnwindPlan: [{{.*}}.text + 35-0x0000000000000033)
+# CHECK-NEXT: Address range of this UnwindPlan: [{{.*}}.text + 28-0x000000000000002c)
+# CHECK-NEXT: row[0]:    0: CFA=rsp +8 => rip=[CFA-8]
+# CHECK-NEXT: row[1]:    4: CFA=rsp+32 => rip=[CFA-8]
+# CHECK-NEXT: row[2]:   15: CFA=rsp +8 => rip=[CFA-8]
+# CHECK-EMPTY:
diff --git a/lldb/test/Shell/Unwind/signal-in-leaf-function-aarch64.test b/lldb/test/Shell/Unwind/signal-in-leaf-function-aarch64.test
index 2ac2d4a..050c41c 100644
--- a/lldb/test/Shell/Unwind/signal-in-leaf-function-aarch64.test
+++ b/lldb/test/Shell/Unwind/signal-in-leaf-function-aarch64.test
@@ -1,6 +1,8 @@
 # REQUIRES: target-aarch64 && native
 # UNSUPPORTED: system-windows
 # llvm.org/pr91610, rdar://128031075
+
+# Darwin _sigtramp doesn't have eh_frame instruction on AArch64
 # XFAIL: system-darwin
 
 
diff --git a/lldb/tools/debugserver/source/DNBTimer.h b/lldb/tools/debugserver/source/DNBTimer.h
index ad15154..cc409cf 100644
--- a/lldb/tools/debugserver/source/DNBTimer.h
+++ b/lldb/tools/debugserver/source/DNBTimer.h
@@ -16,6 +16,7 @@
 #include "DNBDefs.h"
 #include <cstdint>
 #include <mutex>
+#include <optional>
 #include <sys/time.h>
 
 class DNBTimer {
diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index 4734c9d..f66badc 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -242,527 +242,527 @@
           }
         }
       }
-    ]
-  },
-  "breakpoints": [
-    {
-      "language": "ada"
-    },
-    {
-      "language": "arm"
-    },
-    {
-      "language": "asm"
-    },
-    {
-      "language": "c"
-    },
-    {
-      "language": "cpp"
-    },
-    {
-      "language": "crystal"
-    },
-    {
-      "language": "d"
-    },
-    {
-      "language": "fortan"
-    },
-    {
-      "language": "fortran-modern"
-    },
-    {
-      "language": "nim"
-    },
-    {
-      "language": "objective-c"
-    },
-    {
-      "language": "objectpascal"
-    },
-    {
-      "language": "pascal"
-    },
-    {
-      "language": "rust"
-    },
-    {
-      "language": "swift"
-    }
-  ],
-  "debuggers": [
-    {
-      "type": "lldb-dap",
-      "label": "LLDB DAP Debugger",
-      "configurationAttributes": {
-        "launch": {
-          "required": [
-            "program"
-          ],
-          "properties": {
-            "debugAdapterHostname": {
-              "type": "string",
-              "markdownDescription": "The hostname that an existing lldb-dap executable is listening on."
-            },
-            "debugAdapterPort": {
-              "type": "number",
-              "markdownDescription": "The port that an existing lldb-dap executable is listening on."
-            },
-            "debugAdapterExecutable": {
-              "type": "string",
-              "markdownDescription": "The absolute path to the LLDB debug adapter executable to use. Overrides any user or workspace settings."
-            },
-            "debugAdapterArgs": {
-              "type": "array",
-              "items": {
-                "type": "string"
+    ],
+    "breakpoints": [
+      {
+        "language": "ada"
+      },
+      {
+        "language": "arm"
+      },
+      {
+        "language": "asm"
+      },
+      {
+        "language": "c"
+      },
+      {
+        "language": "cpp"
+      },
+      {
+        "language": "crystal"
+      },
+      {
+        "language": "d"
+      },
+      {
+        "language": "fortan"
+      },
+      {
+        "language": "fortran-modern"
+      },
+      {
+        "language": "nim"
+      },
+      {
+        "language": "objective-c"
+      },
+      {
+        "language": "objectpascal"
+      },
+      {
+        "language": "pascal"
+      },
+      {
+        "language": "rust"
+      },
+      {
+        "language": "swift"
+      }
+    ],
+    "debuggers": [
+      {
+        "type": "lldb-dap",
+        "label": "LLDB DAP Debugger",
+        "configurationAttributes": {
+          "launch": {
+            "required": [
+              "program"
+            ],
+            "properties": {
+              "debugAdapterHostname": {
+                "type": "string",
+                "markdownDescription": "The hostname that an existing lldb-dap executable is listening on."
               },
-              "markdownDescription": "The list of additional arguments used to launch the debug adapter executable. Overrides any user or workspace settings."
-            },
-            "program": {
-              "type": "string",
-              "description": "Path to the program to debug."
-            },
-            "args": {
-              "type": [
-                "array"
-              ],
-              "items": {
-                "type": "string"
-              },
-              "description": "Program arguments.",
-              "default": []
-            },
-            "cwd": {
-              "type": "string",
-              "description": "Program working directory.",
-              "default": "${workspaceRoot}"
-            },
-            "env": {
-              "anyOf": [
-                {
-                  "type": "object",
-                  "description": "Additional environment variables to set when launching the program. E.g. `{ \"FOO\": \"1\" }`",
-                  "patternProperties": {
-                    ".*": {
-                      "type": "string"
-                    }
-                  },
-                  "default": {}
+              "debugAdapterPort": {
+                "type": "number",
+                "markdownDescription": "The port that an existing lldb-dap executable is listening on."
+              },
+              "debugAdapterExecutable": {
+                "type": "string",
+                "markdownDescription": "The absolute path to the LLDB debug adapter executable to use. Overrides any user or workspace settings."
+              },
+              "debugAdapterArgs": {
+                "type": "array",
+                "items": {
+                  "type": "string"
                 },
-                {
-                  "type": "array",
-                  "description": "Additional environment variables to set when launching the program. E.g. `[\"FOO=1\", \"BAR\"]`",
-                  "items": {
-                    "type": "string",
-                    "pattern": "^((\\w+=.*)|^\\w+)$"
-                  },
-                  "default": []
-                }
-              ]
-            },
-            "stopOnEntry": {
-              "type": "boolean",
-              "description": "Automatically stop after launch.",
-              "default": false
-            },
-            "disableASLR": {
-              "type": "boolean",
-              "description": "Enable or disable Address space layout randomization if the debugger supports it.",
-              "default": true
-            },
-            "disableSTDIO": {
-              "type": "boolean",
-              "description": "Don't retrieve STDIN, STDOUT and STDERR as the program is running.",
-              "default": false
-            },
-            "shellExpandArguments": {
-              "type": "boolean",
-              "description": "Expand program arguments as a shell would without actually launching the program in a shell.",
-              "default": false
-            },
-            "detachOnError": {
-              "type": "boolean",
-              "description": "Detach from the program.",
-              "default": false
-            },
-            "sourcePath": {
-              "type": "string",
-              "description": "Specify a source path to remap \"./\" to allow full paths to be used when setting breakpoints in binaries that have relative source paths."
-            },
-            "sourceMap": {
-              "anyOf": [
-                {
-                  "type": "object",
-                  "description": "Specify an object of path remappings; each entry has a key containing the source path and a value containing the destination path. E.g `{ \"/the/source/path\": \"/the/destination/path\" }`. Overrides sourcePath.",
-                  "patternProperties": {
-                    ".*": {
-                      "type": "string"
-                    }
-                  },
-                  "default": {}
+                "markdownDescription": "The list of additional arguments used to launch the debug adapter executable. Overrides any user or workspace settings."
+              },
+              "program": {
+                "type": "string",
+                "description": "Path to the program to debug."
+              },
+              "args": {
+                "type": [
+                  "array"
+                ],
+                "items": {
+                  "type": "string"
                 },
-                {
-                  "type": "array",
-                  "description": "Specify an array of path remappings; each element must itself be a two element array containing a source and destination path name. Overrides sourcePath.",
-                  "items": {
+                "description": "Program arguments.",
+                "default": []
+              },
+              "cwd": {
+                "type": "string",
+                "description": "Program working directory.",
+                "default": "${workspaceRoot}"
+              },
+              "env": {
+                "anyOf": [
+                  {
+                    "type": "object",
+                    "description": "Additional environment variables to set when launching the program. E.g. `{ \"FOO\": \"1\" }`",
+                    "patternProperties": {
+                      ".*": {
+                        "type": "string"
+                      }
+                    },
+                    "default": {}
+                  },
+                  {
                     "type": "array",
-                    "minItems": 2,
-                    "maxItems": 2,
+                    "description": "Additional environment variables to set when launching the program. E.g. `[\"FOO=1\", \"BAR\"]`",
                     "items": {
-                      "type": "string"
-                    }
+                      "type": "string",
+                      "pattern": "^((\\w+=.*)|^\\w+)$"
+                    },
+                    "default": []
+                  }
+                ]
+              },
+              "stopOnEntry": {
+                "type": "boolean",
+                "description": "Automatically stop after launch.",
+                "default": false
+              },
+              "disableASLR": {
+                "type": "boolean",
+                "description": "Enable or disable Address space layout randomization if the debugger supports it.",
+                "default": true
+              },
+              "disableSTDIO": {
+                "type": "boolean",
+                "description": "Don't retrieve STDIN, STDOUT and STDERR as the program is running.",
+                "default": false
+              },
+              "shellExpandArguments": {
+                "type": "boolean",
+                "description": "Expand program arguments as a shell would without actually launching the program in a shell.",
+                "default": false
+              },
+              "detachOnError": {
+                "type": "boolean",
+                "description": "Detach from the program.",
+                "default": false
+              },
+              "sourcePath": {
+                "type": "string",
+                "description": "Specify a source path to remap \"./\" to allow full paths to be used when setting breakpoints in binaries that have relative source paths."
+              },
+              "sourceMap": {
+                "anyOf": [
+                  {
+                    "type": "object",
+                    "description": "Specify an object of path remappings; each entry has a key containing the source path and a value containing the destination path. E.g `{ \"/the/source/path\": \"/the/destination/path\" }`. Overrides sourcePath.",
+                    "patternProperties": {
+                      ".*": {
+                        "type": "string"
+                      }
+                    },
+                    "default": {}
                   },
-                  "default": []
-                }
-              ]
-            },
-            "debuggerRoot": {
-              "type": "string",
-              "description": "Specify a working directory to set the debug adapter to so relative object files can be located."
-            },
-            "targetTriple": {
-              "type": "string",
-              "description": "Triplet of the target architecture to override value derived from the program file."
-            },
-            "platformName": {
-              "type": "string",
-              "description": "Name of the execution platform to override value derived from the program file."
-            },
-            "initCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+                  {
+                    "type": "array",
+                    "description": "Specify an array of path remappings; each element must itself be a two element array containing a source and destination path name. Overrides sourcePath.",
+                    "items": {
+                      "type": "array",
+                      "minItems": 2,
+                      "maxItems": 2,
+                      "items": {
+                        "type": "string"
+                      }
+                    },
+                    "default": []
+                  }
+                ]
               },
-              "description": "Initialization commands executed upon debugger startup.",
-              "default": []
-            },
-            "preRunCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "debuggerRoot": {
+                "type": "string",
+                "description": "Specify a working directory to set the debug adapter to so relative object files can be located."
               },
-              "description": "Commands executed just before the program is launched.",
-              "default": []
-            },
-            "postRunCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "targetTriple": {
+                "type": "string",
+                "description": "Triplet of the target architecture to override value derived from the program file."
               },
-              "description": "Commands executed just as soon as the program is successfully launched when it's in a stopped state prior to any automatic continuation.",
-              "default": []
-            },
-            "launchCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "platformName": {
+                "type": "string",
+                "description": "Name of the execution platform to override value derived from the program file."
               },
-              "description": "Custom commands that are executed instead of launching a process. A target will be created with the launch arguments prior to executing these commands. The commands may optionally create a new target and must perform a launch. A valid process must exist after these commands complete or the \"launch\" will fail. Launch the process with \"process launch -s\" to make the process to at the entry point since lldb-dap will auto resume if necessary.",
-              "default": []
-            },
-            "stopCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "initCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Initialization commands executed upon debugger startup.",
+                "default": []
               },
-              "description": "Commands executed each time the program stops.",
-              "default": []
-            },
-            "exitCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "preRunCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed just before the program is launched.",
+                "default": []
               },
-              "description": "Commands executed when the program exits.",
-              "default": []
-            },
-            "terminateCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "postRunCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed just as soon as the program is successfully launched when it's in a stopped state prior to any automatic continuation.",
+                "default": []
               },
-              "description": "Commands executed when the debugging session ends.",
-              "default": []
-            },
-            "runInTerminal": {
-              "type": "boolean",
-              "description": "Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs",
-              "default": false
-            },
-            "timeout": {
-              "type": "number",
-              "description": "The time in seconds to wait for a program to stop at entry point when launching with \"launchCommands\". Defaults to 30 seconds."
-            },
-            "enableAutoVariableSummaries": {
-              "type": "boolean",
-              "description": "Enable auto generated summaries for variables when no summaries exist for a given type. This feature can cause performance delays in large projects when viewing variables.",
-              "default": false
-            },
-            "displayExtendedBacktrace": {
-              "type": "boolean",
-              "description": "Enable language specific extended backtraces.",
-              "default": false
-            },
-            "enableSyntheticChildDebugging": {
-              "type": "boolean",
-              "description": "If a variable is displayed using a synthetic children, also display the actual contents of the variable at the end under a [raw] entry. This is useful when creating sythetic child plug-ins as it lets you see the actual contents of the variable.",
-              "default": false
-            },
-            "commandEscapePrefix": {
-              "type": "string",
-              "description": "The escape prefix to use for executing regular LLDB commands in the Debug Console, instead of printing variables. Defaults to a back-tick (`). If it's an empty string, then all expression in the Debug Console are treated as regular LLDB commands.",
-              "default": "`"
-            },
-            "customFrameFormat": {
-              "type": "string",
-              "description": "If non-empty, stack frames will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for frames. If the format string contains errors, an error message will be displayed on the Debug Console and the default frame names will be used. This might come with a performance cost because debug information might need to be processed to generate the description.",
-              "default": ""
-            },
-            "customThreadFormat": {
-              "type": "string",
-              "description": "If non-empty, threads will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for threads. If the format string contains errors, an error message will be displayed on the Debug Console and the default thread names will be used. This might come with a performance cost because debug information might need to be processed to generate the description.",
-              "default": ""
+              "launchCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Custom commands that are executed instead of launching a process. A target will be created with the launch arguments prior to executing these commands. The commands may optionally create a new target and must perform a launch. A valid process must exist after these commands complete or the \"launch\" will fail. Launch the process with \"process launch -s\" to make the process to at the entry point since lldb-dap will auto resume if necessary.",
+                "default": []
+              },
+              "stopCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed each time the program stops.",
+                "default": []
+              },
+              "exitCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed when the program exits.",
+                "default": []
+              },
+              "terminateCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed when the debugging session ends.",
+                "default": []
+              },
+              "runInTerminal": {
+                "type": "boolean",
+                "description": "Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs",
+                "default": false
+              },
+              "timeout": {
+                "type": "number",
+                "description": "The time in seconds to wait for a program to stop at entry point when launching with \"launchCommands\". Defaults to 30 seconds."
+              },
+              "enableAutoVariableSummaries": {
+                "type": "boolean",
+                "description": "Enable auto generated summaries for variables when no summaries exist for a given type. This feature can cause performance delays in large projects when viewing variables.",
+                "default": false
+              },
+              "displayExtendedBacktrace": {
+                "type": "boolean",
+                "description": "Enable language specific extended backtraces.",
+                "default": false
+              },
+              "enableSyntheticChildDebugging": {
+                "type": "boolean",
+                "description": "If a variable is displayed using a synthetic children, also display the actual contents of the variable at the end under a [raw] entry. This is useful when creating sythetic child plug-ins as it lets you see the actual contents of the variable.",
+                "default": false
+              },
+              "commandEscapePrefix": {
+                "type": "string",
+                "description": "The escape prefix to use for executing regular LLDB commands in the Debug Console, instead of printing variables. Defaults to a back-tick (`). If it's an empty string, then all expression in the Debug Console are treated as regular LLDB commands.",
+                "default": "`"
+              },
+              "customFrameFormat": {
+                "type": "string",
+                "description": "If non-empty, stack frames will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for frames. If the format string contains errors, an error message will be displayed on the Debug Console and the default frame names will be used. This might come with a performance cost because debug information might need to be processed to generate the description.",
+                "default": ""
+              },
+              "customThreadFormat": {
+                "type": "string",
+                "description": "If non-empty, threads will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for threads. If the format string contains errors, an error message will be displayed on the Debug Console and the default thread names will be used. This might come with a performance cost because debug information might need to be processed to generate the description.",
+                "default": ""
+              }
             }
-          }
-        },
-        "attach": {
-          "properties": {
-            "debugAdapterHostname": {
-              "type": "string",
-              "markdownDescription": "The hostname that an existing lldb-dap executable is listening on."
-            },
-            "debugAdapterPort": {
-              "type": "number",
-              "markdownDescription": "The port that an existing lldb-dap executable is listening on."
-            },
-            "debugAdapterExecutable": {
-              "type": "string",
-              "markdownDescription": "The absolute path to the LLDB debug adapter executable to use. Overrides any user or workspace settings."
-            },
-            "debugAdapterArgs": {
-              "type": "array",
-              "items": {
-                "type": "string"
+          },
+          "attach": {
+            "properties": {
+              "debugAdapterHostname": {
+                "type": "string",
+                "markdownDescription": "The hostname that an existing lldb-dap executable is listening on."
               },
-              "markdownDescription": "The list of additional arguments used to launch the debug adapter executable. Overrides any user or workspace settings."
-            },
-            "program": {
-              "type": "string",
-              "description": "Path to the program to attach to."
-            },
-            "pid": {
-              "type": [
-                "number",
-                "string"
-              ],
-              "description": "System process ID to attach to."
-            },
-            "waitFor": {
-              "type": "boolean",
-              "description": "If set to true, then wait for the process to launch by looking for a process with a basename that matches `program`. No process ID needs to be specified when using this flag.",
-              "default": true
-            },
-            "sourcePath": {
-              "type": "string",
-              "description": "Specify a source path to remap \"./\" to allow full paths to be used when setting breakpoints in binaries that have relative source paths."
-            },
-            "sourceMap": {
-              "anyOf": [
-                {
-                  "type": "object",
-                  "description": "Specify an object of path remappings; each entry has a key containing the source path and a value containing the destination path. E.g `{ \"/the/source/path\": \"/the/destination/path\" }`. Overrides sourcePath.",
-                  "patternProperties": {
-                    ".*": {
-                      "type": "string"
-                    }
-                  },
-                  "default": {}
+              "debugAdapterPort": {
+                "type": "number",
+                "markdownDescription": "The port that an existing lldb-dap executable is listening on."
+              },
+              "debugAdapterExecutable": {
+                "type": "string",
+                "markdownDescription": "The absolute path to the LLDB debug adapter executable to use. Overrides any user or workspace settings."
+              },
+              "debugAdapterArgs": {
+                "type": "array",
+                "items": {
+                  "type": "string"
                 },
-                {
-                  "type": "array",
-                  "description": "Specify an array of path remappings; each element must itself be a two element array containing a source and destination path name. Overrides sourcePath.",
-                  "items": {
+                "markdownDescription": "The list of additional arguments used to launch the debug adapter executable. Overrides any user or workspace settings."
+              },
+              "program": {
+                "type": "string",
+                "description": "Path to the program to attach to."
+              },
+              "pid": {
+                "type": [
+                  "number",
+                  "string"
+                ],
+                "description": "System process ID to attach to."
+              },
+              "waitFor": {
+                "type": "boolean",
+                "description": "If set to true, then wait for the process to launch by looking for a process with a basename that matches `program`. No process ID needs to be specified when using this flag.",
+                "default": true
+              },
+              "sourcePath": {
+                "type": "string",
+                "description": "Specify a source path to remap \"./\" to allow full paths to be used when setting breakpoints in binaries that have relative source paths."
+              },
+              "sourceMap": {
+                "anyOf": [
+                  {
+                    "type": "object",
+                    "description": "Specify an object of path remappings; each entry has a key containing the source path and a value containing the destination path. E.g `{ \"/the/source/path\": \"/the/destination/path\" }`. Overrides sourcePath.",
+                    "patternProperties": {
+                      ".*": {
+                        "type": "string"
+                      }
+                    },
+                    "default": {}
+                  },
+                  {
                     "type": "array",
-                    "minItems": 2,
-                    "maxItems": 2,
+                    "description": "Specify an array of path remappings; each element must itself be a two element array containing a source and destination path name. Overrides sourcePath.",
                     "items": {
-                      "type": "string"
-                    }
-                  },
-                  "default": []
-                }
-              ]
-            },
-            "debuggerRoot": {
-              "type": "string",
-              "description": "Specify a working directory to set the debug adapter to so relative object files can be located."
-            },
-            "targetTriple": {
-              "type": "string",
-              "description": "Triplet of the target architecture to override value derived from the program file."
-            },
-            "platformName": {
-              "type": "string",
-              "description": "Name of the execution platform to override value derived from the program file."
-            },
-            "attachCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+                      "type": "array",
+                      "minItems": 2,
+                      "maxItems": 2,
+                      "items": {
+                        "type": "string"
+                      }
+                    },
+                    "default": []
+                  }
+                ]
               },
-              "description": "Custom commands that are executed instead of attaching to a process ID or to a process by name. These commands may optionally create a new target and must perform an attach. A valid process must exist after these commands complete or the \"attach\" will fail.",
-              "default": []
-            },
-            "initCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "debuggerRoot": {
+                "type": "string",
+                "description": "Specify a working directory to set the debug adapter to so relative object files can be located."
               },
-              "description": "Initialization commands executed upon debugger startup.",
-              "default": []
-            },
-            "preRunCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "targetTriple": {
+                "type": "string",
+                "description": "Triplet of the target architecture to override value derived from the program file."
               },
-              "description": "Commands executed just before the program is attached to.",
-              "default": []
-            },
-            "postRunCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "platformName": {
+                "type": "string",
+                "description": "Name of the execution platform to override value derived from the program file."
               },
-              "description": "Commands executed just as soon as the program is successfully attached when it's in a stopped state prior to any automatic continuation.",
-              "default": []
-            },
-            "stopCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "attachCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Custom commands that are executed instead of attaching to a process ID or to a process by name. These commands may optionally create a new target and must perform an attach. A valid process must exist after these commands complete or the \"attach\" will fail.",
+                "default": []
               },
-              "description": "Commands executed each time the program stops.",
-              "default": []
-            },
-            "exitCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "initCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Initialization commands executed upon debugger startup.",
+                "default": []
               },
-              "description": "Commands executed when the program exits.",
-              "default": []
-            },
-            "terminateCommands": {
-              "type": "array",
-              "items": {
-                "type": "string"
+              "preRunCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed just before the program is attached to.",
+                "default": []
               },
-              "description": "Commands executed when the debugging session ends.",
-              "default": []
-            },
-            "coreFile": {
-              "type": "string",
-              "description": "Path to the core file to debug."
-            },
-            "timeout": {
-              "type": "number",
-              "description": "The time in seconds to wait for a program to stop when attaching using \"attachCommands\". Defaults to 30 seconds."
-            },
-            "gdb-remote-port": {
-              "type": [
-                "number",
-                "string"
-              ],
-              "description": "TCP/IP port to attach to a remote system. Specifying both pid and port is an error."
-            },
-            "gdb-remote-hostname": {
-              "type": "string",
-              "description": "The hostname to connect to a remote system. The default hostname being used localhost."
-            },
-            "enableAutoVariableSummaries": {
-              "type": "boolean",
-              "description": "Enable auto generated summaries for variables when no summaries exist for a given type. This feature can cause performance delays in large projects when viewing variables.",
-              "default": false
-            },
-            "displayExtendedBacktrace": {
-              "type": "boolean",
-              "description": "Enable language specific extended backtraces.",
-              "default": false
-            },
-            "enableSyntheticChildDebugging": {
-              "type": "boolean",
-              "description": "If a variable is displayed using a synthetic children, also display the actual contents of the variable at the end under a [raw] entry. This is useful when creating sythetic child plug-ins as it lets you see the actual contents of the variable.",
-              "default": false
-            },
-            "commandEscapePrefix": {
-              "type": "string",
-              "description": "The escape prefix character to use for executing regular LLDB commands in the Debug Console, instead of printing variables. Defaults to a back-tick (`). If empty, then all expression in the Debug Console are treated as regular LLDB commands.",
-              "default": "`"
-            },
-            "customFrameFormat": {
-              "type": "string",
-              "description": "If non-empty, stack frames will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for frames. If the format string contains errors, an error message will be displayed on the Debug Console and the default frame names will be used. This might come with a performance cost because debug information might need to be processed to generate the description.",
-              "default": ""
-            },
-            "customThreadFormat": {
-              "type": "string",
-              "description": "If non-empty, threads will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for threads. If the format string contains errors, an error message will be displayed on the Debug Console and the default thread names will be used. This might come with a performance cost because debug information might need to be processed to generate the description.",
-              "default": ""
+              "postRunCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed just as soon as the program is successfully attached when it's in a stopped state prior to any automatic continuation.",
+                "default": []
+              },
+              "stopCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed each time the program stops.",
+                "default": []
+              },
+              "exitCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed when the program exits.",
+                "default": []
+              },
+              "terminateCommands": {
+                "type": "array",
+                "items": {
+                  "type": "string"
+                },
+                "description": "Commands executed when the debugging session ends.",
+                "default": []
+              },
+              "coreFile": {
+                "type": "string",
+                "description": "Path to the core file to debug."
+              },
+              "timeout": {
+                "type": "number",
+                "description": "The time in seconds to wait for a program to stop when attaching using \"attachCommands\". Defaults to 30 seconds."
+              },
+              "gdb-remote-port": {
+                "type": [
+                  "number",
+                  "string"
+                ],
+                "description": "TCP/IP port to attach to a remote system. Specifying both pid and port is an error."
+              },
+              "gdb-remote-hostname": {
+                "type": "string",
+                "description": "The hostname to connect to a remote system. The default hostname being used localhost."
+              },
+              "enableAutoVariableSummaries": {
+                "type": "boolean",
+                "description": "Enable auto generated summaries for variables when no summaries exist for a given type. This feature can cause performance delays in large projects when viewing variables.",
+                "default": false
+              },
+              "displayExtendedBacktrace": {
+                "type": "boolean",
+                "description": "Enable language specific extended backtraces.",
+                "default": false
+              },
+              "enableSyntheticChildDebugging": {
+                "type": "boolean",
+                "description": "If a variable is displayed using a synthetic children, also display the actual contents of the variable at the end under a [raw] entry. This is useful when creating sythetic child plug-ins as it lets you see the actual contents of the variable.",
+                "default": false
+              },
+              "commandEscapePrefix": {
+                "type": "string",
+                "description": "The escape prefix character to use for executing regular LLDB commands in the Debug Console, instead of printing variables. Defaults to a back-tick (`). If empty, then all expression in the Debug Console are treated as regular LLDB commands.",
+                "default": "`"
+              },
+              "customFrameFormat": {
+                "type": "string",
+                "description": "If non-empty, stack frames will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for frames. If the format string contains errors, an error message will be displayed on the Debug Console and the default frame names will be used. This might come with a performance cost because debug information might need to be processed to generate the description.",
+                "default": ""
+              },
+              "customThreadFormat": {
+                "type": "string",
+                "description": "If non-empty, threads will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for threads. If the format string contains errors, an error message will be displayed on the Debug Console and the default thread names will be used. This might come with a performance cost because debug information might need to be processed to generate the description.",
+                "default": ""
+              }
             }
           }
-        }
-      },
-      "initialConfigurations": [
-        {
-          "type": "lldb-dap",
-          "request": "launch",
-          "name": "Debug",
-          "program": "${workspaceRoot}/<your program>",
-          "args": [],
-          "env": [],
-          "cwd": "${workspaceRoot}"
-        }
-      ],
-      "configurationSnippets": [
-        {
-          "label": "LLDB: Launch",
-          "description": "",
-          "body": {
+        },
+        "initialConfigurations": [
+          {
             "type": "lldb-dap",
             "request": "launch",
-            "name": "${2:Launch}",
-            "program": "^\"\\${workspaceRoot}/${1:<your program>}\"",
+            "name": "Debug",
+            "program": "${workspaceRoot}/<your program>",
             "args": [],
             "env": [],
-            "cwd": "^\"\\${workspaceRoot}\""
-          }
-        },
-        {
-          "label": "LLDB: Attach",
-          "description": "",
-          "body": {
-            "type": "lldb-dap",
-            "request": "attach",
-            "name": "${2:Attach}",
-            "program": "${1:<your program>}",
-            "waitFor": true
+            "cwd": "${workspaceRoot}"
           }
-        },
-        {
-          "label": "LLDB: Load Coredump",
-          "description": "",
-          "body": {
-            "type": "lldb-dap",
-            "request": "attach",
-            "name": "${2:Core}",
-            "program": "${1:<your program>}",
-            "coreFile": "${1:<your program>}.core"
+        ],
+        "configurationSnippets": [
+          {
+            "label": "LLDB: Launch",
+            "description": "",
+            "body": {
+              "type": "lldb-dap",
+              "request": "launch",
+              "name": "${2:Launch}",
+              "program": "^\"\\${workspaceRoot}/${1:<your program>}\"",
+              "args": [],
+              "env": [],
+              "cwd": "^\"\\${workspaceRoot}\""
+            }
+          },
+          {
+            "label": "LLDB: Attach",
+            "description": "",
+            "body": {
+              "type": "lldb-dap",
+              "request": "attach",
+              "name": "${2:Attach}",
+              "program": "${1:<your program>}",
+              "waitFor": true
+            }
+          },
+          {
+            "label": "LLDB: Load Coredump",
+            "description": "",
+            "body": {
+              "type": "lldb-dap",
+              "request": "attach",
+              "name": "${2:Core}",
+              "program": "${1:<your program>}",
+              "coreFile": "${1:<your program>}.core"
+            }
           }
-        }
-      ]
-    }
-  ]
-}
-\ No newline at end of file
+        ]
+      }
+    ]
+  }
+}
diff --git a/lldb/tools/lldb-dap/src-ts/debug-configuration-provider.ts b/lldb/tools/lldb-dap/src-ts/debug-configuration-provider.ts
index 8a40890..c91b101 100644
--- a/lldb/tools/lldb-dap/src-ts/debug-configuration-provider.ts
+++ b/lldb/tools/lldb-dap/src-ts/debug-configuration-provider.ts
@@ -21,79 +21,97 @@ async function isServerModeSupported(exe: string): Promise<boolean> {
 }
 
 interface BoolConfig {
-  type: 'boolean';
+  type: "boolean";
   default: boolean;
 }
 interface StringConfig {
-  type: 'string';
+  type: "string";
   default: string;
 }
 interface NumberConfig {
-  type: 'number';
+  type: "number";
   default: number;
 }
 interface StringArrayConfig {
-  type: 'stringArray';
+  type: "stringArray";
   default: string[];
 }
-type DefaultConfig = BoolConfig | NumberConfig | StringConfig | StringArrayConfig;
+type DefaultConfig =
+  | BoolConfig
+  | NumberConfig
+  | StringConfig
+  | StringArrayConfig;
 
 const configurations: Record<string, DefaultConfig> = {
   // Keys for debugger configurations.
-  "commandEscapePrefix": { type: "string", default: "`" },
-  "customFrameFormat": { type: "string", default: "" },
-  "customThreadFormat": { type: "string", default: "" },
-  "detachOnError": { type: "boolean", default: false },
-  "disableASLR": { type: "boolean", default: true },
-  "disableSTDIO": { type: "boolean", default: false },
-  "displayExtendedBacktrace": { type: "boolean", default: false },
-  "enableAutoVariableSummaries": { type: "boolean", default: false },
-  "enableSyntheticChildDebugging": { type: "boolean", default: false },
-  "timeout": { type: "number", default: 30 },
+  commandEscapePrefix: { type: "string", default: "`" },
+  customFrameFormat: { type: "string", default: "" },
+  customThreadFormat: { type: "string", default: "" },
+  detachOnError: { type: "boolean", default: false },
+  disableASLR: { type: "boolean", default: true },
+  disableSTDIO: { type: "boolean", default: false },
+  displayExtendedBacktrace: { type: "boolean", default: false },
+  enableAutoVariableSummaries: { type: "boolean", default: false },
+  enableSyntheticChildDebugging: { type: "boolean", default: false },
+  timeout: { type: "number", default: 30 },
 
   // Keys for platform / target configuration.
-  "platformName": { type: "string", default: "" },
-  "targetTriple": { type: "string", default: "" },
+  platformName: { type: "string", default: "" },
+  targetTriple: { type: "string", default: "" },
 
   // Keys for debugger command hooks.
-  "initCommands": { type: "stringArray", default: [] },
-  "preRunCommands": { type: "stringArray", default: [] },
-  "postRunCommands": { type: "stringArray", default: [] },
-  "stopCommands": { type: "stringArray", default: [] },
-  "exitCommands": { type: "stringArray", default: [] },
-  "terminateCommands": { type: "stringArray", default: [] },
+  initCommands: { type: "stringArray", default: [] },
+  preRunCommands: { type: "stringArray", default: [] },
+  postRunCommands: { type: "stringArray", default: [] },
+  stopCommands: { type: "stringArray", default: [] },
+  exitCommands: { type: "stringArray", default: [] },
+  terminateCommands: { type: "stringArray", default: [] },
 };
 
 export class LLDBDapConfigurationProvider
-  implements vscode.DebugConfigurationProvider {
-  constructor(private readonly server: LLDBDapServer) { }
+  implements vscode.DebugConfigurationProvider
+{
+  constructor(private readonly server: LLDBDapServer) {}
 
   async resolveDebugConfiguration(
     folder: vscode.WorkspaceFolder | undefined,
     debugConfiguration: vscode.DebugConfiguration,
-    token?: vscode.CancellationToken): Promise<vscode.DebugConfiguration> {
-    let config = vscode.workspace.getConfiguration('lldb-dap.defaults');
+    token?: vscode.CancellationToken,
+  ): Promise<vscode.DebugConfiguration> {
+    let config = vscode.workspace.getConfiguration("lldb-dap.defaults");
     for (const [key, cfg] of Object.entries(configurations)) {
-      if (Reflect.has(debugConfiguration, key)) continue;
+      if (Reflect.has(debugConfiguration, key)) {
+        continue;
+      }
       const value = config.get(key);
-      if (value === cfg.default) continue;
+      if (!value || value === cfg.default) {
+        continue;
+      }
       switch (cfg.type) {
-        case 'string':
-          if (typeof value !== 'string')
+        case "string":
+          if (typeof value !== "string") {
             throw new Error(`Expected ${key} to be a string, got ${value}`);
+          }
           break;
-        case 'number':
-          if (typeof value !== 'number')
+        case "number":
+          if (typeof value !== "number") {
             throw new Error(`Expected ${key} to be a number, got ${value}`);
+          }
           break;
-        case 'boolean':
-          if (typeof value !== 'boolean')
+        case "boolean":
+          if (typeof value !== "boolean") {
             throw new Error(`Expected ${key} to be a boolean, got ${value}`);
+          }
           break;
-        case 'stringArray':
-          if (typeof value !== 'object' && Array.isArray(value))
-            throw new Error(`Expected ${key} to be a array of strings, got ${value}`);
-          if ((value as string[]).length === 0) continue;
+        case "stringArray":
+          if (typeof value !== "object" && Array.isArray(value)) {
+            throw new Error(
+              `Expected ${key} to be a array of strings, got ${value}`,
+            );
+          }
+          if ((value as string[]).length === 0) {
+            continue;
+          }
           break;
       }
 
diff --git a/lldb/tools/lldb-dap/src-ts/uri-launch-handler.ts b/lldb/tools/lldb-dap/src-ts/uri-launch-handler.ts
index 0c3b1e9..d45c182 100644
--- a/lldb/tools/lldb-dap/src-ts/uri-launch-handler.ts
+++ b/lldb/tools/lldb-dap/src-ts/uri-launch-handler.ts
@@ -1,78 +1,102 @@
 import * as vscode from "vscode";
 
 export class LaunchUriHandler implements vscode.UriHandler {
-    async handleUri(uri: vscode.Uri) {
-        try {
-            const params = new URLSearchParams(uri.query);
-            if (uri.path == '/start') {
-                // Some properties have default values
-                let debugConfig: vscode.DebugConfiguration = {
-                    type: 'lldb-dap',
-                    request: 'launch',
-                    name: '',
-                };
-                // The `config` parameter allows providing a complete JSON-encoded configuration
-                const configJson = params.get("config");
-                if (configJson !== null) {
-                    Object.assign(debugConfig, JSON.parse(configJson));
-                }
-                // Furthermore, some frequently used parameters can also be provided as separate parameters
-                const stringKeys = ["name", "request", "program", "cwd", "debuggerRoot"];
-                const numberKeys = ["pid"];
-                const arrayKeys = [
-                    "args", "initCommands", "preRunCommands", "stopCommands", "exitCommands",
-                    "terminateCommands", "launchCommands", "attachCommands"
-                ];
-                for (const key of stringKeys) {
-                    const value = params.get(key);
-                    if (value) {
-                        debugConfig[key] = value;
-                    }
-                }
-                for (const key of numberKeys) {
-                    const value = params.get(key);
-                    if (value) {
-                        debugConfig[key] = Number(value);
-                    }
-                }
-                for (const key of arrayKeys) {
-                    // `getAll()` returns an array of strings.
-                    const value = params.getAll(key);
-                    if (value) {
-                        debugConfig[key] = value;
-                    }
-                }
-                // Report an error if we received any unknown parameters
-                const supportedKeys = new Set<string>(["config"].concat(stringKeys).concat(numberKeys).concat(arrayKeys));
-                const presentKeys = new Set<string>(params.keys());
-                // FIXME: Use `Set.difference` as soon as ES2024 is widely available
-                const unknownKeys = new Set<string>();
-                for (const k of presentKeys.keys()) {
-                    if (!supportedKeys.has(k)) {
-                        unknownKeys.add(k);
-                    }
-                }
-                if (unknownKeys.size > 0) {
-                    throw new Error(`Unsupported URL parameters: ${Array.from(unknownKeys.keys()).join(", ")}`);
-                }
-                // Prodide a default for the config name
-                const defaultName = debugConfig.request == 'launch' ? "URL-based Launch" : "URL-based Attach";
-                debugConfig.name = debugConfig.name || debugConfig.program || defaultName;
-                // Force the type to `lldb-dap`. We don't want to allow launching any other
-                // Debug Adapters using this URI scheme.
-                if (debugConfig.type != "lldb-dap") {
-                    throw new Error(`Unsupported debugger type: ${debugConfig.type}`);
-                }
-                await vscode.debug.startDebugging(undefined, debugConfig);
-            } else {
-                throw new Error(`Unsupported Uri path: ${uri.path}`);
-            }
-        } catch (err) {
-            if (err instanceof Error) {
-                await vscode.window.showErrorMessage(`Failed to handle lldb-dap URI request: ${err.message}`);
-            } else {
-                await vscode.window.showErrorMessage(`Failed to handle lldb-dap URI request: ${JSON.stringify(err)}`);
-            }
+  async handleUri(uri: vscode.Uri) {
+    try {
+      const params = new URLSearchParams(uri.query);
+      if (uri.path == "/start") {
+        // Some properties have default values
+        let debugConfig: vscode.DebugConfiguration = {
+          type: "lldb-dap",
+          request: "launch",
+          name: "",
+        };
+        // The `config` parameter allows providing a complete JSON-encoded configuration
+        const configJson = params.get("config");
+        if (configJson !== null) {
+          Object.assign(debugConfig, JSON.parse(configJson));
         }
+        // Furthermore, some frequently used parameters can also be provided as separate parameters
+        const stringKeys = [
+          "name",
+          "request",
+          "program",
+          "cwd",
+          "debuggerRoot",
+        ];
+        const numberKeys = ["pid"];
+        const arrayKeys = [
+          "args",
+          "initCommands",
+          "preRunCommands",
+          "stopCommands",
+          "exitCommands",
+          "terminateCommands",
+          "launchCommands",
+          "attachCommands",
+        ];
+        for (const key of stringKeys) {
+          const value = params.get(key);
+          if (value) {
+            debugConfig[key] = value;
+          }
+        }
+        for (const key of numberKeys) {
+          const value = params.get(key);
+          if (value) {
+            debugConfig[key] = Number(value);
+          }
+        }
+        for (const key of arrayKeys) {
+          // `getAll()` returns an array of strings.
+          const value = params.getAll(key);
+          if (value) {
+            debugConfig[key] = value;
+          }
+        }
+        // Report an error if we received any unknown parameters
+        const supportedKeys = new Set<string>(
+          ["config"].concat(stringKeys).concat(numberKeys).concat(arrayKeys),
+        );
+        const presentKeys = new Set<string>(params.keys());
+        // FIXME: Use `Set.difference` as soon as ES2024 is widely available
+        const unknownKeys = new Set<string>();
+        for (const k of presentKeys.keys()) {
+          if (!supportedKeys.has(k)) {
+            unknownKeys.add(k);
+          }
+        }
+        if (unknownKeys.size > 0) {
+          throw new Error(
+            `Unsupported URL parameters: ${Array.from(unknownKeys.keys()).join(", ")}`,
+          );
+        }
+        // Prodide a default for the config name
+        const defaultName =
+          debugConfig.request == "launch"
+            ? "URL-based Launch"
+            : "URL-based Attach";
+        debugConfig.name =
+          debugConfig.name || debugConfig.program || defaultName;
+        // Force the type to `lldb-dap`. We don't want to allow launching any other
+        // Debug Adapters using this URI scheme.
+        if (debugConfig.type != "lldb-dap") {
+          throw new Error(`Unsupported debugger type: ${debugConfig.type}`);
+        }
+        await vscode.debug.startDebugging(undefined, debugConfig);
+      } else {
+        throw new Error(`Unsupported Uri path: ${uri.path}`);
+      }
+    } catch (err) {
+      if (err instanceof Error) {
+        await vscode.window.showErrorMessage(
+          `Failed to handle lldb-dap URI request: ${err.message}`,
+        );
+      } else {
+        await vscode.window.showErrorMessage(
+          `Failed to handle lldb-dap URI request: ${JSON.stringify(err)}`,
+        );
+      }
     }
+  }
 }
diff --git a/lldb/unittests/API/CMakeLists.txt b/lldb/unittests/API/CMakeLists.txt
index 8bdc806..06ac492 100644
--- a/lldb/unittests/API/CMakeLists.txt
+++ b/lldb/unittests/API/CMakeLists.txt
@@ -16,6 +16,17 @@ if (CXX_SUPPORTS_DOCUMENTATION)
     PRIVATE -Wdocumentation)
 endif()
 
+# Apply -Wno-documentation-deprecated-sync while we migrate away from
+# report_fatal_error in llvm/include/llvm/Support/ErrorHandling.h
+# and llvm/include/llvm/Support/Error.h.
+# Remove this block of code when the migration is complete.
+# See https://github.com/llvm/llvm-project/issues/138914.
+check_cxx_compiler_flag("-Wno-documentation-deprecated-sync"
+                        CXX_SUPPORTS_NO_DOCUMENTATION_DEPRECATED_SYNC)
+if (CXX_SUPPORTS_NO_DOCUMENTATION_DEPRECATED_SYNC)
+  target_compile_options(APITests
+    PRIVATE -Wno-documentation-deprecated-sync)
+endif()
 
 if(Python3_RPATH)
   set_property(TARGET APITests APPEND PROPERTY BUILD_RPATH "${Python3_RPATH}")
diff --git a/lldb/unittests/Host/HostTest.cpp b/lldb/unittests/Host/HostTest.cpp
index 222de62..9306a86 100644
--- a/lldb/unittests/Host/HostTest.cpp
+++ b/lldb/unittests/Host/HostTest.cpp
@@ -107,6 +107,10 @@ TEST(Host, LaunchProcessDuplicatesHandle) {
   Pipe pipe;
   ASSERT_THAT_ERROR(pipe.CreateNew(/*child_process_inherit=*/false).takeError(),
                     llvm::Succeeded());
+  SCOPED_TRACE(llvm::formatv("Pipe handles are: {0}/{1}",
+                             (uint64_t)pipe.GetReadPipe(),
+                             (uint64_t)pipe.GetWritePipe())
+                   .str());
   ProcessLaunchInfo info;
   info.SetExecutableFile(FileSpec(TestMainArgv0),
                          /*add_exe_file_as_first_arg=*/true);
diff --git a/lldb/unittests/Host/posix/HostTest.cpp b/lldb/unittests/Host/posix/HostTest.cpp
index 5d50de3..082edcc 100644
--- a/lldb/unittests/Host/posix/HostTest.cpp
+++ b/lldb/unittests/Host/posix/HostTest.cpp
@@ -115,5 +115,8 @@ TEST_F(HostTest, GetProcessInfoSetsPriority) {
   }
   ASSERT_TRUE(Info.IsZombie().has_value());
   ASSERT_FALSE(Info.IsZombie().value());
+
+  ASSERT_TRUE(Info.IsCoreDumping().has_value());
+  ASSERT_FALSE(Info.IsCoreDumping().value());
 }
 #endif
diff --git a/lldb/unittests/Symbol/TestDWARFCallFrameInfo.cpp b/lldb/unittests/Symbol/TestDWARFCallFrameInfo.cpp
index 86a6cf0..c1dcab0 100644
--- a/lldb/unittests/Symbol/TestDWARFCallFrameInfo.cpp
+++ b/lldb/unittests/Symbol/TestDWARFCallFrameInfo.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
 #include "Plugins/ObjectFile/ELF/ObjectFileELF.h"
@@ -236,12 +237,12 @@ Symbols:
       ConstString(symbol), eSymbolTypeAny);
   ASSERT_NE(nullptr, sym);
 
-  UnwindPlan plan(eRegisterKindGeneric);
-  ASSERT_TRUE(cfi.GetUnwindPlan(sym->GetAddress(), plan));
-  ASSERT_EQ(3, plan.GetRowCount());
-  EXPECT_EQ(GetExpectedRow0(), *plan.GetRowAtIndex(0));
-  EXPECT_EQ(GetExpectedRow1(), *plan.GetRowAtIndex(1));
-  EXPECT_EQ(GetExpectedRow2(), *plan.GetRowAtIndex(2));
+  std::unique_ptr<UnwindPlan> plan_up = cfi.GetUnwindPlan(sym->GetAddress());
+  ASSERT_TRUE(plan_up);
+  ASSERT_EQ(3, plan_up->GetRowCount());
+  EXPECT_THAT(plan_up->GetRowAtIndex(0), testing::Pointee(GetExpectedRow0()));
+  EXPECT_THAT(plan_up->GetRowAtIndex(1), testing::Pointee(GetExpectedRow1()));
+  EXPECT_THAT(plan_up->GetRowAtIndex(2), testing::Pointee(GetExpectedRow2()));
 }
 
 TEST_F(DWARFCallFrameInfoTest, Basic_dwarf3) {
diff --git a/llvm/bindings/ocaml/llvm/llvm.ml b/llvm/bindings/ocaml/llvm/llvm.ml
index c205faf..3031283 100644
--- a/llvm/bindings/ocaml/llvm/llvm.ml
+++ b/llvm/bindings/ocaml/llvm/llvm.ml
@@ -302,6 +302,8 @@ module AtomicRMWBinOp = struct
   | UDec_Wrap
   | USub_Cond
   | USub_Sat
+  | FMaximum
+  | FMinimum
 end
 
 module ValueKind = struct
diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli
index e203961..460be4f 100644
--- a/llvm/bindings/ocaml/llvm/llvm.mli
+++ b/llvm/bindings/ocaml/llvm/llvm.mli
@@ -337,6 +337,8 @@ module AtomicRMWBinOp : sig
   | UDec_Wrap
   | USub_Cond
   | USub_Sat
+  | FMaximum
+  | FMinimum
 end
 
 (** The kind of an [llvalue], the result of [classify_value v].
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 8b3303f..c427a65e 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -882,6 +882,11 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
   # The LLVM libraries have no stable C++ API, so -Wnoexcept-type is not useful.
   append("-Wno-noexcept-type" CMAKE_CXX_FLAGS)
 
+  # LLVM has a policy of including virtual "anchor" functions to control
+  # where the vtable is emitted. In `final` classes, these are exactly what
+  # this warning detects: unnecessary virtual methods.
+  add_flag_if_supported("-Wno-unnecessary-virtual-specifier" CXX_SUPPORTS_UNNECESSARY_VIRTUAL_FLAG)
+
   if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     append("-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
   endif()
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 0531836..0ed16755 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -56,10 +56,6 @@ Makes programs 10x faster by doing Special New Thing.
 Changes to the LLVM IR
 ----------------------
 
-* It is no longer permitted to inspect the uses of ConstantData. Use
-  count APIs will behave as if they have no uses (i.e. use_empty() is
-  always true).
-
 * The `nocapture` attribute has been replaced by `captures(none)`.
 * The constant expression variants of the following instructions have been
   removed:
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index 02d58d8..ba33c49 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -2234,12 +2234,12 @@ inline const APInt &umax(const APInt &A, const APInt &B) {
 }
 
 /// Determine the absolute difference of two APInts considered to be signed.
-inline const APInt abds(const APInt &A, const APInt &B) {
+inline APInt abds(const APInt &A, const APInt &B) {
   return A.sge(B) ? (A - B) : (B - A);
 }
 
 /// Determine the absolute difference of two APInts considered to be unsigned.
-inline const APInt abdu(const APInt &A, const APInt &B) {
+inline APInt abdu(const APInt &A, const APInt &B) {
   return A.uge(B) ? (A - B) : (B - A);
 }
 
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index bb99a41..3175b3e 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/ADL.h"
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/EpochTracker.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
@@ -96,6 +97,24 @@ public:
     return makeConstIterator(getBucketsEnd(), getBucketsEnd(), *this, true);
   }
 
+  // Return an iterator to iterate over keys in the map.
+  inline auto keys() {
+    return map_range(*this, [](const BucketT &P) { return P.getFirst(); });
+  }
+
+  // Return an iterator to iterate over values in the map.
+  inline auto values() {
+    return map_range(*this, [](const BucketT &P) { return P.getSecond(); });
+  }
+
+  inline auto keys() const {
+    return map_range(*this, [](const BucketT &P) { return P.getFirst(); });
+  }
+
+  inline auto values() const {
+    return map_range(*this, [](const BucketT &P) { return P.getSecond(); });
+  }
+
   [[nodiscard]] bool empty() const { return getNumEntries() == 0; }
   unsigned size() const { return getNumEntries(); }
 
diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h
index b3b44a5..d23b818 100644
--- a/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -1013,6 +1013,18 @@ struct ExternalAAWrapperPass : ImmutablePass {
 
   explicit ExternalAAWrapperPass(CallbackT CB);
 
+  /// Returns whether this external AA should run before Basic AA.
+  ///
+  /// By default, external AA passes are run after Basic AA. If this returns
+  /// true, the external AA will be run before Basic AA during alias analysis.
+  ///
+  /// For some targets, we prefer to run the external AA early to improve
+  /// compile time as it has more target-specific information. This is
+  /// particularly useful when the external AA can provide more precise results
+  /// than Basic AA so that Basic AA does not need to spend time recomputing
+  /// them.
+  virtual bool runEarly() { return false; }
+
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
   }
diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index 2631c3c..3f62981 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -375,7 +375,7 @@ public:
 
   const ResourceBinding &getBinding() const { return Binding; }
   TargetExtType *getHandleTy() const { return HandleTy; }
-  const StringRef getName() const { return Symbol ? Symbol->getName() : ""; }
+  StringRef getName() const { return Symbol ? Symbol->getName() : ""; }
 
   bool hasSymbol() const { return Symbol; }
   GlobalVariable *createSymbol(Module &M, StructType *Ty, StringRef Name = "");
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 33f2ab4..4c23eaa 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -127,7 +127,7 @@ public:
     NoLibrary,        // Don't use any vector library.
     Accelerate,       // Use Accelerate framework.
     DarwinLibSystemM, // Use Darwin's libsystem_m.
-    LIBMVEC_X86,      // GLIBC Vector Math library.
+    LIBMVEC,          // GLIBC Vector Math library.
     MASSV,            // IBM MASS vector library.
     SVML,             // Intel short vector math library.
     SLEEFGNUABI, // SLEEF - SIMD Library for Evaluating Elementary Functions.
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index 79d2398..9132a0a 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -240,7 +240,8 @@ private:
   bool DbgInfoAvailable = false;
 
 protected:
-  explicit AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
+  AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer,
+             char &ID = AsmPrinter::ID);
 
 public:
   ~AsmPrinter() override;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
index 654112e..6c4f036 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
@@ -61,7 +61,7 @@ bool GIMatchTableExecutor::executeMatchTable(
   // Bypass the flag check on the instruction, and only look at the MCInstrDesc.
   bool NoFPException = !State.MIs[0]->getDesc().mayRaiseFPException();
 
-  const uint16_t Flags = State.MIs[0]->getFlags();
+  const uint32_t Flags = State.MIs[0]->getFlags();
 
   enum RejectAction { RejectAndGiveUp, RejectAndResume };
   auto handleReject = [&]() -> RejectAction {
@@ -80,7 +80,7 @@ bool GIMatchTableExecutor::executeMatchTable(
     for (auto MIB : OutMIs) {
       // Set the NoFPExcept flag when no original matched instruction could
       // raise an FP exception, but the new instruction potentially might.
-      uint16_t MIBFlags = Flags | MIB.getInstr()->getFlags();
+      uint32_t MIBFlags = Flags | MIB.getInstr()->getFlags();
       if (NoFPException && MIB->mayRaiseFPException())
         MIBFlags |= MachineInstr::NoFPExcept;
       if (Observer)
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index bc00d0b..1660fe6 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1196,6 +1196,9 @@ protected:
   const MachineSchedContext *Context;
   const TargetSchedModel *SchedModel = nullptr;
   const TargetRegisterInfo *TRI = nullptr;
+  unsigned TopIdx = 0;
+  unsigned BotIdx = 0;
+  unsigned NumRegionInstrs = 0;
 
   MachineSchedPolicy RegionPolicy;
 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index abe2617..03099e9 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3216,8 +3216,7 @@ public:
   /// \p Load is a vp.load instruction.
   /// \p Mask is a mask value
   /// \p DeinterleaveRes is a list of deinterleaved results.
-  virtual bool
-  lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask,
+  virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
                                       ArrayRef<Value *> DeinterleaveRes) const {
     return false;
   }
@@ -3228,9 +3227,8 @@ public:
   /// \p Store is the vp.store instruction.
   /// \p Mask is a mask value
   /// \p InterleaveOps is a list of values being interleaved.
-  virtual bool
-  lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask,
-                                     ArrayRef<Value *> InterleaveOps) const {
+  virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
+                                       ArrayRef<Value *> InterleaveOps) const {
     return false;
   }
 
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h
index efe8182..629e0d8 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h
@@ -20,9 +20,6 @@ namespace llvm::jitlink::i386 {
 /// Represets i386 fixups
 enum EdgeKind_i386 : Edge::Kind {
 
-  /// None
-  None = Edge::FirstRelocation,
-
   /// A plain 32-bit pointer value relocation.
   ///
   /// Fixup expression:
@@ -32,7 +29,7 @@ enum EdgeKind_i386 : Edge::Kind {
   ///   - The target must reside in the low 32-bits of the address space,
   ///     otherwise an out-of-range error will be returned.
   ///
-  Pointer32,
+  Pointer32 = Edge::FirstRelocation,
 
   /// A 32-bit PC-relative relocation.
   ///
@@ -192,10 +189,6 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E,
   auto FixupAddress = B.getAddress() + E.getOffset();
 
   switch (E.getKind()) {
-  case i386::None: {
-    break;
-  }
-
   case i386::Pointer32: {
     uint32_t Value = E.getTarget().getAddress().getValue() + E.getAddend();
     *(ulittle32_t *)FixupPtr = Value;
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 3743b03..583718a 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -43,6 +43,7 @@ def OMPC_AcqRel : Clause<"acq_rel"> {
   let clangClass = "OMPAcqRelClause";
 }
 def OMPC_AdjustArgs : Clause<"adjust_args"> {
+  let flangClass = "OmpAdjustArgsClause";
 }
 def OMPC_Affinity : Clause<"affinity"> {
   let clangClass = "OMPAffinityClause";
@@ -65,6 +66,7 @@ def OMPC_Allocator : Clause<"allocator"> {
   let flangClass = "ScalarIntExpr";
 }
 def OMPC_AppendArgs : Clause<"append_args"> {
+  let flangClass = "OmpAppendArgsClause";
 }
 def OMPC_At : Clause<"at"> {
   let clangClass = "OMPAtClause";
@@ -721,10 +723,10 @@ def OMP_EndDeclareTarget : Directive<"end declare target"> {
 }
 def OMP_DeclareVariant : Directive<"declare variant"> {
   let allowedClauses = [
-    VersionedClause<OMPC_Match>,
-  ];
-  let allowedExclusiveClauses = [
     VersionedClause<OMPC_AdjustArgs, 51>,
+  ];
+  let allowedOnceClauses = [
+    VersionedClause<OMPC_Match>,
     VersionedClause<OMPC_AppendArgs, 51>,
   ];
   let association = AS_Declaration;
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index 76efa9b..88d005d 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -50,9 +50,6 @@ template <class ConstantClass> struct ConstantAggrKeyType;
 /// These constants have no operands; they represent their data directly.
 /// Since they can be in use by unrelated modules (and are never based on
 /// GlobalValues), it never makes sense to RAUW them.
-///
-/// These do not have use lists. It is illegal to inspect the uses. These behave
-/// as if they have no uses (i.e. use_empty() is always true).
 class ConstantData : public Constant {
   constexpr static IntrusiveOperandsAllocMarker AllocMarker{0};
 
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 0f6a206..d82c69a 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -2284,6 +2284,13 @@ public:
 #endif
   }
 
+  const DILocation *getWithoutAtom() const {
+    if (!getAtomGroup() && !getAtomRank())
+      return this;
+    return get(getContext(), getLine(), getColumn(), getScope(), getInlinedAt(),
+               isImplicitCode());
+  }
+
   // Disallow replacing operands.
   void replaceOperandWith(unsigned I, Metadata *New) = delete;
 
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 99cb557..7da11b93 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1705,12 +1705,23 @@ let TargetPrefix = "riscv" in {
 
   // Segment loads/stores for fixed vectors.
   foreach nf = [2, 3, 4, 5, 6, 7, 8] in {
+    // Input: (pointer, vl)
     def int_riscv_seg # nf # _load
           : DefaultAttrsIntrinsic<!listconcat([llvm_anyvector_ty],
                                               !listsplat(LLVMMatchType<0>,
                                               !add(nf, -1))),
                                   [llvm_anyptr_ty, llvm_anyint_ty],
                                   [NoCapture<ArgIndex<0>>, IntrReadMem]>;
+    // Input: (pointer, mask, vl)
+    def int_riscv_seg # nf # _load_mask
+          : DefaultAttrsIntrinsic<!listconcat([llvm_anyvector_ty],
+                                              !listsplat(LLVMMatchType<0>,
+                                              !add(nf, -1))),
+                                  [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                   llvm_anyint_ty],
+                                  [NoCapture<ArgIndex<0>>, IntrReadMem]>;
+
+    // Input: (<stored values>, pointer, vl)
     def int_riscv_seg # nf # _store
           : DefaultAttrsIntrinsic<[],
                                   !listconcat([llvm_anyvector_ty],
@@ -1718,6 +1729,15 @@ let TargetPrefix = "riscv" in {
                                                           !add(nf, -1)),
                                               [llvm_anyptr_ty, llvm_anyint_ty]),
                                   [NoCapture<ArgIndex<nf>>, IntrWriteMem]>;
+    // Input: (<stored values>, pointer, mask, vl)
+    def int_riscv_seg # nf # _store_mask
+          : DefaultAttrsIntrinsic<[],
+                                  !listconcat([llvm_anyvector_ty],
+                                              !listsplat(LLVMMatchType<0>,
+                                                          !add(nf, -1)),
+                                              [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                               llvm_anyint_ty]),
+                                  [NoCapture<ArgIndex<nf>>, IntrWriteMem]>;
   }
 
 } // TargetPrefix = "riscv"
diff --git a/llvm/include/llvm/IR/Use.h b/llvm/include/llvm/IR/Use.h
index 0d5d878..a86b9c4 100644
--- a/llvm/include/llvm/IR/Use.h
+++ b/llvm/include/llvm/IR/Use.h
@@ -42,7 +42,10 @@ public:
 
 private:
   /// Destructor - Only for zap()
-  ~Use() { removeFromList(); }
+  ~Use() {
+    if (Val)
+      removeFromList();
+  }
 
   /// Constructor
   Use(User *Parent) : Parent(Parent) {}
@@ -93,15 +96,9 @@ private:
   }
 
   void removeFromList() {
-    if (Prev) {
-      *Prev = Next;
-      if (Next) {
-        Next->Prev = Prev;
-        Next = nullptr;
-      }
-
-      Prev = nullptr;
-    }
+    *Prev = Next;
+    if (Next)
+      Next->Prev = Prev;
   }
 };
 
diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h
index 241b9e2..bf1de7e 100644
--- a/llvm/include/llvm/IR/Value.h
+++ b/llvm/include/llvm/IR/Value.h
@@ -116,7 +116,7 @@ protected:
 
 private:
   Type *VTy;
-  Use *UseList = nullptr;
+  Use *UseList;
 
   friend class ValueAsMetadata; // Allow access to IsUsedByMD.
   friend class ValueHandleBase; // Allow access to HasValueHandle.
@@ -339,25 +339,20 @@ public:
 #endif
   }
 
-  /// Check if this Value has a use-list.
-  bool hasUseList() const { return !isa<ConstantData>(this); }
-
   bool use_empty() const {
     assertModuleIsMaterialized();
     return UseList == nullptr;
   }
 
-  bool materialized_use_empty() const { return UseList == nullptr; }
+  bool materialized_use_empty() const {
+    return UseList == nullptr;
+  }
 
   using use_iterator = use_iterator_impl<Use>;
   using const_use_iterator = use_iterator_impl<const Use>;
 
-  use_iterator materialized_use_begin() {
-    assert(hasUseList());
-    return use_iterator(UseList);
-  }
+  use_iterator materialized_use_begin() { return use_iterator(UseList); }
   const_use_iterator materialized_use_begin() const {
-    assert(hasUseList());
     return const_use_iterator(UseList);
   }
   use_iterator use_begin() {
@@ -385,17 +380,16 @@ public:
     return materialized_uses();
   }
 
-  bool user_empty() const { return use_empty(); }
+  bool user_empty() const {
+    assertModuleIsMaterialized();
+    return UseList == nullptr;
+  }
 
   using user_iterator = user_iterator_impl<User>;
   using const_user_iterator = user_iterator_impl<const User>;
 
-  user_iterator materialized_user_begin() {
-    assert(hasUseList());
-    return user_iterator(UseList);
-  }
+  user_iterator materialized_user_begin() { return user_iterator(UseList); }
   const_user_iterator materialized_user_begin() const {
-    assert(hasUseList());
     return const_user_iterator(UseList);
   }
   user_iterator user_begin() {
@@ -435,7 +429,7 @@ public:
   ///
   /// This is specialized because it is a common request and does not require
   /// traversing the whole use list.
-  bool hasOneUse() const { return UseList && hasSingleElement(uses()); }
+  bool hasOneUse() const { return hasSingleElement(uses()); }
 
   /// Return true if this Value has exactly N uses.
   bool hasNUses(unsigned N) const;
@@ -497,8 +491,6 @@ public:
   static void dropDroppableUse(Use &U);
 
   /// Check if this value is used in the specified basic block.
-  ///
-  /// Not supported for ConstantData.
   bool isUsedInBasicBlock(const BasicBlock *BB) const;
 
   /// This method computes the number of uses of this Value.
@@ -508,10 +500,7 @@ public:
   unsigned getNumUses() const;
 
   /// This method should only be used by the Use class.
-  void addUse(Use &U) {
-    if (UseList || hasUseList())
-      U.addToList(&UseList);
-  }
+  void addUse(Use &U) { U.addToList(&UseList); }
 
   /// Concrete subclass of this.
   ///
@@ -899,10 +888,9 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Value &V) {
 }
 
 void Use::set(Value *V) {
-  removeFromList();
+  if (Val) removeFromList();
   Val = V;
-  if (V)
-    V->addUse(*this);
+  if (V) V->addUse(*this);
 }
 
 Value *Use::operator=(Value *RHS) {
diff --git a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h
index c7f098b..c94ae94 100644
--- a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h
+++ b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h
@@ -508,8 +508,10 @@ public:
   virtual bool equalIsAsmAssignment() { return true; };
   // Return whether this start of statement identifier is a label
   virtual bool isLabel(AsmToken &Token) { return true; };
-  // Return whether this parser accept star as start of statement
-  virtual bool starIsStartOfStatement() { return false; };
+  // Return whether this parser accepts the given token as start of statement.
+  virtual bool tokenIsStartOfStatement(AsmToken::TokenKind Token) {
+    return false;
+  }
 
   virtual const MCExpr *applySpecifier(const MCExpr *E, uint32_t,
                                        MCContext &Ctx) {
diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index fd1f055..dc14038 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -510,7 +510,7 @@ public:
     return iterator_range(It->second);
   }
 
-  const ArrayRef<MCDecodedPseudoProbeInlineTree> getInlineTreeVec() const {
+  ArrayRef<MCDecodedPseudoProbeInlineTree> getInlineTreeVec() const {
     return InlineTreeVec;
   }
 
diff --git a/llvm/include/llvm/Support/ErrorHandling.h b/llvm/include/llvm/Support/ErrorHandling.h
index 85daaee..d66993b5 100644
--- a/llvm/include/llvm/Support/ErrorHandling.h
+++ b/llvm/include/llvm/Support/ErrorHandling.h
@@ -17,47 +17,46 @@
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
-  class StringRef;
-  class Twine;
-
-  /// An error handler callback.
-  typedef void (*fatal_error_handler_t)(void *user_data,
-                                        const char *reason,
-                                        bool gen_crash_diag);
-
-  /// install_fatal_error_handler - Installs a new error handler to be used
-  /// whenever a serious (non-recoverable) error is encountered by LLVM.
-  ///
-  /// If no error handler is installed the default is to print the error message
-  /// to stderr, and call exit(1).  If an error handler is installed then it is
-  /// the handler's responsibility to log the message, it will no longer be
-  /// printed to stderr.  If the error handler returns, then exit(1) will be
-  /// called.
-  ///
-  /// It is dangerous to naively use an error handler which throws an exception.
-  /// Even though some applications desire to gracefully recover from arbitrary
-  /// faults, blindly throwing exceptions through unfamiliar code isn't a way to
-  /// achieve this.
-  ///
-  /// \param user_data - An argument which will be passed to the install error
-  /// handler.
-  void install_fatal_error_handler(fatal_error_handler_t handler,
-                                   void *user_data = nullptr);
-
-  /// Restores default error handling behaviour.
-  void remove_fatal_error_handler();
-
-  /// ScopedFatalErrorHandler - This is a simple helper class which just
-  /// calls install_fatal_error_handler in its constructor and
-  /// remove_fatal_error_handler in its destructor.
-  struct ScopedFatalErrorHandler {
-    explicit ScopedFatalErrorHandler(fatal_error_handler_t handler,
-                                     void *user_data = nullptr) {
-      install_fatal_error_handler(handler, user_data);
-    }
-
-    ~ScopedFatalErrorHandler() { remove_fatal_error_handler(); }
-  };
+class StringRef;
+class Twine;
+
+/// An error handler callback.
+typedef void (*fatal_error_handler_t)(void *user_data, const char *reason,
+                                      bool gen_crash_diag);
+
+/// install_fatal_error_handler - Installs a new error handler to be used
+/// whenever a serious (non-recoverable) error is encountered by LLVM.
+///
+/// If no error handler is installed the default is to print the error message
+/// to stderr, and call exit(1).  If an error handler is installed then it is
+/// the handler's responsibility to log the message, it will no longer be
+/// printed to stderr.  If the error handler returns, then exit(1) will be
+/// called.
+///
+/// It is dangerous to naively use an error handler which throws an exception.
+/// Even though some applications desire to gracefully recover from arbitrary
+/// faults, blindly throwing exceptions through unfamiliar code isn't a way to
+/// achieve this.
+///
+/// \param user_data - An argument which will be passed to the install error
+/// handler.
+void install_fatal_error_handler(fatal_error_handler_t handler,
+                                 void *user_data = nullptr);
+
+/// Restores default error handling behaviour.
+void remove_fatal_error_handler();
+
+/// ScopedFatalErrorHandler - This is a simple helper class which just
+/// calls install_fatal_error_handler in its constructor and
+/// remove_fatal_error_handler in its destructor.
+struct ScopedFatalErrorHandler {
+  explicit ScopedFatalErrorHandler(fatal_error_handler_t handler,
+                                   void *user_data = nullptr) {
+    install_fatal_error_handler(handler, user_data);
+  }
+
+  ~ScopedFatalErrorHandler() { remove_fatal_error_handler(); }
+};
 
 /// @deprecated Use reportFatalInternalError() or reportFatalUsageError()
 /// instead.
@@ -139,10 +138,10 @@ void install_out_of_memory_new_handler();
 /// This function calls abort(), and prints the optional message to stderr.
 /// Use the llvm_unreachable macro (that adds location info), instead of
 /// calling this function directly.
-[[noreturn]] void
-llvm_unreachable_internal(const char *msg = nullptr, const char *file = nullptr,
-                          unsigned line = 0);
-}
+[[noreturn]] void llvm_unreachable_internal(const char *msg = nullptr,
+                                            const char *file = nullptr,
+                                            unsigned line = 0);
+} // namespace llvm
 
 /// Marks that the current location is not supposed to be reachable.
 /// In !NDEBUG builds, prints the message and location info to stderr.
@@ -162,7 +161,7 @@ llvm_unreachable_internal(const char *msg = nullptr, const char *file = nullptr,
 /// diagnostics for unreachable code paths, and allows compilers to omit
 /// unnecessary code.
 #ifndef NDEBUG
-#define llvm_unreachable(msg) \
+#define llvm_unreachable(msg)                                                  \
   ::llvm::llvm_unreachable_internal(msg, __FILE__, __LINE__)
 #elif !defined(LLVM_BUILTIN_UNREACHABLE)
 #define llvm_unreachable(msg) ::llvm::llvm_unreachable_internal()
diff --git a/llvm/include/llvm/Support/FileUtilities.h b/llvm/include/llvm/Support/FileUtilities.h
index 9707724..c5a8457 100644
--- a/llvm/include/llvm/Support/FileUtilities.h
+++ b/llvm/include/llvm/Support/FileUtilities.h
@@ -22,81 +22,79 @@
 
 namespace llvm {
 
-  /// DiffFilesWithTolerance - Compare the two files specified, returning 0 if
-  /// the files match, 1 if they are different, and 2 if there is a file error.
-  /// This function allows you to specify an absolute and relative FP error that
-  /// is allowed to exist.  If you specify a string to fill in for the error
-  /// option, it will set the string to an error message if an error occurs, or
-  /// if the files are different.
-  ///
-  int DiffFilesWithTolerance(StringRef FileA,
-                             StringRef FileB,
-                             double AbsTol, double RelTol,
-                             std::string *Error = nullptr);
-
-
-  /// FileRemover - This class is a simple object meant to be stack allocated.
-  /// If an exception is thrown from a region, the object removes the filename
-  /// specified (if deleteIt is true).
-  ///
-  class FileRemover {
-    SmallString<128> Filename;
-    bool DeleteIt;
-  public:
-    FileRemover() : DeleteIt(false) {}
-
-    explicit FileRemover(const Twine& filename, bool deleteIt = true)
+/// DiffFilesWithTolerance - Compare the two files specified, returning 0 if
+/// the files match, 1 if they are different, and 2 if there is a file error.
+/// This function allows you to specify an absolute and relative FP error that
+/// is allowed to exist.  If you specify a string to fill in for the error
+/// option, it will set the string to an error message if an error occurs, or
+/// if the files are different.
+///
+int DiffFilesWithTolerance(StringRef FileA, StringRef FileB, double AbsTol,
+                           double RelTol, std::string *Error = nullptr);
+
+/// FileRemover - This class is a simple object meant to be stack allocated.
+/// If an exception is thrown from a region, the object removes the filename
+/// specified (if deleteIt is true).
+///
+class FileRemover {
+  SmallString<128> Filename;
+  bool DeleteIt;
+
+public:
+  FileRemover() : DeleteIt(false) {}
+
+  explicit FileRemover(const Twine &filename, bool deleteIt = true)
       : DeleteIt(deleteIt) {
-      filename.toVector(Filename);
-    }
+    filename.toVector(Filename);
+  }
 
-    ~FileRemover() {
-      if (DeleteIt) {
-        // Ignore problems deleting the file.
-        sys::fs::remove(Filename);
-      }
+  ~FileRemover() {
+    if (DeleteIt) {
+      // Ignore problems deleting the file.
+      sys::fs::remove(Filename);
     }
-
-    /// setFile - Give ownership of the file to the FileRemover so it will
-    /// be removed when the object is destroyed.  If the FileRemover already
-    /// had ownership of a file, remove it first.
-    void setFile(const Twine& filename, bool deleteIt = true) {
-      if (DeleteIt) {
-        // Ignore problems deleting the file.
-        sys::fs::remove(Filename);
-      }
-
-      Filename.clear();
-      filename.toVector(Filename);
-      DeleteIt = deleteIt;
+  }
+
+  /// setFile - Give ownership of the file to the FileRemover so it will
+  /// be removed when the object is destroyed.  If the FileRemover already
+  /// had ownership of a file, remove it first.
+  void setFile(const Twine &filename, bool deleteIt = true) {
+    if (DeleteIt) {
+      // Ignore problems deleting the file.
+      sys::fs::remove(Filename);
     }
 
-    /// releaseFile - Take ownership of the file away from the FileRemover so it
-    /// will not be removed when the object is destroyed.
-    void releaseFile() { DeleteIt = false; }
-  };
-
-  /// FilePermssionsApplier helps to copy permissions from an input file to
-  /// an output one. It memorizes the status of the input file and can apply
-  /// permissions and dates to the output file.
-  class FilePermissionsApplier {
-  public:
-    static Expected<FilePermissionsApplier> create(StringRef InputFilename);
-
-    /// Apply stored permissions to the \p OutputFilename.
-    /// Copy LastAccess and ModificationTime if \p CopyDates is true.
-    /// Overwrite stored permissions if \p OverwritePermissions is specified.
-    Error
-    apply(StringRef OutputFilename, bool CopyDates = false,
-          std::optional<sys::fs::perms> OverwritePermissions = std::nullopt);
-
-  private:
-    FilePermissionsApplier(StringRef InputFilename, sys::fs::file_status Status)
-        : InputFilename(InputFilename), InputStatus(Status) {}
-
-    StringRef InputFilename;
-    sys::fs::file_status InputStatus;
-  };
-} // End llvm namespace
+    Filename.clear();
+    filename.toVector(Filename);
+    DeleteIt = deleteIt;
+  }
+
+  /// releaseFile - Take ownership of the file away from the FileRemover so it
+  /// will not be removed when the object is destroyed.
+  void releaseFile() { DeleteIt = false; }
+};
+
+/// FilePermssionsApplier helps to copy permissions from an input file to
+/// an output one. It memorizes the status of the input file and can apply
+/// permissions and dates to the output file.
+class FilePermissionsApplier {
+public:
+  static Expected<FilePermissionsApplier> create(StringRef InputFilename);
+
+  /// Apply stored permissions to the \p OutputFilename.
+  /// Copy LastAccess and ModificationTime if \p CopyDates is true.
+  /// Overwrite stored permissions if \p OverwritePermissions is specified.
+  Error
+  apply(StringRef OutputFilename, bool CopyDates = false,
+        std::optional<sys::fs::perms> OverwritePermissions = std::nullopt);
+
+private:
+  FilePermissionsApplier(StringRef InputFilename, sys::fs::file_status Status)
+      : InputFilename(InputFilename), InputStatus(Status) {}
+
+  StringRef InputFilename;
+  sys::fs::file_status InputStatus;
+};
+} // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h
index 9df94eb..7ef532d 100644
--- a/llvm/include/llvm/Support/Program.h
+++ b/llvm/include/llvm/Support/Program.h
@@ -1,4 +1,4 @@
-//===- llvm/Support/Program.h ------------------------------------*- C++ -*-===//
+//===- llvm/Support/Program.h -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -26,222 +26,222 @@ namespace llvm {
 class BitVector;
 namespace sys {
 
-  /// This is the OS-specific separator for PATH like environment variables:
-  // a colon on Unix or a semicolon on Windows.
+/// This is the OS-specific separator for PATH like environment variables:
+// a colon on Unix or a semicolon on Windows.
 #if defined(LLVM_ON_UNIX)
-  const char EnvPathSeparator = ':';
-#elif defined (_WIN32)
-  const char EnvPathSeparator = ';';
+const char EnvPathSeparator = ':';
+#elif defined(_WIN32)
+const char EnvPathSeparator = ';';
 #endif
 
 #if defined(_WIN32)
-  typedef unsigned long procid_t; // Must match the type of DWORD on Windows.
-  typedef void *process_t;        // Must match the type of HANDLE on Windows.
+typedef unsigned long procid_t; // Must match the type of DWORD on Windows.
+typedef void *process_t;        // Must match the type of HANDLE on Windows.
 #else
-  typedef ::pid_t procid_t;
-  typedef procid_t process_t;
+typedef ::pid_t procid_t;
+typedef procid_t process_t;
 #endif
 
-  /// This struct encapsulates information about a process.
-  struct ProcessInfo {
-    enum : procid_t { InvalidPid = 0 };
-
-    procid_t Pid;      /// The process identifier.
-    process_t Process; /// Platform-dependent process object.
-
-    /// The return code, set after execution.
-    int ReturnCode;
-
-    ProcessInfo();
-  };
-
-  /// This struct encapsulates information about a process execution.
-  struct ProcessStatistics {
-    std::chrono::microseconds TotalTime;
-    std::chrono::microseconds UserTime;
-    uint64_t PeakMemory = 0; ///< Maximum resident set size in KiB.
-  };
-
-  /// Find the first executable file \p Name in \p Paths.
-  ///
-  /// This does not perform hashing as a shell would but instead stats each PATH
-  /// entry individually so should generally be avoided. Core LLVM library
-  /// functions and options should instead require fully specified paths.
-  ///
-  /// \param Name name of the executable to find. If it contains any system
-  ///   slashes, it will be returned as is.
-  /// \param Paths optional list of paths to search for \p Name. If empty it
-  ///   will use the system PATH environment instead.
-  ///
-  /// \returns The fully qualified path to the first \p Name in \p Paths if it
-  ///   exists. \p Name if \p Name has slashes in it. Otherwise an error.
-  ErrorOr<std::string>
-  findProgramByName(StringRef Name, ArrayRef<StringRef> Paths = {});
-
-  // These functions change the specified standard stream (stdin or stdout) mode
-  // based on the Flags. They return errc::success if the specified stream was
-  // changed. Otherwise, a platform dependent error is returned.
-  std::error_code ChangeStdinMode(fs::OpenFlags Flags);
-  std::error_code ChangeStdoutMode(fs::OpenFlags Flags);
-
-  // These functions change the specified standard stream (stdin or stdout) to
-  // binary mode. They return errc::success if the specified stream
-  // was changed. Otherwise a platform dependent error is returned.
-  std::error_code ChangeStdinToBinary();
-  std::error_code ChangeStdoutToBinary();
-
-  /// This function executes the program using the arguments provided.  The
-  /// invoked program will inherit the stdin, stdout, and stderr file
-  /// descriptors, the environment and other configuration settings of the
-  /// invoking program.
-  /// This function waits for the program to finish, so should be avoided in
-  /// library functions that aren't expected to block. Consider using
-  /// ExecuteNoWait() instead.
-  /// \returns an integer result code indicating the status of the program.
-  /// A zero or positive value indicates the result code of the program.
-  /// -1 indicates failure to execute
-  /// -2 indicates a crash during execution or timeout
-  int ExecuteAndWait(
-      StringRef Program, ///< Path of the program to be executed. It is
-      ///< presumed this is the result of the findProgramByName method.
-      ArrayRef<StringRef> Args, ///< An array of strings that are passed to the
-      ///< program.  The first element should be the name of the program.
-      ///< The array should **not** be terminated by an empty StringRef.
-      std::optional<ArrayRef<StringRef>> Env =
-          std::nullopt, ///< An optional vector of
-      ///< strings to use for the program's environment. If not provided, the
-      ///< current program's environment will be used.  If specified, the
-      ///< vector should **not** be terminated by an empty StringRef.
-      ArrayRef<std::optional<StringRef>> Redirects = {}, ///<
-      ///< An array of optional paths. Should have a size of zero or three.
-      ///< If the array is empty, no redirections are performed.
-      ///< Otherwise, the inferior process's stdin(0), stdout(1), and stderr(2)
-      ///< will be redirected to the corresponding paths, if the optional path
-      ///< is present (not \c std::nullopt).
-      ///< When an empty path is passed in, the corresponding file descriptor
-      ///< will be disconnected (ie, /dev/null'd) in a portable way.
-      unsigned SecondsToWait = 0, ///< If non-zero, this specifies the amount
-      ///< of time to wait for the child process to exit. If the time
-      ///< expires, the child is killed and this call returns. If zero,
-      ///< this function will wait until the child finishes or forever if
-      ///< it doesn't.
-      unsigned MemoryLimit = 0, ///< If non-zero, this specifies max. amount
-      ///< of memory can be allocated by process. If memory usage will be
-      ///< higher limit, the child is killed and this call returns. If zero
-      ///< - no memory limit.
-      std::string *ErrMsg = nullptr, ///< If non-zero, provides a pointer to a
-      ///< string instance in which error messages will be returned. If the
-      ///< string is non-empty upon return an error occurred while invoking the
-      ///< program.
-      bool *ExecutionFailed = nullptr,
-      std::optional<ProcessStatistics> *ProcStat = nullptr, ///< If non-zero,
-      /// provides a pointer to a structure in which process execution
-      /// statistics will be stored.
-      BitVector *AffinityMask = nullptr ///< CPUs or processors the new
-                                        /// program shall run on.
-  );
-
-  /// Similar to \ref ExecuteAndWait, but returns immediately.
-  /// \returns The \ref ProcessInfo of the newly launched process.
-  /// \note On Microsoft Windows systems, users will need to either call
-  /// \ref Wait until the process has finished executing or win32's CloseHandle
-  /// API on ProcessInfo.ProcessHandle to avoid memory leaks.
-  ProcessInfo ExecuteNoWait(
-      StringRef Program, ArrayRef<StringRef> Args,
-      std::optional<ArrayRef<StringRef>> Env,
-      ArrayRef<std::optional<StringRef>> Redirects = {},
-      unsigned MemoryLimit = 0, std::string *ErrMsg = nullptr,
-      bool *ExecutionFailed = nullptr, BitVector *AffinityMask = nullptr,
-      /// If true the executed program detatches from the controlling
-      /// terminal. I/O streams such as llvm::outs, llvm::errs, and stdin will
-      /// be closed until redirected to another output location
-      bool DetachProcess = false);
-
-  /// Return true if the given arguments fit within system-specific
-  /// argument length limits.
-  bool commandLineFitsWithinSystemLimits(StringRef Program,
-                                         ArrayRef<StringRef> Args);
-
-  /// Return true if the given arguments fit within system-specific
-  /// argument length limits.
-  bool commandLineFitsWithinSystemLimits(StringRef Program,
-                                         ArrayRef<const char *> Args);
-
-  /// File encoding options when writing contents that a non-UTF8 tool will
-  /// read (on Windows systems). For UNIX, we always use UTF-8.
-  enum WindowsEncodingMethod {
-    /// UTF-8 is the LLVM native encoding, being the same as "do not perform
-    /// encoding conversion".
-    WEM_UTF8,
-    WEM_CurrentCodePage,
-    WEM_UTF16
-  };
-
-  /// Saves the UTF8-encoded \p contents string into the file \p FileName
-  /// using a specific encoding.
-  ///
-  /// This write file function adds the possibility to choose which encoding
-  /// to use when writing a text file. On Windows, this is important when
-  /// writing files with internationalization support with an encoding that is
-  /// different from the one used in LLVM (UTF-8). We use this when writing
-  /// response files, since GCC tools on MinGW only understand legacy code
-  /// pages, and VisualStudio tools only understand UTF-16.
-  /// For UNIX, using different encodings is silently ignored, since all tools
-  /// work well with UTF-8.
-  /// This function assumes that you only use UTF-8 *text* data and will convert
-  /// it to your desired encoding before writing to the file.
-  ///
-  /// FIXME: We use EM_CurrentCodePage to write response files for GNU tools in
-  /// a MinGW/MinGW-w64 environment, which has serious flaws but currently is
-  /// our best shot to make gcc/ld understand international characters. This
-  /// should be changed as soon as binutils fix this to support UTF16 on mingw.
-  ///
-  /// \returns non-zero error_code if failed
-  std::error_code
-  writeFileWithEncoding(StringRef FileName, StringRef Contents,
-                        WindowsEncodingMethod Encoding = WEM_UTF8);
-
-  /// This function waits for the process specified by \p PI to finish.
-  /// \returns A \see ProcessInfo struct with Pid set to:
-  /// \li The process id of the child process if the child process has changed
-  /// state.
-  /// \li 0 if the child process has not changed state.
-  /// \note Users of this function should always check the ReturnCode member of
-  /// the \see ProcessInfo returned from this function.
-  ProcessInfo
-  Wait(const ProcessInfo &PI, ///< The child process that should be waited on.
-       std::optional<unsigned> SecondsToWait, ///< If std::nullopt, waits until
-       ///< child has terminated.
-       ///< If a value, this specifies the amount of time to wait for the child
-       ///< process. If the time expires, and \p Polling is false, the child is
-       ///< killed and this < function returns. If the time expires and \p
-       ///< Polling is true, the child is resumed.
-       ///<
-       ///< If zero, this function will perform a non-blocking
-       ///< wait on the child process.
-       std::string *ErrMsg = nullptr, ///< If non-zero, provides a pointer to a
-       ///< string instance in which error messages will be returned. If the
-       ///< string is non-empty upon return an error occurred while invoking the
-       ///< program.
-       std::optional<ProcessStatistics> *ProcStat =
-           nullptr, ///< If non-zero, provides
-       /// a pointer to a structure in which process execution statistics will
-       /// be stored.
-
-       bool Polling = false ///< If true, do not kill the process on timeout.
-  );
-
-  /// Print a command argument, and optionally quote it.
-  void printArg(llvm::raw_ostream &OS, StringRef Arg, bool Quote);
+/// This struct encapsulates information about a process.
+struct ProcessInfo {
+  enum : procid_t { InvalidPid = 0 };
+
+  procid_t Pid;      /// The process identifier.
+  process_t Process; /// Platform-dependent process object.
+
+  /// The return code, set after execution.
+  int ReturnCode;
+
+  ProcessInfo();
+};
+
+/// This struct encapsulates information about a process execution.
+struct ProcessStatistics {
+  std::chrono::microseconds TotalTime;
+  std::chrono::microseconds UserTime;
+  uint64_t PeakMemory = 0; ///< Maximum resident set size in KiB.
+};
+
+/// Find the first executable file \p Name in \p Paths.
+///
+/// This does not perform hashing as a shell would but instead stats each PATH
+/// entry individually so should generally be avoided. Core LLVM library
+/// functions and options should instead require fully specified paths.
+///
+/// \param Name name of the executable to find. If it contains any system
+///   slashes, it will be returned as is.
+/// \param Paths optional list of paths to search for \p Name. If empty it
+///   will use the system PATH environment instead.
+///
+/// \returns The fully qualified path to the first \p Name in \p Paths if it
+///   exists. \p Name if \p Name has slashes in it. Otherwise an error.
+ErrorOr<std::string> findProgramByName(StringRef Name,
+                                       ArrayRef<StringRef> Paths = {});
+
+// These functions change the specified standard stream (stdin or stdout) mode
+// based on the Flags. They return errc::success if the specified stream was
+// changed. Otherwise, a platform dependent error is returned.
+std::error_code ChangeStdinMode(fs::OpenFlags Flags);
+std::error_code ChangeStdoutMode(fs::OpenFlags Flags);
+
+// These functions change the specified standard stream (stdin or stdout) to
+// binary mode. They return errc::success if the specified stream
+// was changed. Otherwise a platform dependent error is returned.
+std::error_code ChangeStdinToBinary();
+std::error_code ChangeStdoutToBinary();
+
+/// This function executes the program using the arguments provided.  The
+/// invoked program will inherit the stdin, stdout, and stderr file
+/// descriptors, the environment and other configuration settings of the
+/// invoking program.
+/// This function waits for the program to finish, so should be avoided in
+/// library functions that aren't expected to block. Consider using
+/// ExecuteNoWait() instead.
+/// \returns an integer result code indicating the status of the program.
+/// A zero or positive value indicates the result code of the program.
+/// -1 indicates failure to execute
+/// -2 indicates a crash during execution or timeout
+int ExecuteAndWait(
+    StringRef Program, ///< Path of the program to be executed. It is
+    ///< presumed this is the result of the findProgramByName method.
+    ArrayRef<StringRef> Args, ///< An array of strings that are passed to the
+    ///< program.  The first element should be the name of the program.
+    ///< The array should **not** be terminated by an empty StringRef.
+    std::optional<ArrayRef<StringRef>> Env =
+        std::nullopt, ///< An optional vector of
+    ///< strings to use for the program's environment. If not provided, the
+    ///< current program's environment will be used.  If specified, the
+    ///< vector should **not** be terminated by an empty StringRef.
+    ArrayRef<std::optional<StringRef>> Redirects = {}, ///<
+    ///< An array of optional paths. Should have a size of zero or three.
+    ///< If the array is empty, no redirections are performed.
+    ///< Otherwise, the inferior process's stdin(0), stdout(1), and stderr(2)
+    ///< will be redirected to the corresponding paths, if the optional path
+    ///< is present (not \c std::nullopt).
+    ///< When an empty path is passed in, the corresponding file descriptor
+    ///< will be disconnected (ie, /dev/null'd) in a portable way.
+    unsigned SecondsToWait = 0, ///< If non-zero, this specifies the amount
+    ///< of time to wait for the child process to exit. If the time
+    ///< expires, the child is killed and this call returns. If zero,
+    ///< this function will wait until the child finishes or forever if
+    ///< it doesn't.
+    unsigned MemoryLimit = 0, ///< If non-zero, this specifies max. amount
+    ///< of memory can be allocated by process. If memory usage will be
+    ///< higher limit, the child is killed and this call returns. If zero
+    ///< - no memory limit.
+    std::string *ErrMsg = nullptr, ///< If non-zero, provides a pointer to a
+    ///< string instance in which error messages will be returned. If the
+    ///< string is non-empty upon return an error occurred while invoking the
+    ///< program.
+    bool *ExecutionFailed = nullptr,
+    std::optional<ProcessStatistics> *ProcStat = nullptr, ///< If non-zero,
+    /// provides a pointer to a structure in which process execution
+    /// statistics will be stored.
+    BitVector *AffinityMask = nullptr ///< CPUs or processors the new
+                                      /// program shall run on.
+);
+
+/// Similar to \ref ExecuteAndWait, but returns immediately.
+/// \returns The \ref ProcessInfo of the newly launched process.
+/// \note On Microsoft Windows systems, users will need to either call
+/// \ref Wait until the process has finished executing or win32's CloseHandle
+/// API on ProcessInfo.ProcessHandle to avoid memory leaks.
+ProcessInfo ExecuteNoWait(
+    StringRef Program, ArrayRef<StringRef> Args,
+    std::optional<ArrayRef<StringRef>> Env,
+    ArrayRef<std::optional<StringRef>> Redirects = {}, unsigned MemoryLimit = 0,
+    std::string *ErrMsg = nullptr, bool *ExecutionFailed = nullptr,
+    BitVector *AffinityMask = nullptr,
+    /// If true the executed program detatches from the controlling
+    /// terminal. I/O streams such as llvm::outs, llvm::errs, and stdin will
+    /// be closed until redirected to another output location
+    bool DetachProcess = false);
+
+/// Return true if the given arguments fit within system-specific
+/// argument length limits.
+bool commandLineFitsWithinSystemLimits(StringRef Program,
+                                       ArrayRef<StringRef> Args);
+
+/// Return true if the given arguments fit within system-specific
+/// argument length limits.
+bool commandLineFitsWithinSystemLimits(StringRef Program,
+                                       ArrayRef<const char *> Args);
+
+/// File encoding options when writing contents that a non-UTF8 tool will
+/// read (on Windows systems). For UNIX, we always use UTF-8.
+enum WindowsEncodingMethod {
+  /// UTF-8 is the LLVM native encoding, being the same as "do not perform
+  /// encoding conversion".
+  WEM_UTF8,
+  WEM_CurrentCodePage,
+  WEM_UTF16
+};
+
+/// Saves the UTF8-encoded \p contents string into the file \p FileName
+/// using a specific encoding.
+///
+/// This write file function adds the possibility to choose which encoding
+/// to use when writing a text file. On Windows, this is important when
+/// writing files with internationalization support with an encoding that is
+/// different from the one used in LLVM (UTF-8). We use this when writing
+/// response files, since GCC tools on MinGW only understand legacy code
+/// pages, and VisualStudio tools only understand UTF-16.
+/// For UNIX, using different encodings is silently ignored, since all tools
+/// work well with UTF-8.
+/// This function assumes that you only use UTF-8 *text* data and will convert
+/// it to your desired encoding before writing to the file.
+///
+/// FIXME: We use EM_CurrentCodePage to write response files for GNU tools in
+/// a MinGW/MinGW-w64 environment, which has serious flaws but currently is
+/// our best shot to make gcc/ld understand international characters. This
+/// should be changed as soon as binutils fix this to support UTF16 on mingw.
+///
+/// \returns non-zero error_code if failed
+std::error_code
+writeFileWithEncoding(StringRef FileName, StringRef Contents,
+                      WindowsEncodingMethod Encoding = WEM_UTF8);
+
+/// This function waits for the process specified by \p PI to finish.
+/// \returns A \see ProcessInfo struct with Pid set to:
+/// \li The process id of the child process if the child process has changed
+/// state.
+/// \li 0 if the child process has not changed state.
+/// \note Users of this function should always check the ReturnCode member of
+/// the \see ProcessInfo returned from this function.
+ProcessInfo
+Wait(const ProcessInfo &PI, ///< The child process that should be waited on.
+     std::optional<unsigned> SecondsToWait, ///< If std::nullopt, waits until
+     ///< child has terminated.
+     ///< If a value, this specifies the amount of time to wait for the child
+     ///< process. If the time expires, and \p Polling is false, the child is
+     ///< killed and this < function returns. If the time expires and \p
+     ///< Polling is true, the child is resumed.
+     ///<
+     ///< If zero, this function will perform a non-blocking
+     ///< wait on the child process.
+     std::string *ErrMsg = nullptr, ///< If non-zero, provides a pointer to a
+     ///< string instance in which error messages will be returned. If the
+     ///< string is non-empty upon return an error occurred while invoking the
+     ///< program.
+     std::optional<ProcessStatistics> *ProcStat =
+         nullptr, ///< If non-zero, provides
+     /// a pointer to a structure in which process execution statistics will
+     /// be stored.
+
+     bool Polling = false ///< If true, do not kill the process on timeout.
+);
+
+/// Print a command argument, and optionally quote it.
+void printArg(llvm::raw_ostream &OS, StringRef Arg, bool Quote);
 
 #if defined(_WIN32)
-  /// Given a list of command line arguments, quote and escape them as necessary
-  /// to build a single flat command line appropriate for calling CreateProcess
-  /// on
-  /// Windows.
-  ErrorOr<std::wstring> flattenWindowsCommandLine(ArrayRef<StringRef> Args);
+/// Given a list of command line arguments, quote and escape them as necessary
+/// to build a single flat command line appropriate for calling CreateProcess
+/// on
+/// Windows.
+ErrorOr<std::wstring> flattenWindowsCommandLine(ArrayRef<StringRef> Args);
 #endif
-  }
-}
+} // namespace sys
+} // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Support/Signals.h b/llvm/include/llvm/Support/Signals.h
index 70749ce..0a560e6 100644
--- a/llvm/include/llvm/Support/Signals.h
+++ b/llvm/include/llvm/Support/Signals.h
@@ -1,4 +1,4 @@
-//===- llvm/Support/Signals.h - Signal Handling support ----------*- C++ -*-===//
+//===- llvm/Support/Signals.h - Signal Handling support ---------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -23,107 +23,107 @@ class raw_ostream;
 
 namespace sys {
 
-  /// This function runs all the registered interrupt handlers, including the
-  /// removal of files registered by RemoveFileOnSignal.
-  void RunInterruptHandlers();
-
-  /// This function registers signal handlers to ensure that if a signal gets
-  /// delivered that the named file is removed.
-  /// Remove a file if a fatal signal occurs.
-  bool RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg = nullptr);
-
-  /// This function removes a file from the list of files to be removed on
-  /// signal delivery.
-  void DontRemoveFileOnSignal(StringRef Filename);
-
-  /// When an error signal (such as SIGABRT or SIGSEGV) is delivered to the
-  /// process, print a stack trace and then exit.
-  /// Print a stack trace if a fatal signal occurs.
-  /// \param Argv0 the current binary name, used to find the symbolizer
-  ///        relative to the current binary before searching $PATH; can be
-  ///        StringRef(), in which case we will only search $PATH.
-  /// \param DisableCrashReporting if \c true, disable the normal crash
-  ///        reporting mechanisms on the underlying operating system.
-  void PrintStackTraceOnErrorSignal(StringRef Argv0,
-                                    bool DisableCrashReporting = false);
-
-  /// Disable all system dialog boxes that appear when the process crashes.
-  void DisableSystemDialogsOnCrash();
-
-  /// Print the stack trace using the given \c raw_ostream object.
-  /// \param Depth refers to the number of stackframes to print. If not
-  ///        specified, the entire frame is printed.
-  void PrintStackTrace(raw_ostream &OS, int Depth = 0);
-
-  // Run all registered signal handlers.
-  void RunSignalHandlers();
-
-  using SignalHandlerCallback = void (*)(void *);
-
-  /// Add a function to be called when an abort/kill signal is delivered to the
-  /// process. The handler can have a cookie passed to it to identify what
-  /// instance of the handler it is.
-  void AddSignalHandler(SignalHandlerCallback FnPtr, void *Cookie);
-
-  /// This function registers a function to be called when the user "interrupts"
-  /// the program (typically by pressing ctrl-c).  When the user interrupts the
-  /// program, the specified interrupt function is called instead of the program
-  /// being killed, and the interrupt function automatically disabled.
-  ///
-  /// Note that interrupt functions are not allowed to call any non-reentrant
-  /// functions.  An null interrupt function pointer disables the current
-  /// installed function.  Note also that the handler may be executed on a
-  /// different thread on some platforms.
-  void SetInterruptFunction(void (*IF)());
-
-  /// Registers a function to be called when an "info" signal is delivered to
-  /// the process.
-  ///
-  /// On POSIX systems, this will be SIGUSR1; on systems that have it, SIGINFO
-  /// will also be used (typically ctrl-t).
-  ///
-  /// Note that signal handlers are not allowed to call any non-reentrant
-  /// functions.  An null function pointer disables the current installed
-  /// function.  Note also that the handler may be executed on a different
-  /// thread on some platforms.
-  void SetInfoSignalFunction(void (*Handler)());
-
-  /// Registers a function to be called in a "one-shot" manner when a pipe
-  /// signal is delivered to the process (i.e., on a failed write to a pipe).
-  /// After the pipe signal is handled once, the handler is unregistered.
-  ///
-  /// The LLVM signal handling code will not install any handler for the pipe
-  /// signal unless one is provided with this API (see \ref
-  /// DefaultOneShotPipeSignalHandler). This handler must be provided before
-  /// any other LLVM signal handlers are installed: the \ref InitLLVM
-  /// constructor has a flag that can simplify this setup.
-  ///
-  /// Note that the handler is not allowed to call any non-reentrant
-  /// functions.  A null handler pointer disables the current installed
-  /// function.  Note also that the handler may be executed on a
-  /// different thread on some platforms.
-  void SetOneShotPipeSignalFunction(void (*Handler)());
-
-  /// On Unix systems and Windows, this function exits with an "IO error" exit
-  /// code.
-  void DefaultOneShotPipeSignalHandler();
+/// This function runs all the registered interrupt handlers, including the
+/// removal of files registered by RemoveFileOnSignal.
+void RunInterruptHandlers();
+
+/// This function registers signal handlers to ensure that if a signal gets
+/// delivered that the named file is removed.
+/// Remove a file if a fatal signal occurs.
+bool RemoveFileOnSignal(StringRef Filename, std::string *ErrMsg = nullptr);
+
+/// This function removes a file from the list of files to be removed on
+/// signal delivery.
+void DontRemoveFileOnSignal(StringRef Filename);
+
+/// When an error signal (such as SIGABRT or SIGSEGV) is delivered to the
+/// process, print a stack trace and then exit.
+/// Print a stack trace if a fatal signal occurs.
+/// \param Argv0 the current binary name, used to find the symbolizer
+///        relative to the current binary before searching $PATH; can be
+///        StringRef(), in which case we will only search $PATH.
+/// \param DisableCrashReporting if \c true, disable the normal crash
+///        reporting mechanisms on the underlying operating system.
+void PrintStackTraceOnErrorSignal(StringRef Argv0,
+                                  bool DisableCrashReporting = false);
+
+/// Disable all system dialog boxes that appear when the process crashes.
+void DisableSystemDialogsOnCrash();
+
+/// Print the stack trace using the given \c raw_ostream object.
+/// \param Depth refers to the number of stackframes to print. If not
+///        specified, the entire frame is printed.
+void PrintStackTrace(raw_ostream &OS, int Depth = 0);
+
+// Run all registered signal handlers.
+void RunSignalHandlers();
+
+using SignalHandlerCallback = void (*)(void *);
+
+/// Add a function to be called when an abort/kill signal is delivered to the
+/// process. The handler can have a cookie passed to it to identify what
+/// instance of the handler it is.
+void AddSignalHandler(SignalHandlerCallback FnPtr, void *Cookie);
+
+/// This function registers a function to be called when the user "interrupts"
+/// the program (typically by pressing ctrl-c).  When the user interrupts the
+/// program, the specified interrupt function is called instead of the program
+/// being killed, and the interrupt function automatically disabled.
+///
+/// Note that interrupt functions are not allowed to call any non-reentrant
+/// functions.  An null interrupt function pointer disables the current
+/// installed function.  Note also that the handler may be executed on a
+/// different thread on some platforms.
+void SetInterruptFunction(void (*IF)());
+
+/// Registers a function to be called when an "info" signal is delivered to
+/// the process.
+///
+/// On POSIX systems, this will be SIGUSR1; on systems that have it, SIGINFO
+/// will also be used (typically ctrl-t).
+///
+/// Note that signal handlers are not allowed to call any non-reentrant
+/// functions.  An null function pointer disables the current installed
+/// function.  Note also that the handler may be executed on a different
+/// thread on some platforms.
+void SetInfoSignalFunction(void (*Handler)());
+
+/// Registers a function to be called in a "one-shot" manner when a pipe
+/// signal is delivered to the process (i.e., on a failed write to a pipe).
+/// After the pipe signal is handled once, the handler is unregistered.
+///
+/// The LLVM signal handling code will not install any handler for the pipe
+/// signal unless one is provided with this API (see \ref
+/// DefaultOneShotPipeSignalHandler). This handler must be provided before
+/// any other LLVM signal handlers are installed: the \ref InitLLVM
+/// constructor has a flag that can simplify this setup.
+///
+/// Note that the handler is not allowed to call any non-reentrant
+/// functions.  A null handler pointer disables the current installed
+/// function.  Note also that the handler may be executed on a
+/// different thread on some platforms.
+void SetOneShotPipeSignalFunction(void (*Handler)());
+
+/// On Unix systems and Windows, this function exits with an "IO error" exit
+/// code.
+void DefaultOneShotPipeSignalHandler();
 
 #ifdef _WIN32
-  /// Windows does not support signals and this handler must be called manually.
-  void CallOneShotPipeSignalHandler();
+/// Windows does not support signals and this handler must be called manually.
+void CallOneShotPipeSignalHandler();
 #endif
 
-  /// This function does the following:
-  /// - clean up any temporary files registered with RemoveFileOnSignal()
-  /// - dump the callstack from the exception context
-  /// - call any relevant interrupt/signal handlers
-  /// - create a core/mini dump of the exception context whenever possible
-  /// Context is a system-specific failure context: it is the signal type on
-  /// Unix; the ExceptionContext on Windows.
-  void CleanupOnSignal(uintptr_t Context);
-
-  void unregisterHandlers();
-} // End sys namespace
-} // End llvm namespace
+/// This function does the following:
+/// - clean up any temporary files registered with RemoveFileOnSignal()
+/// - dump the callstack from the exception context
+/// - call any relevant interrupt/signal handlers
+/// - create a core/mini dump of the exception context whenever possible
+/// Context is a system-specific failure context: it is the signal type on
+/// Unix; the ExceptionContext on Windows.
+void CleanupOnSignal(uintptr_t Context);
+
+void unregisterHandlers();
+} // namespace sys
+} // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 566e7db..9069267 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -396,6 +396,11 @@ public:
   // TODO: Populate all pass names by using <Target>PassRegistry.def.
   virtual void registerPassBuilderCallbacks(PassBuilder &) {}
 
+  /// Allow the target to register early alias analyses (AA before BasicAA) with
+  /// the AAManager for use with the new pass manager. Only affects the
+  /// "default" AAManager.
+  virtual void registerEarlyDefaultAliasAnalyses(AAManager &) {}
+
   /// Allow the target to register alias analyses with the AAManager for use
   /// with the new pass manager. Only affects the "default" AAManager.
   virtual void registerDefaultAliasAnalyses(AAManager &) {}
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index dae6cda..7fd5278 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -208,6 +208,7 @@ public:
     Linux,
     Lv2, // PS3
     MacOSX,
+    Managarm,
     NetBSD,
     OpenBSD,
     Solaris,
@@ -299,6 +300,7 @@ public:
     Amplification,
     OpenCL,
     OpenHOS,
+    Mlibc,
 
     PAuthTest,
 
@@ -846,6 +848,8 @@ public:
 
   bool isVulkanOS() const { return getOS() == Triple::Vulkan; }
 
+  bool isOSManagarm() const { return getOS() == Triple::Managarm; }
+
   bool isShaderStageEnvironment() const {
     EnvironmentType Env = getEnvironment();
     return Env == Triple::Pixel || Env == Triple::Vertex ||
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index f82e169..0ac5e7e 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1343,8 +1343,7 @@ struct InformationCache {
 
   /// Return all functions that might be called indirectly, only valid for
   /// closed world modules (see isClosedWorldModule).
-  const ArrayRef<Function *>
-  getIndirectlyCallableFunctions(Attributor &A) const;
+  ArrayRef<Function *> getIndirectlyCallableFunctions(Attributor &A) const;
 
   /// Return the flat address space if the associated target has.
   std::optional<unsigned> getFlatAddressSpace() const;
diff --git a/llvm/include/llvm/Transforms/Utils/Instrumentation.h b/llvm/include/llvm/Transforms/Utils/Instrumentation.h
index 0e2c0d9..e57083b 100644
--- a/llvm/include/llvm/Transforms/Utils/Instrumentation.h
+++ b/llvm/include/llvm/Transforms/Utils/Instrumentation.h
@@ -162,6 +162,7 @@ struct SanitizerCoverageOptions {
   bool TraceStores = false;
   bool CollectControlFlow = false;
   bool GatedCallbacks = false;
+  int StackDepthCallbackMin = 0;
 
   SanitizerCoverageOptions() = default;
 };
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index f4946c3..27bd179 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -739,28 +739,49 @@ bool AAResultsWrapperPass::runOnFunction(Function &F) {
   AAR.reset(
       new AAResults(getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F)));
 
+  // Add any target-specific alias analyses that should be run early.
+  auto *ExtWrapperPass = getAnalysisIfAvailable<ExternalAAWrapperPass>();
+  if (ExtWrapperPass && ExtWrapperPass->runEarly() && ExtWrapperPass->CB) {
+    LLVM_DEBUG(dbgs() << "AAResults register Early ExternalAA: "
+                      << ExtWrapperPass->getPassName() << "\n");
+    ExtWrapperPass->CB(*this, F, *AAR);
+  }
+
   // BasicAA is always available for function analyses. Also, we add it first
   // so that it can trump TBAA results when it proves MustAlias.
   // FIXME: TBAA should have an explicit mode to support this and then we
   // should reconsider the ordering here.
-  if (!DisableBasicAA)
+  if (!DisableBasicAA) {
+    LLVM_DEBUG(dbgs() << "AAResults register BasicAA\n");
     AAR->addAAResult(getAnalysis<BasicAAWrapperPass>().getResult());
+  }
 
   // Populate the results with the currently available AAs.
-  if (auto *WrapperPass = getAnalysisIfAvailable<ScopedNoAliasAAWrapperPass>())
+  if (auto *WrapperPass =
+          getAnalysisIfAvailable<ScopedNoAliasAAWrapperPass>()) {
+    LLVM_DEBUG(dbgs() << "AAResults register ScopedNoAliasAA\n");
     AAR->addAAResult(WrapperPass->getResult());
-  if (auto *WrapperPass = getAnalysisIfAvailable<TypeBasedAAWrapperPass>())
+  }
+  if (auto *WrapperPass = getAnalysisIfAvailable<TypeBasedAAWrapperPass>()) {
+    LLVM_DEBUG(dbgs() << "AAResults register TypeBasedAA\n");
     AAR->addAAResult(WrapperPass->getResult());
-  if (auto *WrapperPass = getAnalysisIfAvailable<GlobalsAAWrapperPass>())
+  }
+  if (auto *WrapperPass = getAnalysisIfAvailable<GlobalsAAWrapperPass>()) {
+    LLVM_DEBUG(dbgs() << "AAResults register GlobalsAA\n");
     AAR->addAAResult(WrapperPass->getResult());
-  if (auto *WrapperPass = getAnalysisIfAvailable<SCEVAAWrapperPass>())
+  }
+  if (auto *WrapperPass = getAnalysisIfAvailable<SCEVAAWrapperPass>()) {
+    LLVM_DEBUG(dbgs() << "AAResults register SCEVAA\n");
     AAR->addAAResult(WrapperPass->getResult());
+  }
 
   // If available, run an external AA providing callback over the results as
   // well.
-  if (auto *WrapperPass = getAnalysisIfAvailable<ExternalAAWrapperPass>())
-    if (WrapperPass->CB)
-      WrapperPass->CB(*this, F, *AAR);
+  if (ExtWrapperPass && !ExtWrapperPass->runEarly() && ExtWrapperPass->CB) {
+    LLVM_DEBUG(dbgs() << "AAResults register Late ExternalAA: "
+                      << ExtWrapperPass->getPassName() << "\n");
+    ExtWrapperPass->CB(*this, F, *AAR);
+  }
 
   // Analyses don't mutate the IR, so return false.
   return false;
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 5a2943d..85e3be9 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6377,15 +6377,6 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
     if (isSplatValue(Op0))
       return Op0;
     break;
-  case Intrinsic::frexp: {
-    // Frexp is idempotent with the added complication of the struct return.
-    if (match(Op0, m_ExtractValue<0>(m_Value(X)))) {
-      if (match(X, m_Intrinsic<Intrinsic::frexp>(m_Value())))
-        return X;
-    }
-
-    break;
-  }
   default:
     break;
   }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 7ec9bdb..f222a99 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2148,10 +2148,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
                          "different type sizes\n");
     return Dependence::Unknown;
   }
-
-  if (!CommonStride)
-    return Dependence::Unknown;
-
   // Bail out early if passed-in parameters make vectorization not feasible.
   unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                            VectorizerParams::VectorizationFactor : 1);
@@ -2162,7 +2158,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
 
   // It's not vectorizable if the distance is smaller than the minimum distance
   // needed for a vectroized/unrolled version. Vectorizing one iteration in
-  // front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
+  // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
   // (No need to plus the last gap distance).
   //
   // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
@@ -2186,11 +2182,14 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
   // the minimum distance needed is 28, which is greater than distance. It is
   // not safe to do vectorization.
+  //
+  // We use MaxStride (maximum of src and sink strides) to get a conservative
+  // lower bound on the MinDistanceNeeded in case of different strides.
 
   // We know that Dist is positive, but it may not be constant. Use the signed
   // minimum for computations below, as this ensures we compute the closest
   // possible dependence distance.
-  uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
+  uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
   if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
     if (!ConstDist) {
       // For non-constant distances, we checked the lower bound of the
@@ -2236,7 +2235,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
       couldPreventStoreLoadForward(MinDistance, TypeByteSize, *CommonStride))
     return Dependence::BackwardVectorizableButPreventsForwarding;
 
-  uint64_t MaxVF = MinDepDistBytes / *CommonStride;
+  uint64_t MaxVF = MinDepDistBytes / MaxStride;
   LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
                     << " with max VF = " << MaxVF << '\n');
 
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index e2fd2aa..3945dd4 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -29,7 +29,7 @@ static cl::opt<TargetLibraryInfoImpl::VectorLibrary> ClVectorLibrary(
                           "Accelerate framework"),
                clEnumValN(TargetLibraryInfoImpl::DarwinLibSystemM,
                           "Darwin_libsystem_m", "Darwin libsystem_m"),
-               clEnumValN(TargetLibraryInfoImpl::LIBMVEC_X86, "LIBMVEC-X86",
+               clEnumValN(TargetLibraryInfoImpl::LIBMVEC, "LIBMVEC",
                           "GLIBC Vector Math library"),
                clEnumValN(TargetLibraryInfoImpl::MASSV, "MASSV",
                           "IBM MASS vector library"),
@@ -1360,8 +1360,15 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
     addVectorizableFunctions(VecFuncs_DarwinLibSystemM);
     break;
   }
-  case LIBMVEC_X86: {
-    addVectorizableFunctions(VecFuncs_LIBMVEC_X86);
+  case LIBMVEC: {
+    switch (TargetTriple.getArch()) {
+    default:
+      break;
+    case llvm::Triple::x86:
+    case llvm::Triple::x86_64:
+      addVectorizableFunctions(VecFuncs_LIBMVEC_X86);
+      break;
+    }
     break;
   }
   case MASSV: {
diff --git a/llvm/lib/Analysis/TypeMetadataUtils.cpp b/llvm/lib/Analysis/TypeMetadataUtils.cpp
index 8099fbc..9ec0785 100644
--- a/llvm/lib/Analysis/TypeMetadataUtils.cpp
+++ b/llvm/lib/Analysis/TypeMetadataUtils.cpp
@@ -54,9 +54,6 @@ findCallsAtConstantOffset(SmallVectorImpl<DevirtCallSite> &DevirtCalls,
 static void findLoadCallsAtConstantOffset(
     const Module *M, SmallVectorImpl<DevirtCallSite> &DevirtCalls, Value *VPtr,
     int64_t Offset, const CallInst *CI, DominatorTree &DT) {
-  if (!VPtr->hasUseList())
-    return;
-
   for (const Use &U : VPtr->uses()) {
     Value *User = U.getUser();
     if (isa<BitCastInst>(User)) {
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index cdedc65..ba6da4d 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -5357,7 +5357,9 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
     case Intrinsic::maxnum:
     case Intrinsic::minnum:
     case Intrinsic::minimum:
-    case Intrinsic::maximum: {
+    case Intrinsic::maximum:
+    case Intrinsic::minimumnum:
+    case Intrinsic::maximumnum: {
       KnownFPClass KnownLHS, KnownRHS;
       computeKnownFPClass(II->getArgOperand(0), DemandedElts, InterestedClasses,
                           KnownLHS, Depth + 1, Q);
@@ -5368,10 +5370,12 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
       Known = KnownLHS | KnownRHS;
 
       // If either operand is not NaN, the result is not NaN.
-      if (NeverNaN && (IID == Intrinsic::minnum || IID == Intrinsic::maxnum))
+      if (NeverNaN &&
+          (IID == Intrinsic::minnum || IID == Intrinsic::maxnum ||
+           IID == Intrinsic::minimumnum || IID == Intrinsic::maximumnum))
         Known.knownNot(fcNan);
 
-      if (IID == Intrinsic::maxnum) {
+      if (IID == Intrinsic::maxnum || IID == Intrinsic::maximumnum) {
         // If at least one operand is known to be positive, the result must be
         // positive.
         if ((KnownLHS.cannotBeOrderedLessThanZero() &&
@@ -5385,7 +5389,7 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
         if (KnownLHS.cannotBeOrderedLessThanZero() ||
             KnownRHS.cannotBeOrderedLessThanZero())
           Known.knownNot(KnownFPClass::OrderedLessThanZeroMask);
-      } else if (IID == Intrinsic::minnum) {
+      } else if (IID == Intrinsic::minnum || IID == Intrinsic::minimumnum) {
         // If at least one operand is known to be negative, the result must be
         // negative.
         if ((KnownLHS.cannotBeOrderedGreaterThanZero() &&
@@ -5393,13 +5397,14 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
             (KnownRHS.cannotBeOrderedGreaterThanZero() &&
              KnownRHS.isKnownNeverNaN()))
           Known.knownNot(KnownFPClass::OrderedGreaterThanZeroMask);
-      } else {
+      } else if (IID == Intrinsic::minimum) {
         // If at least one operand is known to be negative, the result must be
         // negative.
         if (KnownLHS.cannotBeOrderedGreaterThanZero() ||
             KnownRHS.cannotBeOrderedGreaterThanZero())
           Known.knownNot(KnownFPClass::OrderedGreaterThanZeroMask);
-      }
+      } else
+        llvm_unreachable("unhandled intrinsic");
 
       // Fixup zero handling if denormals could be returned as a zero.
       //
@@ -5427,15 +5432,20 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
             Known.signBitMustBeOne();
           else
             Known.signBitMustBeZero();
-        } else if ((IID == Intrinsic::maximum || IID == Intrinsic::minimum) ||
+        } else if ((IID == Intrinsic::maximum || IID == Intrinsic::minimum ||
+                    IID == Intrinsic::maximumnum ||
+                    IID == Intrinsic::minimumnum) ||
+                   // FIXME: Should be using logical zero versions
                    ((KnownLHS.isKnownNeverNegZero() ||
                      KnownRHS.isKnownNeverPosZero()) &&
                     (KnownLHS.isKnownNeverPosZero() ||
                      KnownRHS.isKnownNeverNegZero()))) {
-          if ((IID == Intrinsic::maximum || IID == Intrinsic::maxnum) &&
+          if ((IID == Intrinsic::maximum || IID == Intrinsic::maximumnum ||
+               IID == Intrinsic::maxnum) &&
               (KnownLHS.SignBit == false || KnownRHS.SignBit == false))
             Known.signBitMustBeZero();
-          else if ((IID == Intrinsic::minimum || IID == Intrinsic::minnum) &&
+          else if ((IID == Intrinsic::minimum || IID == Intrinsic::minimumnum ||
+                    IID == Intrinsic::minnum) &&
                    (KnownLHS.SignBit == true || KnownRHS.SignBit == true))
             Known.signBitMustBeOne();
         }
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 96f86eb..fc7f460 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -8869,8 +8869,6 @@ bool LLParser::parseMDNodeVector(SmallVectorImpl<Metadata *> &Elts) {
 //===----------------------------------------------------------------------===//
 bool LLParser::sortUseListOrder(Value *V, ArrayRef<unsigned> Indexes,
                                 SMLoc Loc) {
-  if (!V->hasUseList())
-    return false;
   if (V->use_empty())
     return error(Loc, "value has no uses");
 
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index e510320..4074ed65 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -3860,10 +3860,6 @@ Error BitcodeReader::parseUseLists() {
         V = FunctionBBs[ID];
       } else
         V = ValueList[ID];
-
-      if (!V->hasUseList())
-        break;
-
       unsigned NumUses = 0;
       SmallDenseMap<const Use *, unsigned, 16> Order;
       for (const Use &U : V->materialized_uses()) {
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index 1fdb808..9f735f7 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -230,9 +230,6 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F,
 
 static void predictValueUseListOrder(const Value *V, const Function *F,
                                      OrderMap &OM, UseListOrderStack &Stack) {
-  if (!V->hasUseList())
-    return;
-
   auto &IDPair = OM[V];
   assert(IDPair.first && "Unmapped value");
   if (IDPair.second)
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index bdcd54a..eb07696 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -382,7 +382,8 @@ Align AsmPrinter::getGVAlignment(const GlobalObject *GV, const DataLayout &DL,
   return Alignment;
 }
 
-AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
+AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer,
+                       char &ID)
     : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
       OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)),
       SM(*this) {
@@ -560,7 +561,8 @@ bool AsmPrinter::doInitialization(Module &M) {
 
   if (MAI->doesSupportDebugInformation()) {
     bool EmitCodeView = M.getCodeViewFlag();
-    if (EmitCodeView && TM.getTargetTriple().isOSWindows())
+    if (EmitCodeView &&
+        (TM.getTargetTriple().isOSWindows() || TM.getTargetTriple().isUEFI()))
       Handlers.push_back(std::make_unique<CodeViewDebug>(this));
     if (!EmitCodeView || M.getDwarfVersion()) {
       if (hasDebugInfo()) {
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 2c53a9c..f9dcb47 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8591,9 +8591,6 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
     return false;
 
   Value *X = Cmp->getOperand(0);
-  if (!X->hasUseList())
-    return false;
-
   APInt CmpC = cast<ConstantInt>(Cmp->getOperand(1))->getValue();
 
   for (auto *U : X->users()) {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 90c6c28..f4fe0b3 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -1034,9 +1034,6 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
   if (!isa<VectorType>(R->getType()) || !isa<VectorType>(I->getType()))
     return nullptr;
 
-  if (!R->hasUseList() || !I->hasUseList())
-    return nullptr;
-
   auto CommonUser =
       findCommonBetweenCollections<Value *>(R->users(), I->users());
   if (!CommonUser)
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 73f41c0..04d89d6 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -100,11 +100,11 @@ private:
   unsigned MaxFactor = 0u;
 
   /// Transform an interleaved load into target specific intrinsics.
-  bool lowerInterleavedLoad(LoadInst *LI,
+  bool lowerInterleavedLoad(Instruction *Load,
                             SmallSetVector<Instruction *, 32> &DeadInsts);
 
   /// Transform an interleaved store into target specific intrinsics.
-  bool lowerInterleavedStore(StoreInst *SI,
+  bool lowerInterleavedStore(Instruction *Store,
                              SmallSetVector<Instruction *, 32> &DeadInsts);
 
   /// Transform a load and a deinterleave intrinsic into target specific
@@ -131,7 +131,7 @@ private:
   /// made.
   bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
                             SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
-                            LoadInst *LI);
+                            Instruction *LI);
 };
 
 class InterleavedAccess : public FunctionPass {
@@ -249,11 +249,33 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
   return false;
 }
 
+// Return the corresponded deinterleaved mask, or nullptr if there is no valid
+// mask.
+static Value *getMask(Value *WideMask, unsigned Factor,
+                      ElementCount LeafValueEC);
+
+static Value *getMask(Value *WideMask, unsigned Factor,
+                      VectorType *LeafValueTy) {
+  return getMask(WideMask, Factor, LeafValueTy->getElementCount());
+}
+
 bool InterleavedAccessImpl::lowerInterleavedLoad(
-    LoadInst *LI, SmallSetVector<Instruction *, 32> &DeadInsts) {
-  if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
+    Instruction *Load, SmallSetVector<Instruction *, 32> &DeadInsts) {
+  if (isa<ScalableVectorType>(Load->getType()))
     return false;
 
+  if (auto *LI = dyn_cast<LoadInst>(Load)) {
+    if (!LI->isSimple())
+      return false;
+  } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
+    assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
+    // Require a constant mask.
+    if (!isa<ConstantVector>(VPLoad->getMaskParam()))
+      return false;
+  } else {
+    llvm_unreachable("unsupported load operation");
+  }
+
   // Check if all users of this load are shufflevectors. If we encounter any
   // users that are extractelement instructions or binary operators, we save
   // them to later check if they can be modified to extract from one of the
@@ -265,7 +287,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   // binop are the same load.
   SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;
 
-  for (auto *User : LI->users()) {
+  for (auto *User : Load->users()) {
     auto *Extract = dyn_cast<ExtractElementInst>(User);
     if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
       Extracts.push_back(Extract);
@@ -294,7 +316,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   unsigned Factor, Index;
 
   unsigned NumLoadElements =
-      cast<FixedVectorType>(LI->getType())->getNumElements();
+      cast<FixedVectorType>(Load->getType())->getNumElements();
   auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
   // Check if the first shufflevector is DE-interleave shuffle.
   if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
@@ -327,9 +349,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
 
     assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
 
-    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
+    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == Load)
       Indices.push_back(Index);
-    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
+    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == Load)
       Indices.push_back(Index);
   }
 
@@ -339,25 +361,45 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
     return false;
 
   bool BinOpShuffleChanged =
-      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI);
+      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
-  LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
+  if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
+    Value *LaneMask =
+        getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
+    if (!LaneMask)
+      return false;
 
-  // Try to create target specific intrinsics to replace the load and shuffles.
-  if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) {
-    // If Extracts is not empty, tryReplaceExtracts made changes earlier.
-    return !Extracts.empty() || BinOpShuffleChanged;
+    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");
+
+    // Sometimes the number of Shuffles might be less than Factor, we have to
+    // fill the gaps with null. Also, lowerInterleavedVPLoad
+    // expects them to be sorted.
+    SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
+    for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
+      ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
+    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
+      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+      return !Extracts.empty() || BinOpShuffleChanged;
+  } else {
+    LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
+
+    // Try to create target specific intrinsics to replace the load and
+    // shuffles.
+    if (!TLI->lowerInterleavedLoad(cast<LoadInst>(Load), Shuffles, Indices,
+                                   Factor))
+      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+      return !Extracts.empty() || BinOpShuffleChanged;
   }
 
   DeadInsts.insert_range(Shuffles);
 
-  DeadInsts.insert(LI);
+  DeadInsts.insert(Load);
   return true;
 }
 
 bool InterleavedAccessImpl::replaceBinOpShuffles(
     ArrayRef<ShuffleVectorInst *> BinOpShuffles,
-    SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
+    SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *Load) {
   for (auto *SVI : BinOpShuffles) {
     BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
     Type *BIOp0Ty = BI->getOperand(0)->getType();
@@ -380,9 +422,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles(
                       << "\n  With    : " << *NewSVI1 << "\n    And   : "
                       << *NewSVI2 << "\n    And   : " << *NewBI << "\n");
     RecursivelyDeleteTriviallyDeadInstructions(SVI);
-    if (NewSVI1->getOperand(0) == LI)
+    if (NewSVI1->getOperand(0) == Load)
       Shuffles.push_back(NewSVI1);
-    if (NewSVI2->getOperand(0) == LI)
+    if (NewSVI2->getOperand(0) == Load)
       Shuffles.push_back(NewSVI2);
   }
 
@@ -454,27 +496,77 @@ bool InterleavedAccessImpl::tryReplaceExtracts(
 }
 
 bool InterleavedAccessImpl::lowerInterleavedStore(
-    StoreInst *SI, SmallSetVector<Instruction *, 32> &DeadInsts) {
-  if (!SI->isSimple())
-    return false;
+    Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) {
+  Value *StoredValue;
+  if (auto *SI = dyn_cast<StoreInst>(Store)) {
+    if (!SI->isSimple())
+      return false;
+    StoredValue = SI->getValueOperand();
+  } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
+    assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
+    // Require a constant mask.
+    if (!isa<ConstantVector>(VPStore->getMaskParam()))
+      return false;
+    StoredValue = VPStore->getArgOperand(0);
+  } else {
+    llvm_unreachable("unsupported store operation");
+  }
 
-  auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+  auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
   if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
     return false;
 
+  unsigned NumStoredElements =
+      cast<FixedVectorType>(SVI->getType())->getNumElements();
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
   if (!isReInterleaveMask(SVI, Factor, MaxFactor))
     return false;
+  assert(NumStoredElements % Factor == 0 &&
+         "number of stored element should be a multiple of Factor");
+
+  if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
+    unsigned LaneMaskLen = NumStoredElements / Factor;
+    Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
+                              ElementCount::getFixed(LaneMaskLen));
+    if (!LaneMask)
+      return false;
 
-  LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
+    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
+                      << "\n");
 
-  // Try to create target specific intrinsics to replace the store and shuffle.
-  if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
-    return false;
+    IRBuilder<> Builder(VPStore);
+    // We need to effectively de-interleave the shufflemask
+    // because lowerInterleavedVPStore expects individual de-interleaved
+    // values.
+    SmallVector<Value *, 10> NewShuffles;
+    SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
+    auto ShuffleMask = SVI->getShuffleMask();
+
+    for (unsigned i = 0; i < Factor; i++) {
+      for (unsigned j = 0; j < LaneMaskLen; j++)
+        NewShuffleMask[j] = ShuffleMask[i + Factor * j];
+
+      NewShuffles.push_back(Builder.CreateShuffleVector(
+          SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
+    }
+
+    // Try to create target specific intrinsics to replace the vp.store and
+    // shuffle.
+    if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles))
+      // We already created new shuffles.
+      return true;
+  } else {
+    LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
+
+    // Try to create target specific intrinsics to replace the store and
+    // shuffle.
+    if (!TLI->lowerInterleavedStore(cast<StoreInst>(Store), SVI, Factor))
+      return false;
+  }
 
   // Already have a new target specific interleaved store. Erase the old store.
-  DeadInsts.insert(SI);
+  DeadInsts.insert(Store);
   DeadInsts.insert(SVI);
   return true;
 }
@@ -630,10 +722,8 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
   return true;
 }
 
-// Return the corresponded deinterleaved mask, or nullptr if there is no valid
-// mask.
-static Value *getMask(Value *WideMask, unsigned Factor) {
-  using namespace llvm::PatternMatch;
+static Value *getMask(Value *WideMask, unsigned Factor,
+                      ElementCount LeafValueEC) {
   if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
     SmallVector<Value *, 8> Operands;
     SmallVector<Instruction *, 8> DeadInsts;
@@ -644,13 +734,26 @@ static Value *getMask(Value *WideMask, unsigned Factor) {
     }
   }
 
-  if (match(WideMask, m_AllOnes())) {
-    // Scale the vector length of all-ones mask.
-    ElementCount OrigEC =
-        cast<VectorType>(WideMask->getType())->getElementCount();
-    assert(OrigEC.getKnownMinValue() % Factor == 0);
-    return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor),
-                                    cast<Constant>(WideMask)->getSplatValue());
+  if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
+    if (auto *Splat = ConstMask->getSplatValue())
+      // All-ones or all-zeros mask.
+      return ConstantVector::getSplat(LeafValueEC, Splat);
+
+    if (LeafValueEC.isFixed()) {
+      unsigned LeafMaskLen = LeafValueEC.getFixedValue();
+      SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
+      // If this is a fixed-length constant mask, each lane / leaf has to
+      // use the same mask. This is done by checking if every group with Factor
+      // number of elements in the interleaved mask has homogeneous values.
+      for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
+        Constant *C = ConstMask->getAggregateElement(Idx);
+        if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
+          return nullptr;
+        LeafMask[Idx / Factor] = C;
+      }
+
+      return ConstantVector::get(LeafMask);
+    }
   }
 
   return nullptr;
@@ -673,9 +776,10 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
     if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
       return false;
-    // Check mask operand. Handle both all-true and interleaved mask.
+    // Check mask operand. Handle both all-true/false and interleaved mask.
     Value *WideMask = VPLoad->getOperand(1);
-    Value *Mask = getMask(WideMask, Factor);
+    Value *Mask = getMask(WideMask, Factor,
+                          cast<VectorType>(DeinterleaveValues[0]->getType()));
     if (!Mask)
       return false;
 
@@ -684,8 +788,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
 
     // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask,
-                                                  DeinterleaveValues))
+    if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
       return false;
 
   } else {
@@ -727,7 +830,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
       return false;
 
     Value *WideMask = VPStore->getOperand(2);
-    Value *Mask = getMask(WideMask, Factor);
+    Value *Mask = getMask(WideMask, Factor,
+                          cast<VectorType>(InterleaveValues[0]->getType()));
     if (!Mask)
       return false;
 
@@ -736,8 +840,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
 
     // Since lowerInterleavedStore expects Shuffle and StoreInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask,
-                                                 InterleaveValues))
+    if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues))
       return false;
   } else {
     auto *SI = cast<StoreInst>(StoredBy);
@@ -763,12 +866,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
   SmallSetVector<Instruction *, 32> DeadInsts;
   bool Changed = false;
 
+  using namespace PatternMatch;
   for (auto &I : instructions(F)) {
-    if (auto *LI = dyn_cast<LoadInst>(&I))
-      Changed |= lowerInterleavedLoad(LI, DeadInsts);
+    if (match(&I, m_CombineOr(m_Load(m_Value()),
+                              m_Intrinsic<Intrinsic::vp_load>())))
+      Changed |= lowerInterleavedLoad(&I, DeadInsts);
 
-    if (auto *SI = dyn_cast<StoreInst>(&I))
-      Changed |= lowerInterleavedStore(SI, DeadInsts);
+    if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
+                              m_Intrinsic<Intrinsic::vp_store>())))
+      Changed |= lowerInterleavedStore(&I, DeadInsts);
 
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
       // At present, we only have intrinsics to represent (de)interleaving
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 31acfef..b8a7eb6 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -74,6 +74,14 @@ using namespace llvm;
 
 #define DEBUG_TYPE "machine-scheduler"
 
+STATISTIC(NumInstrsInSourceOrderPreRA,
+          "Number of instructions in source order after pre-RA scheduling");
+STATISTIC(NumInstrsInSourceOrderPostRA,
+          "Number of instructions in source order after post-RA scheduling");
+STATISTIC(NumInstrsScheduledPreRA,
+          "Number of instructions scheduled by pre-RA scheduler");
+STATISTIC(NumInstrsScheduledPostRA,
+          "Number of instructions scheduled by post-RA scheduler");
 STATISTIC(NumClustered, "Number of load/store pairs clustered");
 
 namespace llvm {
@@ -3505,6 +3513,9 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
     RegionPolicy.OnlyBottomUp = false;
     RegionPolicy.OnlyTopDown = false;
   }
+
+  BotIdx = NumRegionInstrs - 1;
+  this->NumRegionInstrs = NumRegionInstrs;
 }
 
 void GenericScheduler::dumpPolicy() const {
@@ -3981,6 +3992,18 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
 
   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                     << *SU->getInstr());
+
+  if (IsTopNode) {
+    if (SU->NodeNum == TopIdx++)
+      ++NumInstrsInSourceOrderPreRA;
+  } else {
+    assert(BotIdx < NumRegionInstrs && "out of bounds");
+    if (SU->NodeNum == BotIdx--)
+      ++NumInstrsInSourceOrderPreRA;
+  }
+
+  NumInstrsScheduledPreRA += 1;
+
   return SU;
 }
 
@@ -4104,6 +4127,9 @@ void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
     RegionPolicy.OnlyBottomUp = false;
     RegionPolicy.OnlyTopDown = false;
   }
+
+  BotIdx = NumRegionInstrs - 1;
+  this->NumRegionInstrs = NumRegionInstrs;
 }
 
 void PostGenericScheduler::registerRoots() {
@@ -4323,6 +4349,18 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
 
   LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                     << *SU->getInstr());
+
+  if (IsTopNode) {
+    if (SU->NodeNum == TopIdx++)
+      ++NumInstrsInSourceOrderPostRA;
+  } else {
+    assert(BotIdx < NumRegionInstrs && "out of bounds");
+    if (SU->NodeNum == BotIdx--)
+      ++NumInstrsInSourceOrderPostRA;
+  }
+
+  NumInstrsScheduledPostRA += 1;
+
   return SU;
 }
 
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index c941529..e1bdc7e 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1830,7 +1830,7 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal(
       // Append "$symbol" to the section name *before* IR-level mangling is
       // applied when targetting mingw. This is what GCC does, and the ld.bfd
       // COFF linker will not properly handle comdats otherwise.
-      if (getContext().getTargetTriple().isWindowsGNUEnvironment())
+      if (getContext().getTargetTriple().isOSCygMing())
         raw_svector_ostream(Name) << '$' << ComdatGV->getName();
 
       return getContext().getCOFFSection(Name, Characteristics, COMDATSymName,
diff --git a/llvm/lib/DWARFLinker/Parallel/ArrayList.h b/llvm/lib/DWARFLinker/Parallel/ArrayList.h
index c48f828..d99fdcc 100644
--- a/llvm/lib/DWARFLinker/Parallel/ArrayList.h
+++ b/llvm/lib/DWARFLinker/Parallel/ArrayList.h
@@ -137,7 +137,7 @@ protected:
     NewGroup->Next = nullptr;
 
     // Try to replace current group with allocated one.
-    if (AtomicGroup.compare_exchange_weak(CurGroup, NewGroup))
+    if (AtomicGroup.compare_exchange_strong(CurGroup, NewGroup))
       return true;
 
     // Put allocated group as last group.
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp
index 4ce43c1..b14b872 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp
@@ -110,14 +110,13 @@ private:
   }
 };
 
-template <typename ELFT>
-class ELFLinkGraphBuilder_i386 : public ELFLinkGraphBuilder<ELFT> {
+class ELFLinkGraphBuilder_i386 : public ELFLinkGraphBuilder<object::ELF32LE> {
 private:
-  static Expected<i386::EdgeKind_i386> getRelocationKind(const uint32_t Type) {
+  using ELFT = object::ELF32LE;
+
+  Expected<i386::EdgeKind_i386> getRelocationKind(const uint32_t Type) {
     using namespace i386;
     switch (Type) {
-    case ELF::R_386_NONE:
-      return EdgeKind_i386::None;
     case ELF::R_386_32:
       return EdgeKind_i386::Pointer32;
     case ELF::R_386_PC32:
@@ -128,6 +127,9 @@ private:
       return EdgeKind_i386::PCRel16;
     case ELF::R_386_GOT32:
       return EdgeKind_i386::RequestGOTAndTransformToDelta32FromGOT;
+    case ELF::R_386_GOT32X:
+      // TODO: Add a relaxable edge kind and update relaxation optimization.
+      return EdgeKind_i386::RequestGOTAndTransformToDelta32FromGOT;
     case ELF::R_386_GOTPC:
       return EdgeKind_i386::Delta32;
     case ELF::R_386_GOTOFF:
@@ -136,8 +138,9 @@ private:
       return EdgeKind_i386::BranchPCRel32;
     }
 
-    return make_error<JITLinkError>("Unsupported i386 relocation:" +
-                                    formatv("{0:d}", Type));
+    return make_error<JITLinkError>(
+        "In " + G->getName() + ": Unsupported i386 relocation type " +
+        object::getELFRelocationTypeName(ELF::EM_386, Type));
   }
 
   Error addRelocations() override {
@@ -165,6 +168,12 @@ private:
                             Block &BlockToFix) {
     using Base = ELFLinkGraphBuilder<ELFT>;
 
+    auto ELFReloc = Rel.getType(false);
+
+    // R_386_NONE is a no-op.
+    if (LLVM_UNLIKELY(ELFReloc == ELF::R_386_NONE))
+      return Error::success();
+
     uint32_t SymbolIndex = Rel.getSymbol(false);
     auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec);
     if (!ObjSymbol)
@@ -179,7 +188,7 @@ private:
                   Base::GraphSymbols.size()),
           inconvertibleErrorCode());
 
-    Expected<i386::EdgeKind_i386> Kind = getRelocationKind(Rel.getType(false));
+    Expected<i386::EdgeKind_i386> Kind = getRelocationKind(ELFReloc);
     if (!Kind)
       return Kind.takeError();
 
@@ -187,8 +196,6 @@ private:
     int64_t Addend = 0;
 
     switch (*Kind) {
-    case i386::EdgeKind_i386::None:
-      break;
     case i386::EdgeKind_i386::Pointer32:
     case i386::EdgeKind_i386::PCRel32:
     case i386::EdgeKind_i386::RequestGOTAndTransformToDelta32FromGOT:
@@ -253,9 +260,9 @@ createLinkGraphFromELFObject_i386(MemoryBufferRef ObjectBuffer,
 
   auto &ELFObjFile = cast<object::ELFObjectFile<object::ELF32LE>>(**ELFObj);
 
-  return ELFLinkGraphBuilder_i386<object::ELF32LE>(
-             (*ELFObj)->getFileName(), ELFObjFile.getELFFile(), std::move(SSP),
-             (*ELFObj)->makeTriple(), std::move(*Features))
+  return ELFLinkGraphBuilder_i386((*ELFObj)->getFileName(),
+                                  ELFObjFile.getELFFile(), std::move(SSP),
+                                  (*ELFObj)->makeTriple(), std::move(*Features))
       .buildGraph();
 }
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/i386.cpp b/llvm/lib/ExecutionEngine/JITLink/i386.cpp
index e984bb1..f714716 100644
--- a/llvm/lib/ExecutionEngine/JITLink/i386.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/i386.cpp
@@ -18,8 +18,6 @@ namespace llvm::jitlink::i386 {
 
 const char *getEdgeKindName(Edge::Kind K) {
   switch (K) {
-  case None:
-    return "None";
   case Pointer32:
     return "Pointer32";
   case PCRel32:
diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
index ed7c57a..52080de 100644
--- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
+++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
@@ -23,7 +23,7 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple,
                                              TargetTriple);
     break;
   case VectorLibrary::LIBMVEC:
-    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::LIBMVEC_X86,
+    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::LIBMVEC,
                                              TargetTriple);
     break;
   case VectorLibrary::MASSV:
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 7223dd8..12edf6f 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -125,15 +125,11 @@ static void orderValue(const Value *V, OrderMap &OM) {
   if (OM.lookup(V))
     return;
 
-  if (const Constant *C = dyn_cast<Constant>(V)) {
-    if (isa<ConstantData>(C))
-      return;
-
+  if (const Constant *C = dyn_cast<Constant>(V))
     if (C->getNumOperands() && !isa<GlobalValue>(C))
       for (const Value *Op : C->operands())
         if (!isa<BasicBlock>(Op) && !isa<GlobalValue>(Op))
           orderValue(Op, OM);
-  }
 
   // Note: we cannot cache this lookup above, since inserting into the map
   // changes the map's size, and thus affects the other IDs.
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 54e5e6d5..6f85811 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -373,7 +373,7 @@ std::optional<BasicBlock::iterator> Instruction::getInsertionPointAfterDef() {
 }
 
 bool Instruction::isOnlyUserOfAnyOperand() {
-  return any_of(operands(), [](const Value *V) { return V->hasOneUser(); });
+  return any_of(operands(), [](Value *V) { return V->hasOneUser(); });
 }
 
 void Instruction::setHasNoUnsignedWrap(bool b) {
diff --git a/llvm/lib/IR/Use.cpp b/llvm/lib/IR/Use.cpp
index 67882ba..99a8938 100644
--- a/llvm/lib/IR/Use.cpp
+++ b/llvm/lib/IR/Use.cpp
@@ -19,15 +19,11 @@ void Use::swap(Use &RHS) {
   std::swap(Next, RHS.Next);
   std::swap(Prev, RHS.Prev);
 
-  if (Prev)
-    *Prev = this;
-
+  *Prev = this;
   if (Next)
     Next->Prev = &Next;
 
-  if (RHS.Prev)
-    *RHS.Prev = &RHS;
-
+  *RHS.Prev = &RHS;
   if (RHS.Next)
     RHS.Next->Prev = &RHS.Next;
 }
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index d6cb65d..aa97b70 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -53,7 +53,7 @@ static inline Type *checkType(Type *Ty) {
 Value::Value(Type *ty, unsigned scid)
     : SubclassID(scid), HasValueHandle(0), SubclassOptionalData(0),
       SubclassData(0), NumUserOperands(0), IsUsedByMD(false), HasName(false),
-      HasMetadata(false), VTy(checkType(ty)) {
+      HasMetadata(false), VTy(checkType(ty)), UseList(nullptr) {
   static_assert(ConstantFirstVal == 0, "!(SubclassID < ConstantFirstVal)");
   // FIXME: Why isn't this in the subclass gunk??
   // Note, we cannot call isa<CallInst> before the CallInst has been
@@ -148,18 +148,10 @@ void Value::destroyValueName() {
 }
 
 bool Value::hasNUses(unsigned N) const {
-  if (!UseList)
-    return N == 0;
-
-  // TODO: Disallow for ConstantData and remove !UseList check?
   return hasNItems(use_begin(), use_end(), N);
 }
 
 bool Value::hasNUsesOrMore(unsigned N) const {
-  // TODO: Disallow for ConstantData and remove !UseList check?
-  if (!UseList)
-    return N == 0;
-
   return hasNItemsOrMore(use_begin(), use_end(), N);
 }
 
@@ -240,8 +232,6 @@ void Value::dropDroppableUse(Use &U) {
 }
 
 bool Value::isUsedInBasicBlock(const BasicBlock *BB) const {
-  assert(hasUseList() && "ConstantData has no use-list");
-
   // This can be computed either by scanning the instructions in BB, or by
   // scanning the use list of this Value. Both lists can be very long, but
   // usually one is quite short.
@@ -263,9 +253,6 @@ bool Value::isUsedInBasicBlock(const BasicBlock *BB) const {
 }
 
 unsigned Value::getNumUses() const {
-  // TODO: Disallow for ConstantData and remove !UseList check?
-  if (!UseList)
-    return 0;
   return (unsigned)std::distance(use_begin(), use_end());
 }
 
@@ -512,7 +499,6 @@ static bool contains(Value *Expr, Value *V) {
 #endif // NDEBUG
 
 void Value::doRAUW(Value *New, ReplaceMetadataUses ReplaceMetaUses) {
-  assert(hasUseList() && "Cannot replace constant data");
   assert(New && "Value::replaceAllUsesWith(<null>) is invalid!");
   assert(!contains(New, this) &&
          "this->replaceAllUsesWith(expr(this)) is NOT valid!");
@@ -858,7 +844,7 @@ bool Value::canBeFreed() const {
   // which is why we need the explicit opt in on a per collector basis.
   if (!F->hasGC())
     return true;
-
+  
   const auto &GCName = F->getGC();
   if (GCName == "statepoint-example") {
     auto *PT = cast<PointerType>(this->getType());
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index aee1259..f27a278 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -1769,11 +1769,9 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
     // Treat '}' as a valid identifier in this context.
     Lex();
     IDVal = "}";
-  } else if (Lexer.is(AsmToken::Star) &&
-             getTargetParser().starIsStartOfStatement()) {
-    // Accept '*' as a valid start of statement.
+  } else if (getTargetParser().tokenIsStartOfStatement(ID.getKind())) {
     Lex();
-    IDVal = "*";
+    IDVal = ID.getString();
   } else if (parseIdentifier(IDVal)) {
     if (!TheCondState.Ignore) {
       Lex(); // always eat a token
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index f172271b..5a85b30 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -2320,6 +2320,10 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
   // The order in which these are registered determines their priority when
   // being queried.
 
+  // Add any target-specific alias analyses that should be run early.
+  if (TM)
+    TM->registerEarlyDefaultAliasAnalyses(AA);
+
   // First we register the basic alias analysis that provides the majority of
   // per-function local AA logic. This is a stateless, on-demand local set of
   // AA techniques.
diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc
index 691e101..6668a29 100644
--- a/llvm/lib/Support/Unix/Signals.inc
+++ b/llvm/lib/Support/Unix/Signals.inc
@@ -826,14 +826,17 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS, int Depth) {
   int width = 0;
   for (int i = 0; i < depth; ++i) {
     Dl_info dlinfo;
-    dladdr(StackTrace[i], &dlinfo);
-    const char *name = strrchr(dlinfo.dli_fname, '/');
-
     int nwidth;
-    if (!name)
-      nwidth = strlen(dlinfo.dli_fname);
-    else
-      nwidth = strlen(name) - 1;
+    if (dladdr(StackTrace[i], &dlinfo) == 0) {
+      nwidth = 7; // "(error)"
+    } else {
+      const char *name = strrchr(dlinfo.dli_fname, '/');
+
+      if (!name)
+        nwidth = strlen(dlinfo.dli_fname);
+      else
+        nwidth = strlen(name) - 1;
+    }
 
     if (nwidth > width)
       width = nwidth;
@@ -841,15 +844,20 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS, int Depth) {
 
   for (int i = 0; i < depth; ++i) {
     Dl_info dlinfo;
-    dladdr(StackTrace[i], &dlinfo);
 
     OS << format("%-2d", i);
 
-    const char *name = strrchr(dlinfo.dli_fname, '/');
-    if (!name)
-      OS << format(" %-*s", width, static_cast<const char *>(dlinfo.dli_fname));
-    else
-      OS << format(" %-*s", width, name + 1);
+    if (dladdr(StackTrace[i], &dlinfo) == 0) {
+      OS << format(" %-*s", width, static_cast<const char *>("(error)"));
+      dlinfo.dli_sname = nullptr;
+    } else {
+      const char *name = strrchr(dlinfo.dli_fname, '/');
+      if (!name)
+        OS << format(" %-*s", width,
+                     static_cast<const char *>(dlinfo.dli_fname));
+      else
+        OS << format(" %-*s", width, name + 1);
+    }
 
     OS << format(" %#0*lx", (int)(sizeof(void *) * 2) + 2,
                  (unsigned long)StackTrace[i]);
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index ffa578d..5496ebd 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -77,6 +77,7 @@ ModulePass *createAArch64Arm64ECCallLoweringPass();
 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
 void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
+void initializeAArch64AsmPrinterPass(PassRegistry &);
 void initializeAArch64PointerAuthPass(PassRegistry&);
 void initializeAArch64BranchTargetsPass(PassRegistry&);
 void initializeAArch64CFIFixupPass(PassRegistry&);
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 870df4c..38be677 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -96,9 +96,11 @@ class AArch64AsmPrinter : public AsmPrinter {
       SectionToImportedFunctionCalls;
 
 public:
+  static char ID;
+
   AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this),
-        FM(*this) {}
+      : AsmPrinter(TM, std::move(Streamer), ID),
+        MCInstLowering(OutContext, *this), FM(*this) {}
 
   StringRef getPassName() const override { return "AArch64 Assembly Printer"; }
 
@@ -3523,6 +3525,11 @@ const MCExpr *AArch64AsmPrinter::lowerConstant(const Constant *CV,
   return AsmPrinter::lowerConstant(CV, BaseCV, Offset);
 }
 
+char AArch64AsmPrinter::ID = 0;
+
+INITIALIZE_PASS(AArch64AsmPrinter, "aarch64-asm-printer",
+                "AArch64 Assmebly Printer", false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmPrinter() {
   RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 431076f..5693a55 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -235,6 +235,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
   initializeAArch64A53Fix835769Pass(PR);
   initializeAArch64A57FPLoadBalancingPass(PR);
   initializeAArch64AdvSIMDScalarPass(PR);
+  initializeAArch64AsmPrinterPass(PR);
   initializeAArch64BranchTargetsPass(PR);
   initializeAArch64CollectLOHPass(PR);
   initializeAArch64CompressJumpTablesPass(PR);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 6bd3fd1..8d83fef 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -633,7 +633,7 @@ bool AArch64RegisterBankInfo::isLoadFromFPType(const MachineInstr &MI) const {
     // Look at the first element of the array to determine its type
     if (isa<ArrayType>(EltTy))
       EltTy = EltTy->getArrayElementType();
-  } else if (!isa<Constant>(LdVal)) {
+  } else {
     // FIXME: grubbing around uses is pretty ugly, but with no more
     // `getPointerElementType` there's not much else we can do.
     for (const auto *LdUser : LdVal->users()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index bbdc8c6..b572f81 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -95,6 +95,8 @@ void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &);
 
 void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
 
+void initializeAMDGPUAsmPrinterPass(PassRegistry &);
+
 Pass *createAMDGPUAttributorLegacyPass();
 void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 5e684a7..491314d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1742,3 +1742,8 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
     EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
                             CurrentProgramInfo.LDSSize);
 }
+
+char AMDGPUAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
+                "AMDGPU Assembly Printer", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 4183bb6..23779047 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -36,6 +36,9 @@ class MetadataStreamer;
 } // namespace AMDGPU
 
 class AMDGPUAsmPrinter final : public AsmPrinter {
+public:
+  static char ID;
+
 private:
   unsigned CodeObjectVersion;
   void initializeTargetID(const Module &M);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 98a3a98..dbe74b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2409,17 +2409,15 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           TII->isVMEM(MI))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
-           MI.mayLoad() &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           MI.mayLoad() && TII->isVMEM(MI))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
-           MI.mayStore() &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           MI.mayStore() && TII->isVMEM(MI))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ff8658e..9d13fac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2014,7 +2014,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
   // RegBankSelect.
   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
-    .legalFor({{S32}, {S64}});
+                        .legalFor({{S32}, {S64}})
+                        .clampScalar(0, S32, S64);
 
   if (ST.hasVOP3PInsts()) {
     SextInReg.lowerFor({{V2S16}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 91fe2a69b..12fd68d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -208,9 +208,13 @@ public:
 
   bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; }
 
-  bool hasFP6BF6ConversionScaleInsts() const { return HasFP6BF6ConversionScaleInsts; }
+  bool hasFP6BF6ConversionScaleInsts() const {
+    return HasFP6BF6ConversionScaleInsts;
+  }
 
-  bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; }
+  bool hasF16BF16ToFP6BF6ConversionScaleInsts() const {
+    return HasF16BF16ToFP6BF6ConversionScaleInsts;
+  }
 
   bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 43e837d..c22b27a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -495,6 +495,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeR600EmitClauseMarkersPass(*PR);
   initializeR600MachineCFGStructurizerPass(*PR);
   initializeGlobalISel(*PR);
+  initializeAMDGPUAsmPrinterPass(*PR);
   initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
   initializeGCNDPPCombineLegacyPass(*PR);
   initializeSILowerI1CopiesLegacyPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
index bfdd8cf..4c88b7e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
@@ -232,7 +232,9 @@ public:
         State.ActiveFlat = true;
 
       // SMEM or VMEM clears hazards
-      if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSMRD(*MI)) {
+      // FIXME: adapt to add FLAT without VALU (so !isLDSDMA())?
+      if ((SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI)) ||
+          SIInstrInfo::isSMRD(*MI)) {
         State.VCCHazard = HazardState::None;
         State.SALUHazards.reset();
         State.VALUHazards.reset();
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 7ef6285..f640747 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -337,8 +337,8 @@ public:
     return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::v2f16);
   }
 
-  bool isPackedFP32InputMods() const {
-    return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::v2f32);
+  bool isPackedVGPRFP32InputMods() const {
+    return isRegOrImmWithInputMods(AMDGPU::VReg_64RegClassID, MVT::v2f32);
   }
 
   bool isVReg() const {
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index aaefe27..1561efe 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -183,10 +183,7 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   if (ST.hasNoDataDepHazard())
     return NoHazard;
 
-  // FIXME: Should flat be considered vmem?
-  if ((SIInstrInfo::isVMEM(*MI) ||
-       SIInstrInfo::isFLAT(*MI))
-      && checkVMEMHazards(MI) > 0)
+  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
     return HazardType;
 
   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
@@ -202,8 +199,8 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
     return HazardType;
 
   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
-       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
-       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
+      checkMAIVALUHazards(MI) > 0)
     return HazardType;
 
   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
@@ -229,9 +226,8 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
     return HazardType;
 
-  if ((SIInstrInfo::isVMEM(*MI) ||
-       SIInstrInfo::isFLAT(*MI) ||
-       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
+  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI)) &&
+      checkMAILdStHazards(MI) > 0)
     return HazardType;
 
   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
@@ -324,7 +320,7 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
   if (ST.hasNoDataDepHazard())
     return WaitStates;
 
-  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
+  if (SIInstrInfo::isVMEM(*MI))
     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
 
   if (SIInstrInfo::isVALU(*MI))
@@ -340,8 +336,8 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
 
   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
-       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
-       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
+      checkMAIVALUHazards(MI) > 0)
     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
 
   if (MI->isInlineAsm())
@@ -369,9 +365,7 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
   if (SIInstrInfo::isMAI(*MI))
     return std::max(WaitStates, checkMAIHazards(MI));
 
-  if (SIInstrInfo::isVMEM(*MI) ||
-      SIInstrInfo::isFLAT(*MI) ||
-      SIInstrInfo::isDS(*MI))
+  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI))
     return std::max(WaitStates, checkMAILdStHazards(MI));
 
   if (ST.hasGFX950Insts() && isPermlane(*MI))
@@ -598,7 +592,7 @@ static bool breaksSMEMSoftClause(MachineInstr *MI) {
 }
 
 static bool breaksVMEMSoftClause(MachineInstr *MI) {
-  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
+  return !SIInstrInfo::isVMEM(*MI);
 }
 
 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
@@ -1250,8 +1244,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
-    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
-        !SIInstrInfo::isFLAT(I))
+    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
       return false;
 
     for (const MachineOperand &Def : MI->defs()) {
@@ -1425,8 +1418,8 @@ static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
       HasLds |= SIInstrInfo::isDS(MI);
-      HasVmem |=
-          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
+      HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
+                 SIInstrInfo::isSegmentSpecificFLAT(MI);
       if (HasLds && HasVmem)
         return true;
     }
@@ -1450,7 +1443,8 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
   auto IsHazardInst = [](const MachineInstr &MI) {
     if (SIInstrInfo::isDS(MI))
       return 1;
-    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
+    if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
+        SIInstrInfo::isSegmentSpecificFLAT(MI))
       return 2;
     return 0;
   };
@@ -1517,8 +1511,8 @@ bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
     if (WaitStates >= NoHazardWaitStates)
       return true;
     // Instructions which cause va_vdst==0 expire hazard
-    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
-           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
+    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
+           SIInstrInfo::isEXP(I);
   };
   auto GetWaitStatesFn = [](const MachineInstr &MI) {
     return SIInstrInfo::isVALU(MI) ? 1 : 0;
@@ -1549,8 +1543,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
   const Register VDSTReg = VDST->getReg();
 
   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
-    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
-        !SIInstrInfo::isDS(I))
+    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
       return false;
     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
   };
@@ -1635,8 +1628,8 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
       return HazardExpired;
 
     // Instructions which cause va_vdst==0 expire hazard
-    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
-        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
+    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
+        SIInstrInfo::isEXP(I) ||
         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
       return HazardExpired;
@@ -1772,8 +1765,8 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
       return HazardExpired;
 
     // Instructions which cause va_vdst==0 expire hazard
-    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
-        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
+    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
+        SIInstrInfo::isEXP(I) ||
         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
          I.getOperand(0).getImm() == 0x0fff))
       return HazardExpired;
@@ -2003,7 +1996,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
     return 0;
 
   auto IsHazardFn = [](const MachineInstr &I) {
-    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
+    if (!SIInstrInfo::isVMEM(I))
       return false;
     return SIInstrInfo::isFPAtomic(I);
   };
@@ -2625,9 +2618,7 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
 
   int WaitStatesNeeded = 0;
 
-  bool IsMem = SIInstrInfo::isVMEM(*MI) ||
-               SIInstrInfo::isFLAT(*MI) ||
-               SIInstrInfo::isDS(*MI);
+  bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
   bool IsVALU = SIInstrInfo::isVALU(*MI);
 
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index 4802ed4..2768e0c 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -303,7 +303,7 @@ void AMDGPUCustomBehaviour::generateWaitCntInfo() {
 bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
   return MCID.TSFlags & SIInstrFlags::MUBUF ||
          MCID.TSFlags & SIInstrFlags::MTBUF ||
-         MCID.TSFlags & SIInstrFlags::MIMG;
+         MCID.TSFlags & SIInstrFlags::MIMG || MCID.TSFlags & SIInstrFlags::FLAT;
 }
 
 // taken from SIInstrInfo::hasModifiersSet()
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index bbc0280..7524747 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -100,7 +100,7 @@ FunctionPass *llvm::createSIFormMemoryClausesLegacyPass() {
 }
 
 static bool isVMEMClauseInst(const MachineInstr &MI) {
-  return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI);
+  return SIInstrInfo::isVMEM(MI);
 }
 
 static bool isSMEMClauseInst(const MachineInstr &MI) {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 88ff04d..67c4cac 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -97,7 +97,8 @@ public:
   HardClauseType getHardClauseType(const MachineInstr &MI) {
     if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) {
       if (ST->getGeneration() == AMDGPUSubtarget::GFX10) {
-        if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+        if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
+            SIInstrInfo::isSegmentSpecificFLAT(MI)) {
           if (ST->hasNSAClauseBug()) {
             const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
             if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA)
@@ -121,7 +122,8 @@ public:
                                               : HARDCLAUSE_MIMG_LOAD
                               : HARDCLAUSE_MIMG_STORE;
         }
-        if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+        if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
+            SIInstrInfo::isSegmentSpecificFLAT(MI)) {
           return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_VMEM_ATOMIC
                                               : HARDCLAUSE_VMEM_LOAD
                               : HARDCLAUSE_VMEM_STORE;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6f5083a..dc11e0a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -168,8 +168,8 @@ static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
     AMDGPU::S_WAIT_KMCNT};
 
 static bool updateVMCntOnly(const MachineInstr &Inst) {
-  return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
-         SIInstrInfo::isFLATScratch(Inst);
+  return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
+         SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
 }
 
 #ifndef NDEBUG
@@ -695,8 +695,8 @@ public:
 #endif // NDEBUG
   }
 
-  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
-  // FLAT instruction.
+  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
+  // instruction.
   WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
     switch (Inst.getOpcode()) {
     case AMDGPU::GLOBAL_INV:
@@ -712,7 +712,7 @@ public:
     static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
         VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
 
-    assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
+    assert(SIInstrInfo::isVMEM(Inst));
     // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
     // these should use VM_CNT.
     if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
@@ -2466,8 +2466,9 @@ bool SIInsertWaitcnts::isPreheaderToFlush(
 }
 
 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
-  return SIInstrInfo::isVMEM(MI) ||
-         (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
+  if (SIInstrInfo::isFLAT(MI))
+    return mayAccessVMEMThroughFlat(MI);
+  return SIInstrInfo::isVMEM(MI);
 }
 
 // Return true if it is better to flush the vmcnt counter in the preheader of
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e6d5486..20bf405 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7777,8 +7777,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       return;
     }
 
-    // If this is a v2s copy src from vgpr16 to sgpr32,
-    // replace vgpr copy to subreg_to_reg
+    // If this is a v2s copy src from 16bit to 32bit,
+    // replace vgpr copy to reg_sequence
     // This can be remove after we have sgpr16 in place
     if (ST.useRealTrue16Insts() && Inst.isCopy() &&
         Inst.getOperand(1).getReg().isVirtual() &&
@@ -7787,11 +7787,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
           32 == RI.getRegSizeInBits(*NewDstRC)) {
         Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+        Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
         BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
-                get(TargetOpcode::SUBREG_TO_REG), NewDstReg)
-            .add(MachineOperand::CreateImm(0))
-            .add(Inst.getOperand(1))
-            .add(MachineOperand::CreateImm(AMDGPU::lo16));
+                get(AMDGPU::IMPLICIT_DEF), Undef);
+        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+                get(AMDGPU::REG_SEQUENCE), NewDstReg)
+            .addReg(Inst.getOperand(1).getReg())
+            .addImm(AMDGPU::lo16)
+            .addReg(Undef)
+            .addImm(AMDGPU::hi16);
         Inst.eraseFromParent();
 
         MRI.replaceRegWith(DstReg, NewDstReg);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 4b97f58..64ab064 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -449,7 +449,7 @@ public:
   }
 
   static bool isVMEM(const MachineInstr &MI) {
-    return isMUBUF(MI) || isMTBUF(MI) || isImage(MI);
+    return isMUBUF(MI) || isMTBUF(MI) || isImage(MI) || isFLAT(MI);
   }
 
   bool isVMEM(uint16_t Opcode) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index adc7cd0..3710a54 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1550,6 +1550,10 @@ class PackedFPInputModsMatchClass <int opSize> : AsmOperandClass {
   let PredicateMethod = "isPackedFP"#opSize#"InputMods";
 }
 
+class PackedVGPRFPInputModsMatchClass <int opSize> : PackedFPInputModsMatchClass<opSize> {
+  let PredicateMethod = "isPackedVGPRFP"#opSize#"InputMods";
+}
+
 class PackedIntInputModsMatchClass <int opSize> : AsmOperandClass {
   let Name = "PackedInt"#opSize#"InputMods";
   let ParserMethod = "parseRegOrImm";
@@ -1559,7 +1563,7 @@ class PackedIntInputModsMatchClass <int opSize> : AsmOperandClass {
 
 def PackedF16InputModsMatchClass : PackedFPInputModsMatchClass<16>;
 def PackedI16InputModsMatchClass : PackedIntInputModsMatchClass<16>;
-def PackedF32InputModsMatchClass : PackedFPInputModsMatchClass<32>;
+def PackedVGPRF32InputModsMatchClass : PackedVGPRFPInputModsMatchClass<32>;
 
 class PackedFPInputMods <PackedFPInputModsMatchClass matchClass> : InputMods <matchClass> {
   let PrintMethod = "printOperandAndFPInputMods";
@@ -1571,7 +1575,7 @@ class PackedIntInputMods <PackedIntInputModsMatchClass matchClass> : InputMods <
 
 def PackedF16InputMods : PackedFPInputMods<PackedF16InputModsMatchClass>;
 def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>;
-def PackedF32InputMods : PackedFPInputMods<PackedF32InputModsMatchClass>;
+def PackedVGPRF32InputMods : PackedFPInputMods<PackedVGPRF32InputModsMatchClass>;
 
 def MFMALdScaleModifierOp : TImmLeaf<i32, [{
   return isUInt<2>(Imm);
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 8686a85..73f7a5c 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1049,7 +1049,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P>
 
   let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VRegSrcForVT<P.Src0VT>.ret,
                      getVOP3SrcForVT<P.Src0VT>.ret);
-  let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
+  let InsVOP3OpSel = (ins PackedVGPRF32InputMods: $src0_modifiers, Src0RC64:$src0,
                           Int32InputMods:     $src1_modifiers, Src1RC64:$src1,
                           FP32InputMods:      $src2_modifiers, Src2RC64:$src2,
                           VGPR_32:$vdst_in,   op_sel0:$op_sel);
diff --git a/llvm/lib/Target/ARC/ARC.h b/llvm/lib/Target/ARC/ARC.h
index 459f79c..dfab692 100644
--- a/llvm/lib/Target/ARC/ARC.h
+++ b/llvm/lib/Target/ARC/ARC.h
@@ -27,6 +27,7 @@ FunctionPass *createARCISelDag(ARCTargetMachine &TM, CodeGenOptLevel OptLevel);
 FunctionPass *createARCExpandPseudosPass();
 FunctionPass *createARCOptAddrMode();
 FunctionPass *createARCBranchFinalizePass();
+void initializeARCAsmPrinterPass(PassRegistry &);
 void initializeARCDAGToDAGISelLegacyPass(PassRegistry &);
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
index cee2fc7..55de401 100644
--- a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
+++ b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
@@ -35,9 +35,11 @@ class ARCAsmPrinter : public AsmPrinter {
   ARCMCInstLower MCInstLowering;
 
 public:
+  static char ID;
+
   explicit ARCAsmPrinter(TargetMachine &TM,
                          std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)),
+      : AsmPrinter(TM, std::move(Streamer), ID),
         MCInstLowering(&OutContext, *this) {}
 
   StringRef getPassName() const override { return "ARC Assembly Printer"; }
@@ -72,6 +74,11 @@ bool ARCAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   return AsmPrinter::runOnMachineFunction(MF);
 }
 
+char ARCAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(ARCAsmPrinter, "arc-asm-printer", "ARC Assmebly Printer", false,
+                false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARCAsmPrinter() {
   RegisterAsmPrinter<ARCAsmPrinter> X(getTheARCTarget());
diff --git a/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/llvm/lib/Target/ARC/ARCTargetMachine.cpp
index f781fa9..3703363 100644
--- a/llvm/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/llvm/lib/Target/ARC/ARCTargetMachine.cpp
@@ -98,6 +98,7 @@ MachineFunctionInfo *ARCTargetMachine::createMachineFunctionInfo(
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARCTarget() {
   RegisterTargetMachine<ARCTargetMachine> X(getTheARCTarget());
   PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeARCAsmPrinterPass(PR);
   initializeARCDAGToDAGISelLegacyPass(PR);
 }
 
diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h
index 0b7045e..3847f4e 100644
--- a/llvm/lib/Target/ARM/ARM.h
+++ b/llvm/lib/Target/ARM/ARM.h
@@ -61,6 +61,7 @@ FunctionPass *createARMFixCortexA57AES1742098Pass();
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
 
+void initializeARMAsmPrinterPass(PassRegistry &);
 void initializeARMBlockPlacementPass(PassRegistry &);
 void initializeARMBranchTargetsPass(PassRegistry &);
 void initializeARMConstantIslandsPass(PassRegistry &);
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index a3bb378..b71a1fa 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -50,7 +50,7 @@ using namespace llvm;
 
 ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
                              std::unique_ptr<MCStreamer> Streamer)
-    : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr), AFI(nullptr),
+    : AsmPrinter(TM, std::move(Streamer), ID), Subtarget(nullptr), AFI(nullptr),
       MCP(nullptr), InConstantPool(false), OptimizationGoals(-1) {}
 
 void ARMAsmPrinter::emitFunctionBodyEnd() {
@@ -2434,6 +2434,11 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
   EmitToStreamer(*OutStreamer, TmpInst);
 }
 
+char ARMAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(ARMAsmPrinter, "arm-asm-printer", "ARM Assembly Printer", false,
+                false)
+
 //===----------------------------------------------------------------------===//
 // Target Registry Stuff
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.h b/llvm/lib/Target/ARM/ARMAsmPrinter.h
index c4503d9..8a7ec4e 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.h
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -29,7 +29,10 @@ namespace ARM {
 }
 
 class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
+public:
+  static char ID;
 
+private:
   /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
   /// make the right decision when printing asm code for different targets.
   const ARMSubtarget *Subtarget;
@@ -152,6 +155,7 @@ public:
   /// the .s file.
   void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
 };
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 2f9720e..80a1e77 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -91,6 +91,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
 
   PassRegistry &Registry = *PassRegistry::getPassRegistry();
   initializeGlobalISel(Registry);
+  initializeARMAsmPrinterPass(Registry);
   initializeARMLoadStoreOptPass(Registry);
   initializeARMPreAllocLoadStoreOptPass(Registry);
   initializeARMParallelDSPPass(Registry);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index dae0838..3569036 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1169,8 +1169,8 @@ int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
       return -1;
 
     const unsigned Size = C->getValue().getZExtValue();
-    const Align DstAlign = *MC->getDestAlign();
-    const Align SrcAlign = *MC->getSourceAlign();
+    const Align DstAlign = MC->getDestAlign().valueOrOne();
+    const Align SrcAlign = MC->getSourceAlign().valueOrOne();
 
     MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                       /*IsVolatile*/ false);
@@ -1184,7 +1184,7 @@ int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
       return -1;
 
     const unsigned Size = C->getValue().getZExtValue();
-    const Align DstAlign = *MS->getDestAlign();
+    const Align DstAlign = MS->getDestAlign().valueOrOne();
 
     MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
                      /*IsZeroMemset*/ false, /*IsVolatile*/ false);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index 28e5840..036a859 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -41,7 +41,7 @@ namespace ARM_AM {
 
   inline const char *getAddrOpcStr(AddrOpc Op) { return Op == sub ? "-" : ""; }
 
-  inline const StringRef getShiftOpcStr(ShiftOpc Op) {
+  inline StringRef getShiftOpcStr(ShiftOpc Op) {
     switch (Op) {
     default: llvm_unreachable("Unknown shift opc!");
     case ARM_AM::asr: return "asr";
diff --git a/llvm/lib/Target/AVR/AVR.h b/llvm/lib/Target/AVR/AVR.h
index 0e67bb4..68ec927 100644
--- a/llvm/lib/Target/AVR/AVR.h
+++ b/llvm/lib/Target/AVR/AVR.h
@@ -31,6 +31,7 @@ FunctionPass *createAVRExpandPseudoPass();
 FunctionPass *createAVRFrameAnalyzerPass();
 FunctionPass *createAVRBranchSelectionPass();
 
+void initializeAVRAsmPrinterPass(PassRegistry &);
 void initializeAVRDAGToDAGISelLegacyPass(PassRegistry &);
 void initializeAVRExpandPseudoPass(PassRegistry &);
 void initializeAVRShiftExpandPass(PassRegistry &);
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index a8621ab..ed537f8 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -39,13 +39,15 @@
 
 #define DEBUG_TYPE "avr-asm-printer"
 
-namespace llvm {
+using namespace llvm;
+
+namespace {
 
 /// An AVR assembly code printer.
 class AVRAsmPrinter : public AsmPrinter {
 public:
   AVRAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), MRI(*TM.getMCRegisterInfo()) {}
+      : AsmPrinter(TM, std::move(Streamer), ID), MRI(*TM.getMCRegisterInfo()) {}
 
   StringRef getPassName() const override { return "AVR Assembly Printer"; }
 
@@ -68,11 +70,15 @@ public:
 
   void emitStartOfAsmFile(Module &M) override;
 
+  static char ID;
+
 private:
   const MCRegisterInfo &MRI;
   bool EmittedStructorSymbolAttrs = false;
 };
 
+} // namespace
+
 void AVRAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
                                  raw_ostream &O) {
   const MachineOperand &MO = MI->getOperand(OpNo);
@@ -324,8 +330,11 @@ void AVRAsmPrinter::emitStartOfAsmFile(Module &M) {
         MCConstantExpr::create(SubTM->getIORegRAMPZ(), MMI->getContext()));
 }
 
-} // end of namespace llvm
+char AVRAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(AVRAsmPrinter, "avr-asm-printer", "AVR Assembly Printer", false,
+                false)
 
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmPrinter() {
-  llvm::RegisterAsmPrinter<llvm::AVRAsmPrinter> X(llvm::getTheAVRTarget());
+  llvm::RegisterAsmPrinter<AVRAsmPrinter> X(getTheAVRTarget());
 }
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index 579f7ac..5eaa642 100644
--- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -92,6 +92,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() {
   RegisterTargetMachine<AVRTargetMachine> X(getTheAVRTarget());
 
   auto &PR = *PassRegistry::getPassRegistry();
+  initializeAVRAsmPrinterPass(PR);
   initializeAVRExpandPseudoPass(PR);
   initializeAVRShiftExpandPass(PR);
   initializeAVRDAGToDAGISelLegacyPass(PR);
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 494445f..2e4819e 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -49,7 +49,9 @@ class BPFAsmParser : public MCTargetAsmParser {
   bool equalIsAsmAssignment() override { return false; }
   // "*" is used for dereferencing memory that it will be the start of
   // statement.
-  bool starIsStartOfStatement() override { return true; }
+  bool tokenIsStartOfStatement(AsmToken::TokenKind Token) override {
+    return Token == AsmToken::Star;
+  }
 
 #define GET_ASSEMBLER_HEADER
 #include "BPFGenAsmMatcher.inc"
diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h
index f07ae4c..68166e5 100644
--- a/llvm/lib/Target/BPF/BPF.h
+++ b/llvm/lib/Target/BPF/BPF.h
@@ -34,6 +34,7 @@ InstructionSelector *createBPFInstructionSelector(const BPFTargetMachine &,
                                                   const BPFSubtarget &,
                                                   const BPFRegisterBankInfo &);
 
+void initializeBPFAsmPrinterPass(PassRegistry &);
 void initializeBPFCheckAndAdjustIRPass(PassRegistry&);
 void initializeBPFDAGToDAGISelLegacyPass(PassRegistry &);
 void initializeBPFMIPeepholePass(PassRegistry &);
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index b3c27a3..5dd71cc 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -37,7 +37,7 @@ class BPFAsmPrinter : public AsmPrinter {
 public:
   explicit BPFAsmPrinter(TargetMachine &TM,
                          std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), BTF(nullptr) {}
+      : AsmPrinter(TM, std::move(Streamer), ID), BTF(nullptr) {}
 
   StringRef getPassName() const override { return "BPF Assembly Printer"; }
   bool doInitialization(Module &M) override;
@@ -49,6 +49,8 @@ public:
 
   void emitInstruction(const MachineInstr *MI) override;
 
+  static char ID;
+
 private:
   BTFDebug *BTF;
 };
@@ -147,6 +149,11 @@ void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) {
   EmitToStreamer(*OutStreamer, TmpInst);
 }
 
+char BPFAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(BPFAsmPrinter, "bpf-asm-printer", "BPF Assembly Printer", false,
+                false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmPrinter() {
   RegisterAsmPrinter<BPFAsmPrinter> X(getTheBPFleTarget());
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 873719e..46ba758 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -45,6 +45,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() {
 
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeGlobalISel(PR);
+  initializeBPFAsmPrinterPass(PR);
   initializeBPFCheckAndAdjustIRPass(PR);
   initializeBPFMIPeepholePass(PR);
   initializeBPFMIPreEmitPeepholePass(PR);
diff --git a/llvm/lib/Target/Hexagon/Hexagon.h b/llvm/lib/Target/Hexagon/Hexagon.h
index 11b6f0d..109aba5 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.h
+++ b/llvm/lib/Target/Hexagon/Hexagon.h
@@ -26,6 +26,7 @@ class Pass;
 extern char &HexagonCopyHoistingID;
 extern char &HexagonExpandCondsetsID;
 extern char &HexagonTfrCleanupID;
+void initializeHexagonAsmPrinterPass(PassRegistry &);
 void initializeHexagonBitSimplifyPass(PassRegistry &);
 void initializeHexagonBranchRelaxationPass(PassRegistry &);
 void initializeHexagonCFGOptimizerPass(PassRegistry &);
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index c6f2503..c7580d2 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -853,6 +853,11 @@ void HexagonAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) {
   EmitSled(MI, SledKind::TAIL_CALL);
 }
 
+char HexagonAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(HexagonAsmPrinter, "hexagon-asm-printer",
+                "Hexagon Assembly Printer", false, false)
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmPrinter() {
   RegisterAsmPrinter<HexagonAsmPrinter> X(getTheHexagonTarget());
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
index b555c88..8e34d2b 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -27,6 +27,10 @@ class raw_ostream;
 class TargetMachine;
 
   class HexagonAsmPrinter : public AsmPrinter {
+  public:
+    static char ID;
+
+  private:
     const HexagonSubtarget *Subtarget = nullptr;
 
     void emitAttributes();
@@ -34,7 +38,7 @@ class TargetMachine;
   public:
     explicit HexagonAsmPrinter(TargetMachine &TM,
                                std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)) {}
+        : AsmPrinter(TM, std::move(Streamer), ID) {}
 
     bool runOnMachineFunction(MachineFunction &Fn) override {
       Subtarget = &Fn.getSubtarget<HexagonSubtarget>();
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index aa3491b..19b7c6a 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -179,6 +179,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
   RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
 
   PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeHexagonAsmPrinterPass(PR);
   initializeHexagonBitSimplifyPass(PR);
   initializeHexagonConstExtendersPass(PR);
   initializeHexagonConstPropagationPass(PR);
diff --git a/llvm/lib/Target/Lanai/Lanai.h b/llvm/lib/Target/Lanai/Lanai.h
index 1ef4462..032e5fe 100644
--- a/llvm/lib/Target/Lanai/Lanai.h
+++ b/llvm/lib/Target/Lanai/Lanai.h
@@ -37,6 +37,7 @@ FunctionPass *createLanaiMemAluCombinerPass();
 // operations.
 FunctionPass *createLanaiSetflagAluCombinerPass();
 
+void initializeLanaiAsmPrinterPass(PassRegistry &);
 void initializeLanaiDAGToDAGISelLegacyPass(PassRegistry &);
 void initializeLanaiMemAluCombinerPass(PassRegistry &);
 
diff --git a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
index c87ad63..1c4fc57 100644
--- a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
+++ b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -38,7 +38,7 @@ class LanaiAsmPrinter : public AsmPrinter {
 public:
   explicit LanaiAsmPrinter(TargetMachine &TM,
                            std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)) {}
+      : AsmPrinter(TM, std::move(Streamer), ID) {}
 
   StringRef getPassName() const override { return "Lanai Assembly Printer"; }
 
@@ -52,6 +52,9 @@ public:
 private:
   void customEmitInstruction(const MachineInstr *MI);
   void emitCallInstruction(const MachineInstr *MI);
+
+public:
+  static char ID;
 };
 } // end of anonymous namespace
 
@@ -233,6 +236,11 @@ bool LanaiAsmPrinter::isBlockOnlyReachableByFallthrough(
   return !I->isBarrier();
 }
 
+char LanaiAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(LanaiAsmPrinter, "lanai-asm-printer", "Lanai Assembly Printer",
+                false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiAsmPrinter() {
   RegisterAsmPrinter<LanaiAsmPrinter> X(getTheLanaiTarget());
diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
index e728652..7f94e77 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -31,6 +31,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTarget() {
   RegisterTargetMachine<LanaiTargetMachine> registered_target(
       getTheLanaiTarget());
   PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeLanaiAsmPrinterPass(PR);
   initializeLanaiDAGToDAGISelLegacyPass(PR);
   initializeLanaiMemAluCombinerPass(PR);
 }
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index faac1a2..39c5e03 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -517,9 +517,7 @@ public:
     int64_t Imm;
     LoongArchMCExpr::Specifier VK = LoongArchMCExpr::VK_None;
     bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
-    bool IsValidKind =
-        VK == LoongArchMCExpr::VK_None || VK == LoongArchMCExpr::VK_CALL ||
-        VK == LoongArchMCExpr::VK_CALL_PLT || VK == ELF::R_LARCH_B26;
+    bool IsValidKind = VK == LoongArchMCExpr::VK_None || VK == ELF::R_LARCH_B26;
     return IsConstantImm
                ? isShiftedInt<26, 2>(Imm) && IsValidKind
                : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
@@ -793,7 +791,6 @@ ParseStatus LoongArchAsmParser::parseSImm26Operand(OperandVector &Operands) {
 
   MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
   Res = MCSymbolRefExpr::create(Sym, getContext());
-  Res = LoongArchMCExpr::create(Res, LoongArchMCExpr::VK_CALL, getContext());
   Operands.push_back(LoongArchOperand::createImm(Res, S, E));
   return ParseStatus::Success;
 }
diff --git a/llvm/lib/Target/LoongArch/LoongArch.h b/llvm/lib/Target/LoongArch/LoongArch.h
index db60523..b9dc782 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.h
+++ b/llvm/lib/Target/LoongArch/LoongArch.h
@@ -40,6 +40,7 @@ FunctionPass *createLoongArchMergeBaseOffsetOptPass();
 FunctionPass *createLoongArchOptWInstrsPass();
 FunctionPass *createLoongArchPreRAExpandPseudoPass();
 FunctionPass *createLoongArchExpandPseudoPass();
+void initializeLoongArchAsmPrinterPass(PassRegistry &);
 void initializeLoongArchDAGToDAGISelLegacyPass(PassRegistry &);
 void initializeLoongArchDeadRegisterDefinitionsPass(PassRegistry &);
 void initializeLoongArchExpandAtomicPseudoPass(PassRegistry &);
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
index 895a8e2..0672570 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
@@ -297,6 +297,11 @@ bool LoongArchAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
+char LoongArchAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(LoongArchAsmPrinter, "loongarch-asm-printer",
+                "LoongArch Assembly Printer", false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchAsmPrinter() {
   RegisterAsmPrinter<LoongArchAsmPrinter> X(getTheLoongArch32Target());
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
index 312631e..b2373a9 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
@@ -22,12 +22,16 @@
 namespace llvm {
 
 class LLVM_LIBRARY_VISIBILITY LoongArchAsmPrinter : public AsmPrinter {
+public:
+  static char ID;
+
+private:
   const MCSubtargetInfo *STI;
 
 public:
   explicit LoongArchAsmPrinter(TargetMachine &TM,
                                std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), STI(TM.getMCSubtargetInfo()) {}
+      : AsmPrinter(TM, std::move(Streamer), ID), STI(TM.getMCSubtargetInfo()) {}
 
   StringRef getPassName() const override {
     return "LoongArch Assembly Printer";
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index e426892..fe08c10 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1815,24 +1815,24 @@ def : Pat<(v4f32 (uint_to_fp v4i64:$vj)),
 // XVFTINTRZ_{W_S/L_D}
 def : Pat<(v8i32 (fp_to_sint v8f32:$vj)), (XVFTINTRZ_W_S v8f32:$vj)>;
 def : Pat<(v4i64 (fp_to_sint v4f64:$vj)), (XVFTINTRZ_L_D v4f64:$vj)>;
-def : Pat<(v4i64 (fp_to_sint v4f32:$vj)),
-          (VEXT2XV_D_W (SUBREG_TO_REG (i64 0), (VFTINTRZ_W_S v4f32:$vj),
-                                      sub_128))>;
-def : Pat<(v4i32 (fp_to_sint (v4f64 LASX256:$vj))),
-          (EXTRACT_SUBREG (XVFTINTRZ_W_S (XVFCVT_S_D (XVPERMI_D v4f64:$vj, 238),
-                                                     v4f64:$vj)),
-                          sub_128)>;
+def : Pat<(v4i64(fp_to_sint v4f32:$vj)), (VEXT2XV_D_W(SUBREG_TO_REG(i64 0),
+                                             (VFTINTRZ_W_S v4f32:$vj),
+                                             sub_128))>;
+def : Pat<(v4i32(fp_to_sint v4f64:$vj)),
+          (EXTRACT_SUBREG(XVPICKEV_W(XVPERMI_D(XVFTINTRZ_L_D v4f64:$vj), 238),
+               (XVFTINTRZ_L_D v4f64:$vj)),
+              sub_128)>;
 
 // XVFTINTRZ_{W_SU/L_DU}
 def : Pat<(v8i32 (fp_to_uint v8f32:$vj)), (XVFTINTRZ_WU_S v8f32:$vj)>;
 def : Pat<(v4i64 (fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)>;
-def : Pat<(v4i64 (fp_to_uint v4f32:$vj)),
-          (VEXT2XV_DU_WU (SUBREG_TO_REG (i64 0), (VFTINTRZ_WU_S v4f32:$vj),
-                                        sub_128))>;
-def : Pat<(v4i32 (fp_to_uint (v4f64 LASX256:$vj))),
-          (EXTRACT_SUBREG (XVFTINTRZ_W_S (XVFCVT_S_D (XVPERMI_D v4f64:$vj, 238),
-                                                     v4f64:$vj)),
-                          sub_128)>;
+def : Pat<(v4i64(fp_to_uint v4f32:$vj)), (VEXT2XV_DU_WU(SUBREG_TO_REG(i64 0),
+                                             (VFTINTRZ_WU_S v4f32:$vj),
+                                             sub_128))>;
+def : Pat<(v4i32(fp_to_uint v4f64:$vj)),
+          (EXTRACT_SUBREG(XVPICKEV_W(XVPERMI_D(XVFTINTRZ_LU_D v4f64:$vj), 238),
+               (XVFTINTRZ_LU_D v4f64:$vj)),
+              sub_128)>;
 
 // XVPERMI_Q
 foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in
diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
index ae76463..24bf5a1 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
@@ -35,10 +35,8 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
     Kind = LoongArchMCExpr::VK_None;
     break;
   case LoongArchII::MO_CALL:
-    Kind = LoongArchMCExpr::VK_CALL;
-    break;
   case LoongArchII::MO_CALL_PLT:
-    Kind = LoongArchMCExpr::VK_CALL_PLT;
+    Kind = ELF::R_LARCH_B26;
     break;
   case LoongArchII::MO_PCREL_HI:
     Kind = ELF::R_LARCH_PCALA_HI20;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index d16fb5c..5770a76 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -147,8 +147,6 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
       FixupKind = LoongArch::fixup_loongarch_b21;
       break;
     case ELF::R_LARCH_B26:
-    case LoongArchMCExpr::VK_CALL:
-    case LoongArchMCExpr::VK_CALL_PLT:
       FixupKind = LoongArch::fixup_loongarch_b26;
       break;
     case ELF::R_LARCH_ABS_HI20:
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index ac39b1f..994d248 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -32,7 +32,7 @@ const LoongArchMCExpr *LoongArchMCExpr::create(const MCExpr *Expr, uint16_t S,
 
 void LoongArchMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   Specifier S = getSpecifier();
-  bool HasVariant = S != VK_None && S != VK_CALL;
+  bool HasVariant = S != VK_None && S != ELF::R_LARCH_B26;
 
   if (HasVariant)
     OS << '%' << getSpecifierName(specifier) << '(';
@@ -63,14 +63,10 @@ StringRef LoongArchMCExpr::getSpecifierName(uint16_t S) {
   switch (S) {
   default:
     llvm_unreachable("Invalid ELF symbol kind");
-  case VK_CALL_PLT:
-    return "plt";
   case ELF::R_LARCH_B16:
     return "b16";
   case ELF::R_LARCH_B21:
     return "b21";
-  case ELF::R_LARCH_B26:
-    return "b26";
   case ELF::R_LARCH_ABS_HI20:
     return "abs_hi20";
   case ELF::R_LARCH_ABS_LO12:
@@ -176,7 +172,7 @@ StringRef LoongArchMCExpr::getSpecifierName(uint16_t S) {
 
 LoongArchMCExpr::Specifier LoongArchMCExpr::parseSpecifier(StringRef name) {
   return StringSwitch<LoongArchMCExpr::Specifier>(name)
-      .Case("plt", VK_CALL_PLT)
+      .Case("plt", ELF::R_LARCH_B26)
       .Case("b16", ELF::R_LARCH_B16)
       .Case("b21", ELF::R_LARCH_B21)
       .Case("b26", ELF::R_LARCH_B26)
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index aac4997..06ebbc0 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -23,11 +23,7 @@ class StringRef;
 class LoongArchMCExpr : public MCTargetExpr {
 public:
   using Specifier = uint16_t;
-  enum {
-    VK_None,
-    VK_CALL = 1000, // larger than relocation types
-    VK_CALL_PLT,
-  };
+  enum { VK_None };
 
 private:
   const MCExpr *Expr;
diff --git a/llvm/lib/Target/M68k/M68k.h b/llvm/lib/Target/M68k/M68k.h
index 5db9d79..0dbca6f 100644
--- a/llvm/lib/Target/M68k/M68k.h
+++ b/llvm/lib/Target/M68k/M68k.h
@@ -46,6 +46,7 @@ InstructionSelector *
 createM68kInstructionSelector(const M68kTargetMachine &, const M68kSubtarget &,
                               const M68kRegisterBankInfo &);
 
+void initializeM68kAsmPrinterPass(PassRegistry &);
 void initializeM68kDAGToDAGISelLegacyPass(PassRegistry &);
 void initializeM68kExpandPseudoPass(PassRegistry &);
 void initializeM68kGlobalBaseRegPass(PassRegistry &);
diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
index f748450..0437400 100644
--- a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
+++ b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
@@ -195,6 +195,11 @@ void M68kAsmPrinter::emitStartOfAsmFile(Module &M) {
 
 void M68kAsmPrinter::emitEndOfAsmFile(Module &M) {}
 
+char M68kAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(M68kAsmPrinter, "m68k-asm-printer", "M68k Assembly Printer",
+                false, false)
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kAsmPrinter() {
   RegisterAsmPrinter<M68kAsmPrinter> X(getTheM68kTarget());
 }
diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.h b/llvm/lib/Target/M68k/M68kAsmPrinter.h
index 7b4dbfe..bd15987 100644
--- a/llvm/lib/Target/M68k/M68kAsmPrinter.h
+++ b/llvm/lib/Target/M68k/M68kAsmPrinter.h
@@ -49,13 +49,15 @@ class LLVM_LIBRARY_VISIBILITY M68kAsmPrinter
   void printAbsMem(const MachineInstr *MI, unsigned OpNum, raw_ostream &OS);
 
 public:
+  static char ID;
+
   const M68kSubtarget *Subtarget;
   const M68kMachineFunctionInfo *MMFI;
   std::unique_ptr<M68kMCInstLower> MCInstLowering;
 
   explicit M68kAsmPrinter(TargetMachine &TM,
                           std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)) {
+      : AsmPrinter(TM, std::move(Streamer), ID) {
     Subtarget = static_cast<M68kTargetMachine &>(TM).getSubtargetImpl();
   }
 
diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.cpp b/llvm/lib/Target/M68k/M68kTargetMachine.cpp
index 2f5a2e8..ce15ee6 100644
--- a/llvm/lib/Target/M68k/M68kTargetMachine.cpp
+++ b/llvm/lib/Target/M68k/M68kTargetMachine.cpp
@@ -37,6 +37,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kTarget() {
   RegisterTargetMachine<M68kTargetMachine> X(getTheM68kTarget());
   auto *PR = PassRegistry::getPassRegistry();
   initializeGlobalISel(*PR);
+  initializeM68kAsmPrinterPass(*PR);
   initializeM68kDAGToDAGISelLegacyPass(*PR);
   initializeM68kExpandPseudoPass(*PR);
   initializeM68kGlobalBaseRegPass(*PR);
diff --git a/llvm/lib/Target/MSP430/MSP430.h b/llvm/lib/Target/MSP430/MSP430.h
index 0198359..3680d18 100644
--- a/llvm/lib/Target/MSP430/MSP430.h
+++ b/llvm/lib/Target/MSP430/MSP430.h
@@ -43,6 +43,7 @@ FunctionPass *createMSP430ISelDag(MSP430TargetMachine &TM,
 
 FunctionPass *createMSP430BranchSelectionPass();
 
+void initializeMSP430AsmPrinterPass(PassRegistry &);
 void initializeMSP430DAGToDAGISelLegacyPass(PassRegistry &);
 
 } // namespace llvm
diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 2731987..44e55b6 100644
--- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -36,7 +36,7 @@ namespace {
   class MSP430AsmPrinter : public AsmPrinter {
   public:
     MSP430AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
-        : AsmPrinter(TM, std::move(Streamer)) {}
+        : AsmPrinter(TM, std::move(Streamer), ID) {}
 
     StringRef getPassName() const override { return "MSP430 Assembly Printer"; }
 
@@ -54,6 +54,8 @@ namespace {
     void emitInstruction(const MachineInstr *MI) override;
 
     void EmitInterruptVectorSection(MachineFunction &ISR);
+
+    static char ID;
   };
 } // end of anonymous namespace
 
@@ -181,6 +183,11 @@ bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   return false;
 }
 
+char MSP430AsmPrinter::ID = 0;
+
+INITIALIZE_PASS(MSP430AsmPrinter, "msp430-asm-printer",
+                "MSP430 Assembly Printer", false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430AsmPrinter() {
   RegisterAsmPrinter<MSP430AsmPrinter> X(getTheMSP430Target());
diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
index 6b69f2c..763a2db 100644
--- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -25,6 +25,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Target() {
   // Register the target.
   RegisterTargetMachine<MSP430TargetMachine> X(getTheMSP430Target());
   PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeMSP430AsmPrinterPass(PR);
   initializeMSP430DAGToDAGISelLegacyPass(PR);
 }
 
diff --git a/llvm/lib/Target/Mips/Mips.h b/llvm/lib/Target/Mips/Mips.h
index f99dadd..60d5114 100644
--- a/llvm/lib/Target/Mips/Mips.h
+++ b/llvm/lib/Target/Mips/Mips.h
@@ -57,6 +57,7 @@ createMipsInstructionSelector(const MipsTargetMachine &, const MipsSubtarget &,
                               const MipsRegisterBankInfo &);
 
 void initializeMicroMipsSizeReducePass(PassRegistry &);
+void initializeMipsAsmPrinterPass(PassRegistry &);
 void initializeMipsBranchExpansionPass(PassRegistry &);
 void initializeMipsDAGToDAGISelLegacyPass(PassRegistry &);
 void initializeMipsDelaySlotFillerPass(PassRegistry &);
diff --git a/llvm/lib/Target/Mips/Mips.td b/llvm/lib/Target/Mips/Mips.td
index 99415bc..b346ba9 100644
--- a/llvm/lib/Target/Mips/Mips.td
+++ b/llvm/lib/Target/Mips/Mips.td
@@ -243,11 +243,11 @@ def ImplP5600 : SubtargetFeature<"p5600", "ProcImpl",
 // same CPU architecture.
 def ImplI6400
     : SubtargetFeature<"i6400", "ProcImpl", "MipsSubtarget::CPU::I6400",
-                       "MIPS I6400 Processor", [FeatureMips64r6]>;
+                       "MIPS I6400 Processor", [FeatureMips64r6, FeatureMSA]>;
 
 def ImplI6500
     : SubtargetFeature<"i6500", "ProcImpl", "MipsSubtarget::CPU::I6500",
-                       "MIPS I6500 Processor", [FeatureMips64r6]>;
+                       "MIPS I6500 Processor", [FeatureMips64r6, FeatureMSA]>;
 
 class Proc<string Name, list<SubtargetFeature> Features>
  : ProcessorModel<Name, MipsGenericModel, Features>;
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 8233fdf..1a3e99e 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -1292,6 +1292,11 @@ bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const {
           || Opcode == Mips::LONG_BRANCH_DADDiu2Op);
 }
 
+char MipsAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(MipsAsmPrinter, "mips-asm-printer", "Mips Assembly Printer",
+                false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsAsmPrinter() {
   RegisterAsmPrinter<MipsAsmPrinter> X(getTheMipsTarget());
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h
index 5a878e4b..bbaa3b3 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.h
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -117,13 +117,15 @@ private:
   bool isLongBranchPseudo(int Opcode) const;
 
 public:
+  static char ID;
+
   const MipsSubtarget *Subtarget;
   const MipsFunctionInfo *MipsFI;
   MipsMCInstLower MCInstLowering;
 
   explicit MipsAsmPrinter(TargetMachine &TM,
                           std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(*this) {}
+      : AsmPrinter(TM, std::move(Streamer), ID), MCInstLowering(*this) {}
 
   StringRef getPassName() const override { return "Mips Assembly Printer"; }
 
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 55fc636..72f21a0 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -521,7 +521,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
   setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND,
-                       ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL});
+                       ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL,
+                       ISD::SIGN_EXTEND});
 
   if (Subtarget.isGP64bit())
     setMaxAtomicSizeInBitsSupported(64);
@@ -1221,6 +1222,37 @@ static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
                      DAG.getConstant(SMSize, DL, MVT::i32));
 }
 
+static SDValue performSignExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const MipsSubtarget &Subtarget) {
+  if (DCI.Level != AfterLegalizeDAG || !Subtarget.isGP64bit()) {
+    return SDValue();
+  }
+
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // Pattern match XOR.
+  // $dst = sign_extend (xor (trunc $src, i32), imm)
+  // => $dst = xor (signext_inreg $src, i32), imm
+  if (N0.getOpcode() == ISD::XOR &&
+      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue TruncateSource = N0.getOperand(0).getOperand(0);
+    auto *ConstantOperand = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+
+    SDValue FirstOperand =
+        DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N0), VT, TruncateSource,
+                    DAG.getValueType(N0.getOperand(0).getValueType()));
+
+    int64_t ConstImm = ConstantOperand->getSExtValue();
+    return DAG.getNode(ISD::XOR, SDLoc(N0), VT, FirstOperand,
+                       DAG.getConstant(ConstImm, SDLoc(N0), VT));
+  }
+
+  return SDValue();
+}
+
 SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
   const {
   SelectionDAG &DAG = DCI.DAG;
@@ -1246,6 +1278,8 @@ SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
     return performSHLCombine(N, DAG, DCI, Subtarget);
   case ISD::SUB:
     return performSUBCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND:
+    return performSignExtendCombine(N, DAG, DCI, Subtarget);
   }
 
   return SDValue();
diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index 9c6cccb..30b4d50 100644
--- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -60,6 +60,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() {
 
   PassRegistry *PR = PassRegistry::getPassRegistry();
   initializeGlobalISel(*PR);
+  initializeMipsAsmPrinterPass(*PR);
   initializeMipsDelaySlotFillerPass(*PR);
   initializeMipsBranchExpansionPass(*PR);
   initializeMicroMipsSizeReducePass(*PR);
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 1da979d..83090ab 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -59,6 +59,7 @@ MachineFunctionPass *createNVPTXForwardParamsPass();
 void initializeNVVMReflectLegacyPassPass(PassRegistry &);
 void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
+void initializeNVPTXAsmPrinterPass(PassRegistry &);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
 void initializeNVPTXAtomicLowerPass(PassRegistry &);
 void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
index a82c3aa..430fcd7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h
@@ -85,16 +85,23 @@ public:
 
 // Wrapper around ExternalAAWrapperPass so that the default
 // constructor gets the callback.
+// Note that NVPTXAA will run before BasicAA for compile time considerations.
 class NVPTXExternalAAWrapper : public ExternalAAWrapperPass {
 public:
   static char ID;
 
+  bool runEarly() override { return true; }
+
   NVPTXExternalAAWrapper()
       : ExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
           if (auto *WrapperPass =
                   P.getAnalysisIfAvailable<NVPTXAAWrapperPass>())
             AAR.addAAResult(WrapperPass->getResult());
         }) {}
+
+  StringRef getPassName() const override {
+    return "NVPTX Address space based Alias Analysis Wrapper";
+  }
 };
 
 ImmutablePass *createNVPTXAAWrapperPass();
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 2f4b109..0e5207cf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -2013,6 +2013,11 @@ void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, unsigned OpNum,
   }
 }
 
+char NVPTXAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(NVPTXAsmPrinter, "nvptx-asm-printer", "NVPTX Assembly Printer",
+                false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXAsmPrinter() {
   RegisterAsmPrinter<NVPTXAsmPrinter> X(getTheNVPTXTarget32());
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 65938a9..f359318 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -145,9 +145,12 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
 
   friend class AggBuffer;
 
-private:
+public:
+  static char ID;
+
   StringRef getPassName() const override { return "NVPTX Assembly Printer"; }
 
+private:
   const Function *F;
 
   void emitStartOfAsmFile(Module &M) override;
@@ -239,7 +242,7 @@ private:
 
 public:
   NVPTXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)),
+      : AsmPrinter(TM, std::move(Streamer), ID),
         EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() ==
                     NVPTX::CUDA) {}
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index dc3afc1..85d28a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -99,6 +99,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
   initializeNVVMIntrRangePass(PR);
   initializeGenericToNVVMLegacyPassPass(PR);
   initializeNVPTXAllocaHoistingPass(PR);
+  initializeNVPTXAsmPrinterPass(PR);
   initializeNVPTXAssignValidGlobalNamesPass(PR);
   initializeNVPTXAtomicLowerPass(PR);
   initializeNVPTXLowerArgsLegacyPassPass(PR);
@@ -233,7 +234,7 @@ MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo(
                                                                     F, STI);
 }
 
-void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
+void NVPTXTargetMachine::registerEarlyDefaultAliasAnalyses(AAManager &AAM) {
   AAM.registerFunctionAnalysis<NVPTXAA>();
 }
 
@@ -348,10 +349,7 @@ void NVPTXPassConfig::addIRPasses() {
   disablePass(&RemoveLoadsIntoFakeUsesID);
 
   addPass(createNVPTXAAWrapperPass());
-  addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
-    if (auto *WrapperPass = P.getAnalysisIfAvailable<NVPTXAAWrapperPass>())
-      AAR.addAAResult(WrapperPass->getResult());
-  }));
+  addPass(createNVPTXExternalAAWrapperPass());
 
   // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
   // it here does nothing.  But since we need it for correctness when lowering
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index 34d841c..118a01a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -64,7 +64,7 @@ public:
   createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
                             const TargetSubtargetInfo *STI) const override;
 
-  void registerDefaultAliasAnalyses(AAManager &AAM) override;
+  void registerEarlyDefaultAliasAnalyses(AAManager &AAM) override;
 
   void registerPassBuilderCallbacks(PassBuilder &PB) override;
 
diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h
index d95ee56..124dac4 100644
--- a/llvm/lib/Target/PowerPC/PPC.h
+++ b/llvm/lib/Target/PowerPC/PPC.h
@@ -78,6 +78,8 @@ class ModulePass;
   void initializePPCExpandAtomicPseudoPass(PassRegistry &);
   void initializePPCCTRLoopsPass(PassRegistry &);
   void initializePPCDAGToDAGISelLegacyPass(PassRegistry &);
+  void initializePPCLinuxAsmPrinterPass(PassRegistry &);
+  void initializePPCAIXAsmPrinterPass(PassRegistry &);
 
   extern char &PPCVSXFMAMutateID;
 
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 85155c9..0fe615a 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -159,8 +159,8 @@ protected:
 
 public:
   explicit PPCAsmPrinter(TargetMachine &TM,
-                         std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)) {}
+                         std::unique_ptr<MCStreamer> Streamer, char &ID)
+      : AsmPrinter(TM, std::move(Streamer), ID) {}
 
   StringRef getPassName() const override { return "PowerPC Assembly Printer"; }
 
@@ -216,9 +216,11 @@ public:
 /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
 class PPCLinuxAsmPrinter : public PPCAsmPrinter {
 public:
+  static char ID;
+
   explicit PPCLinuxAsmPrinter(TargetMachine &TM,
                               std::unique_ptr<MCStreamer> Streamer)
-      : PPCAsmPrinter(TM, std::move(Streamer)) {}
+      : PPCAsmPrinter(TM, std::move(Streamer), ID) {}
 
   StringRef getPassName() const override {
     return "Linux PPC Assembly Printer";
@@ -262,8 +264,10 @@ private:
   uint64_t getAliasOffset(const Constant *C);
 
 public:
+  static char ID;
+
   PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
-      : PPCAsmPrinter(TM, std::move(Streamer)) {
+      : PPCAsmPrinter(TM, std::move(Streamer), ID) {
     if (MAI->isLittleEndian())
       report_fatal_error(
           "cannot create AIX PPC Assembly Printer for a little-endian target");
@@ -2219,6 +2223,11 @@ void PPCLinuxAsmPrinter::emitFunctionBodyEnd() {
   }
 }
 
+char PPCLinuxAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(PPCLinuxAsmPrinter, "ppc-linux-asm-printer",
+                "Linux PPC Assembly Printer", false, false)
+
 void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV,
                                    MCSymbol *GVSym) const {
   MCSymbolAttr LinkageAttr = MCSA_Invalid;
@@ -3369,6 +3378,11 @@ void PPCAIXAsmPrinter::emitModuleCommandLines(Module &M) {
   OutStreamer->emitXCOFFCInfoSym(".GCC.command.line", RSOS.str());
 }
 
+char PPCAIXAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(PPCAIXAsmPrinter, "ppc-aix-asm-printer",
+                "AIX PPC Assembly Printer", false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmPrinter() {
   TargetRegistry::RegisterAsmPrinter(getThePPC32Target(),
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 71b874a..04cb8ea 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -144,6 +144,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() {
   initializeGlobalISel(PR);
   initializePPCCTRLoopsPass(PR);
   initializePPCDAGToDAGISelLegacyPass(PR);
+  initializePPCLinuxAsmPrinterPass(PR);
+  initializePPCAIXAsmPrinterPass(PR);
 }
 
 static bool isLittleEndianTriple(const Triple &T) {
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 06de862..ae94101 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -116,6 +116,8 @@ void initializeRISCVVLOptimizerPass(PassRegistry &);
 
 FunctionPass *createRISCVVMV0EliminationPass();
 void initializeRISCVVMV0EliminationPass(PassRegistry &);
+
+void initializeRISCVAsmPrinterPass(PassRegistry &);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 7249eca..e40bd18 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -55,12 +55,16 @@ extern const SubtargetFeatureKV RISCVFeatureKV[RISCV::NumSubtargetFeatures];
 
 namespace {
 class RISCVAsmPrinter : public AsmPrinter {
+public:
+  static char ID;
+
+private:
   const RISCVSubtarget *STI;
 
 public:
   explicit RISCVAsmPrinter(TargetMachine &TM,
                            std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)) {}
+      : AsmPrinter(TM, std::move(Streamer), ID) {}
 
   StringRef getPassName() const override { return "RISC-V Assembly Printer"; }
 
@@ -1210,3 +1214,8 @@ void RISCVAsmPrinter::emitMachineConstantPoolValue(
   uint64_t Size = getDataLayout().getTypeAllocSize(RCPV->getType());
   OutStreamer->emitValue(Expr, Size);
 }
+
+char RISCVAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(RISCVAsmPrinter, "riscv-asm-printer", "RISC-V Assembly Printer",
+                false, false)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 86f8873..6319d0e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1752,6 +1752,13 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::riscv_seg6_load:
   case Intrinsic::riscv_seg7_load:
   case Intrinsic::riscv_seg8_load:
+  case Intrinsic::riscv_seg2_load_mask:
+  case Intrinsic::riscv_seg3_load_mask:
+  case Intrinsic::riscv_seg4_load_mask:
+  case Intrinsic::riscv_seg5_load_mask:
+  case Intrinsic::riscv_seg6_load_mask:
+  case Intrinsic::riscv_seg7_load_mask:
+  case Intrinsic::riscv_seg8_load_mask:
     return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false,
                                /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
   case Intrinsic::riscv_seg2_store:
@@ -1765,6 +1772,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 2,
                                /*IsStore*/ true,
                                /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
+  case Intrinsic::riscv_seg2_store_mask:
+  case Intrinsic::riscv_seg3_store_mask:
+  case Intrinsic::riscv_seg4_store_mask:
+  case Intrinsic::riscv_seg5_store_mask:
+  case Intrinsic::riscv_seg6_store_mask:
+  case Intrinsic::riscv_seg7_store_mask:
+  case Intrinsic::riscv_seg8_store_mask:
+    // Operands are (vec, ..., vec, ptr, mask, vl)
+    return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3,
+                               /*IsStore*/ true,
+                               /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
   case Intrinsic::riscv_vle:
   case Intrinsic::riscv_vle_mask:
   case Intrinsic::riscv_vleff:
@@ -6971,7 +6989,7 @@ static bool hasPassthruOp(unsigned Opcode) {
          Opcode <= RISCVISD::LAST_STRICTFP_OPCODE &&
          "not a RISC-V target specific op");
   static_assert(
-      RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 134 &&
+      RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 137 &&
       RISCVISD::LAST_STRICTFP_OPCODE - RISCVISD::FIRST_STRICTFP_OPCODE == 21 &&
       "adding target specific op should update this function");
   if (Opcode >= RISCVISD::ADD_VL && Opcode <= RISCVISD::VFMAX_VL)
@@ -6995,7 +7013,7 @@ static bool hasMaskOp(unsigned Opcode) {
          Opcode <= RISCVISD::LAST_STRICTFP_OPCODE &&
          "not a RISC-V target specific op");
   static_assert(
-      RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 134 &&
+      RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 137 &&
       RISCVISD::LAST_STRICTFP_OPCODE - RISCVISD::FIRST_STRICTFP_OPCODE == 21 &&
       "adding target specific op should update this function");
   if (Opcode >= RISCVISD::TRUNCATE_VECTOR_VL && Opcode <= RISCVISD::SETCC_VL)
@@ -10579,13 +10597,20 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   case Intrinsic::riscv_seg5_load:
   case Intrinsic::riscv_seg6_load:
   case Intrinsic::riscv_seg7_load:
-  case Intrinsic::riscv_seg8_load: {
+  case Intrinsic::riscv_seg8_load:
+  case Intrinsic::riscv_seg2_load_mask:
+  case Intrinsic::riscv_seg3_load_mask:
+  case Intrinsic::riscv_seg4_load_mask:
+  case Intrinsic::riscv_seg5_load_mask:
+  case Intrinsic::riscv_seg6_load_mask:
+  case Intrinsic::riscv_seg7_load_mask:
+  case Intrinsic::riscv_seg8_load_mask: {
     SDLoc DL(Op);
     static const Intrinsic::ID VlsegInts[7] = {
-        Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3,
-        Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5,
-        Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7,
-        Intrinsic::riscv_vlseg8};
+        Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
+        Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
+        Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
+        Intrinsic::riscv_vlseg8_mask};
     unsigned NF = Op->getNumValues() - 1;
     assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
     MVT XLenVT = Subtarget.getXLenVT();
@@ -10595,7 +10620,19 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                   ContainerVT.getScalarSizeInBits();
     EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
 
-    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+    // Masked: (pointer, mask, vl)
+    // Non-masked: (pointer, vl)
+    bool IsMasked = Op.getNumOperands() > 4;
+    SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+    SDValue Mask =
+        IsMasked ? Op.getOperand(3) : getAllOnesMask(ContainerVT, VL, DL, DAG);
+    MVT MaskVT = Mask.getSimpleValueType();
+    if (MaskVT.isFixedLengthVector()) {
+      MVT MaskContainerVT =
+          ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
+      Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
+    }
+
     SDValue IntID = DAG.getTargetConstant(VlsegInts[NF - 2], DL, XLenVT);
     auto *Load = cast<MemIntrinsicSDNode>(Op);
 
@@ -10605,7 +10642,10 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
         IntID,
         DAG.getUNDEF(VecTupTy),
         Op.getOperand(2),
+        Mask,
         VL,
+        DAG.getTargetConstant(
+            RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT),
         DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
     SDValue Result =
         DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
@@ -10665,15 +10705,39 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   case Intrinsic::riscv_seg5_store:
   case Intrinsic::riscv_seg6_store:
   case Intrinsic::riscv_seg7_store:
-  case Intrinsic::riscv_seg8_store: {
+  case Intrinsic::riscv_seg8_store:
+  case Intrinsic::riscv_seg2_store_mask:
+  case Intrinsic::riscv_seg3_store_mask:
+  case Intrinsic::riscv_seg4_store_mask:
+  case Intrinsic::riscv_seg5_store_mask:
+  case Intrinsic::riscv_seg6_store_mask:
+  case Intrinsic::riscv_seg7_store_mask:
+  case Intrinsic::riscv_seg8_store_mask: {
     SDLoc DL(Op);
     static const Intrinsic::ID VssegInts[] = {
-        Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3,
-        Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5,
-        Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7,
-        Intrinsic::riscv_vsseg8};
-    // Operands are (chain, int_id, vec*, ptr, vl)
-    unsigned NF = Op->getNumOperands() - 4;
+        Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+        Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+        Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+        Intrinsic::riscv_vsseg8_mask};
+
+    bool IsMasked = false;
+    switch (IntNo) {
+    case Intrinsic::riscv_seg2_store_mask:
+    case Intrinsic::riscv_seg3_store_mask:
+    case Intrinsic::riscv_seg4_store_mask:
+    case Intrinsic::riscv_seg5_store_mask:
+    case Intrinsic::riscv_seg6_store_mask:
+    case Intrinsic::riscv_seg7_store_mask:
+    case Intrinsic::riscv_seg8_store_mask:
+      IsMasked = true;
+      break;
+    default:
+      break;
+    }
+
+    // Non-masked: (chain, int_id, vec*, ptr, vl)
+    // Masked: (chain, int_id, vec*, ptr, mask, vl)
+    unsigned NF = Op->getNumOperands() - (IsMasked ? 5 : 4);
     assert(NF >= 2 && NF <= 8 && "Unexpected seg number");
     MVT XLenVT = Subtarget.getXLenVT();
     MVT VT = Op->getOperand(2).getSimpleValueType();
@@ -10682,7 +10746,16 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                   ContainerVT.getScalarSizeInBits();
     EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF);
 
-    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+    SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+    SDValue Mask = IsMasked ? Op.getOperand(Op.getNumOperands() - 2)
+                            : getAllOnesMask(ContainerVT, VL, DL, DAG);
+    MVT MaskVT = Mask.getSimpleValueType();
+    if (MaskVT.isFixedLengthVector()) {
+      MVT MaskContainerVT =
+          ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget);
+      Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget);
+    }
+
     SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT);
     SDValue Ptr = Op->getOperand(NF + 2);
 
@@ -10701,6 +10774,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         IntID,
         StoredVal,
         Ptr,
+        Mask,
         VL,
         DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)};
 
@@ -18101,6 +18175,118 @@ static SDValue performBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
                      DAG.getBuildVector(VT, DL, RHSOps));
 }
 
+static SDValue lowerVQDOT(unsigned Opc, SDValue Op0, SDValue Op1,
+                          const SDLoc &DL, SelectionDAG &DAG,
+                          const RISCVSubtarget &Subtarget) {
+  assert(RISCVISD::VQDOT_VL == Opc || RISCVISD::VQDOTU_VL == Opc ||
+         RISCVISD::VQDOTSU_VL == Opc);
+  MVT VT = Op0.getSimpleValueType();
+  assert(VT == Op1.getSimpleValueType() &&
+         VT.getVectorElementType() == MVT::i32);
+
+  assert(VT.isFixedLengthVector());
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+  SDValue Passthru = convertToScalableVector(
+      ContainerVT, DAG.getConstant(0, DL, VT), DAG, Subtarget);
+  Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget);
+  Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget);
+
+  auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+  const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC;
+  SDValue PolicyOp = DAG.getTargetConstant(Policy, DL, Subtarget.getXLenVT());
+  SDValue LocalAccum = DAG.getNode(Opc, DL, ContainerVT,
+                                   {Op0, Op1, Passthru, Mask, VL, PolicyOp});
+  return convertFromScalableVector(VT, LocalAccum, DAG, Subtarget);
+}
+
+static MVT getQDOTXResultType(MVT OpVT) {
+  ElementCount OpEC = OpVT.getVectorElementCount();
+  assert(OpEC.isKnownMultipleOf(4) && OpVT.getVectorElementType() == MVT::i8);
+  return MVT::getVectorVT(MVT::i32, OpEC.divideCoefficientBy(4));
+}
+
+static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL,
+                                         SelectionDAG &DAG,
+                                         const RISCVSubtarget &Subtarget,
+                                         const RISCVTargetLowering &TLI) {
+  // Note: We intentionally do not check the legality of the reduction type.
+  // We want to handle the m4/m8 *src*  types, and thus need to let illegal
+  // intermediate types flow through here.
+  if (InVec.getValueType().getVectorElementType() != MVT::i32 ||
+      !InVec.getValueType().getVectorElementCount().isKnownMultipleOf(4))
+    return SDValue();
+
+  // reduce (zext a) <--> reduce (mul zext a. zext 1)
+  // reduce (sext a) <--> reduce (mul sext a. sext 1)
+  if (InVec.getOpcode() == ISD::ZERO_EXTEND ||
+      InVec.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue A = InVec.getOperand(0);
+    if (A.getValueType().getVectorElementType() != MVT::i8 ||
+        !TLI.isTypeLegal(A.getValueType()))
+      return SDValue();
+
+    MVT ResVT = getQDOTXResultType(A.getSimpleValueType());
+    A = DAG.getBitcast(ResVT, A);
+    SDValue B = DAG.getConstant(0x01010101, DL, ResVT);
+
+    bool IsSigned = InVec.getOpcode() == ISD::SIGN_EXTEND;
+    unsigned Opc = IsSigned ? RISCVISD::VQDOT_VL : RISCVISD::VQDOTU_VL;
+    return lowerVQDOT(Opc, A, B, DL, DAG, Subtarget);
+  }
+
+  // mul (sext, sext) -> vqdot
+  // mul (zext, zext) -> vqdotu
+  // mul (sext, zext) -> vqdotsu
+  // mul (zext, sext) -> vqdotsu (swapped)
+  // TODO: Improve .vx handling - we end up with a sub-vector insert
+  // which confuses the splat pattern matching.  Also, match vqdotus.vx
+  if (InVec.getOpcode() != ISD::MUL)
+    return SDValue();
+
+  SDValue A = InVec.getOperand(0);
+  SDValue B = InVec.getOperand(1);
+  unsigned Opc = 0;
+  if (A.getOpcode() == B.getOpcode()) {
+    if (A.getOpcode() == ISD::SIGN_EXTEND)
+      Opc = RISCVISD::VQDOT_VL;
+    else if (A.getOpcode() == ISD::ZERO_EXTEND)
+      Opc = RISCVISD::VQDOTU_VL;
+    else
+      return SDValue();
+  } else {
+    if (B.getOpcode() != ISD::ZERO_EXTEND)
+      std::swap(A, B);
+    if (A.getOpcode() != ISD::SIGN_EXTEND || B.getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+    Opc = RISCVISD::VQDOTSU_VL;
+  }
+  assert(Opc);
+
+  if (A.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
+      A.getOperand(0).getValueType() != B.getOperand(0).getValueType() ||
+      !TLI.isTypeLegal(A.getValueType()))
+    return SDValue();
+
+  MVT ResVT = getQDOTXResultType(A.getOperand(0).getSimpleValueType());
+  A = DAG.getBitcast(ResVT, A.getOperand(0));
+  B = DAG.getBitcast(ResVT, B.getOperand(0));
+  return lowerVQDOT(Opc, A, B, DL, DAG, Subtarget);
+}
+
+static SDValue performVECREDUCECombine(SDNode *N, SelectionDAG &DAG,
+                                       const RISCVSubtarget &Subtarget,
+                                       const RISCVTargetLowering &TLI) {
+  if (!Subtarget.hasStdExtZvqdotq())
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue InVec = N->getOperand(0);
+  if (SDValue V = foldReduceOperandViaVQDOT(InVec, DL, DAG, Subtarget, TLI))
+    return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, V);
+  return SDValue();
+}
+
 static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                const RISCVSubtarget &Subtarget,
                                                const RISCVTargetLowering &TLI) {
@@ -19878,8 +20064,11 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
 
     return SDValue();
   }
-  case ISD::CTPOP:
   case ISD::VECREDUCE_ADD:
+    if (SDValue V = performVECREDUCECombine(N, DAG, Subtarget, *this))
+      return V;
+    [[fallthrough]];
+  case ISD::CTPOP:
     if (SDValue V = combineToVCPOP(N, DAG, Subtarget))
       return V;
     break;
@@ -22401,6 +22590,9 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(RI_VUNZIP2A_VL)
   NODE_NAME_CASE(RI_VUNZIP2B_VL)
   NODE_NAME_CASE(RI_VEXTRACT)
+  NODE_NAME_CASE(VQDOT_VL)
+  NODE_NAME_CASE(VQDOTU_VL)
+  NODE_NAME_CASE(VQDOTSU_VL)
   NODE_NAME_CASE(READ_CSR)
   NODE_NAME_CASE(WRITE_CSR)
   NODE_NAME_CASE(SWAP_CSR)
@@ -23925,15 +24117,20 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
   if (N == 1)
     return true;
 
+  using namespace PatternMatch;
+  // Right now we're only recognizing the simplest pattern.
+  uint64_t C;
+  if (match(V, m_CombineOr(m_ConstantInt(C),
+                           m_c_Mul(m_Value(), m_ConstantInt(C)))) &&
+      C && C % N == 0)
+    return true;
+
   if (isPowerOf2_32(N)) {
     KnownBits KB = llvm::computeKnownBits(V, DL);
     return KB.countMinTrailingZeros() >= Log2_32(N);
   }
 
-  using namespace PatternMatch;
-  // Right now we're only recognizing the simplest pattern.
-  uint64_t C;
-  return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0;
+  return false;
 }
 
 /// Lower an interleaved vp.load into a vlsegN intrinsic.
@@ -23965,7 +24162,7 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
 /// TODO: We probably can loosen the dependency on matching extractvalue when
 /// dealing with factor of 2 (extractvalue is still required for most of other
 /// factors though).
-bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad(
+bool RISCVTargetLowering::lowerInterleavedVPLoad(
     VPIntrinsic *Load, Value *Mask,
     ArrayRef<Value *> DeinterleaveResults) const {
   assert(Mask && "Expect a valid mask");
@@ -23974,27 +24171,21 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad(
 
   const unsigned Factor = DeinterleaveResults.size();
 
-  auto *WideVTy = dyn_cast<ScalableVectorType>(Load->getType());
-  // TODO: Support fixed vectors.
-  if (!WideVTy)
+  auto *VTy = dyn_cast<VectorType>(DeinterleaveResults[0]->getType());
+  if (!VTy)
     return false;
 
-  unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue();
-  assert(WideNumElements % Factor == 0 &&
-         "ElementCount of a wide load must be divisible by interleave factor");
-  auto *VTy =
-      VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor,
-                      WideVTy->isScalableTy());
   auto &DL = Load->getModule()->getDataLayout();
   Align Alignment = Load->getParamAlign(0).value_or(
-      DL.getABITypeAlign(WideVTy->getElementType()));
+      DL.getABITypeAlign(VTy->getElementType()));
   if (!isLegalInterleavedAccessType(
           VTy, Factor, Alignment,
           Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
     return false;
 
   IRBuilder<> Builder(Load);
-  Value *WideEVL = Load->getArgOperand(2);
+
+  Value *WideEVL = Load->getVectorLengthParam();
   // Conservatively check if EVL is a multiple of factor, otherwise some
   // (trailing) elements might be lost after the transformation.
   if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
@@ -24005,49 +24196,64 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad(
       Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
       XLenTy);
 
-  static const Intrinsic::ID IntrMaskIds[] = {
-      Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
-      Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
-      Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
-      Intrinsic::riscv_vlseg8_mask,
-  };
+  Value *Return = nullptr;
+  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
+    static const Intrinsic::ID FixedMaskedVlsegIntrIds[] = {
+        Intrinsic::riscv_seg2_load_mask, Intrinsic::riscv_seg3_load_mask,
+        Intrinsic::riscv_seg4_load_mask, Intrinsic::riscv_seg5_load_mask,
+        Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask,
+        Intrinsic::riscv_seg8_load_mask};
+
+    Return = Builder.CreateIntrinsic(FixedMaskedVlsegIntrIds[Factor - 2],
+                                     {FVTy, XLenTy},
+                                     {Load->getArgOperand(0), Mask, EVL});
+  } else {
+    static const Intrinsic::ID IntrMaskIds[] = {
+        Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
+        Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
+        Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
+        Intrinsic::riscv_vlseg8_mask,
+    };
 
-  unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-  unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-  Type *VecTupTy = TargetExtType::get(
-      Load->getContext(), "riscv.vector.tuple",
-      ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
-                              NumElts * SEW / 8),
-      Factor);
+    unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
+    unsigned NumElts = VTy->getElementCount().getKnownMinValue();
+    Type *VecTupTy = TargetExtType::get(
+        Load->getContext(), "riscv.vector.tuple",
+        ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
+                                NumElts * SEW / 8),
+        Factor);
 
-  Value *PoisonVal = PoisonValue::get(VecTupTy);
+    Value *PoisonVal = PoisonValue::get(VecTupTy);
 
-  Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
-      Load->getModule(), IntrMaskIds[Factor - 2],
-      {VecTupTy, Mask->getType(), EVL->getType()});
+    Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
+        Load->getModule(), IntrMaskIds[Factor - 2],
+        {VecTupTy, Mask->getType(), EVL->getType()});
 
-  Value *Operands[] = {PoisonVal,
-                       Load->getArgOperand(0),
-                       Mask,
-                       EVL,
-                       ConstantInt::get(XLenTy, RISCVVType::TAIL_AGNOSTIC |
-                                                    RISCVVType::MASK_AGNOSTIC),
-                       ConstantInt::get(XLenTy, Log2_64(SEW))};
+    Value *Operands[] = {
+        PoisonVal,
+        Load->getArgOperand(0),
+        Mask,
+        EVL,
+        ConstantInt::get(XLenTy,
+                         RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
+        ConstantInt::get(XLenTy, Log2_64(SEW))};
 
-  CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
+    CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
 
-  SmallVector<Type *, 8> AggrTypes{Factor, VTy};
-  Value *Return =
-      PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
-  Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
-      Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
-  for (unsigned i = 0; i < Factor; ++i) {
-    Value *VecExtract =
-        Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
-    Return = Builder.CreateInsertValue(Return, VecExtract, i);
+    SmallVector<Type *, 8> AggrTypes{Factor, VTy};
+    Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
+    Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
+        Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
+    for (unsigned i = 0; i < Factor; ++i) {
+      Value *VecExtract =
+          Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
+      Return = Builder.CreateInsertValue(Return, VecExtract, i);
+    }
   }
 
   for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
+    if (!DIO)
+      continue;
     // We have to create a brand new ExtractValue to replace each
     // of these old ExtractValue instructions.
     Value *NewEV =
@@ -24078,7 +24284,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad(
 ///                               <vscale x 32 x i8> %load2, ptr %ptr,
 ///                               %mask,
 ///                               i64 %rvl)
-bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore(
+bool RISCVTargetLowering::lowerInterleavedVPStore(
     VPIntrinsic *Store, Value *Mask,
     ArrayRef<Value *> InterleaveOperands) const {
   assert(Mask && "Expect a valid mask");
@@ -24087,8 +24293,7 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore(
 
   const unsigned Factor = InterleaveOperands.size();
 
-  auto *VTy = dyn_cast<ScalableVectorType>(InterleaveOperands[0]->getType());
-  // TODO: Support fixed vectors.
+  auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
   if (!VTy)
     return false;
 
@@ -24112,6 +24317,20 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore(
       Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
       XLenTy);
 
+  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
+    static const Intrinsic::ID FixedMaskedVssegIntrIds[] = {
+        Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
+        Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
+        Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
+        Intrinsic::riscv_seg8_store_mask};
+
+    SmallVector<Value *, 8> Operands(InterleaveOperands);
+    Operands.append({Store->getArgOperand(1), Mask, EVL});
+    Builder.CreateIntrinsic(FixedMaskedVssegIntrIds[Factor - 2], {FVTy, XLenTy},
+                            Operands);
+    return true;
+  }
+
   static const Intrinsic::ID IntrMaskIds[] = {
       Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
       Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ba24a0c..65d433e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -416,7 +416,12 @@ enum NodeType : unsigned {
   RI_VUNZIP2A_VL,
   RI_VUNZIP2B_VL,
 
-  LAST_VL_VECTOR_OP = RI_VUNZIP2B_VL,
+  // zvqdot instructions with additional passthru, mask and VL operands
+  VQDOT_VL,
+  VQDOTU_VL,
+  VQDOTSU_VL,
+
+  LAST_VL_VECTOR_OP = VQDOTSU_VL,
 
   // XRivosVisni
   // VEXTRACT matches the semantics of ri.vextract.x.v. The result is always
@@ -931,13 +936,11 @@ public:
   bool lowerInterleaveIntrinsicToStore(
       StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
 
-  bool lowerDeinterleavedIntrinsicToVPLoad(
-      VPIntrinsic *Load, Value *Mask,
-      ArrayRef<Value *> DeinterleaveRes) const override;
+  bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
+                              ArrayRef<Value *> DeinterleaveRes) const override;
 
-  bool lowerInterleavedIntrinsicToVPStore(
-      VPIntrinsic *Store, Value *Mask,
-      ArrayRef<Value *> InterleaveOps) const override;
+  bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
+                               ArrayRef<Value *> InterleaveOps) const override;
 
   bool supportKCFIBundles() const override { return true; }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td
index 205fffd..6018958 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td
@@ -26,3 +26,34 @@ let Predicates = [HasStdExtZvqdotq] in {
   def VQDOTSU_VX : VALUVX<0b101010, OPMVX, "vqdotsu.vx">;
   def VQDOTUS_VX : VALUVX<0b101110, OPMVX, "vqdotus.vx">;
 } // Predicates = [HasStdExtZvqdotq]
+
+
+def riscv_vqdot_vl : SDNode<"RISCVISD::VQDOT_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_vqdotu_vl : SDNode<"RISCVISD::VQDOTU_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_vqdotsu_vl : SDNode<"RISCVISD::VQDOTSU_VL", SDT_RISCVIntBinOp_VL>;
+
+multiclass VPseudoVQDOT_VV_VX {
+  foreach m = MxSet<32>.m in {
+    defm "" : VPseudoBinaryV_VV<m>,
+            SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", m.MX,
+                        forcePassthruRead=true>;
+    defm "" : VPseudoBinaryV_VX<m>,
+            SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", m.MX,
+                        forcePassthruRead=true>;
+  }
+}
+
+// TODO: Add pseudo and patterns for vqdotus.vx
+// TODO: Add isCommutable for VQDOT and VQDOTU
+let Predicates = [HasStdExtZvqdotq], mayLoad = 0, mayStore = 0,
+    hasSideEffects = 0 in {
+  defm PseudoVQDOT : VPseudoVQDOT_VV_VX;
+  defm PseudoVQDOTU : VPseudoVQDOT_VV_VX;
+  defm PseudoVQDOTSU : VPseudoVQDOT_VV_VX;
+}
+
+defvar AllE32Vectors = [VI32MF2, VI32M1, VI32M2, VI32M4, VI32M8];
+defm : VPatBinaryVL_VV_VX<riscv_vqdot_vl, "PseudoVQDOT", AllE32Vectors>;
+defm : VPatBinaryVL_VV_VX<riscv_vqdotu_vl, "PseudoVQDOTU", AllE32Vectors>;
+defm : VPatBinaryVL_VV_VX<riscv_vqdotsu_vl, "PseudoVQDOTSU", AllE32Vectors>;
+
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 315b504..d11ce46 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -149,6 +149,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   initializeRISCVLoadStoreOptPass(*PR);
   initializeRISCVExpandAtomicPseudoPass(*PR);
   initializeRISCVRedundantCopyEliminationPass(*PR);
+  initializeRISCVAsmPrinterPass(*PR);
 }
 
 static StringRef computeDataLayout(const Triple &TT,
diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h
index 51728d1..1688fa3 100644
--- a/llvm/lib/Target/SPIRV/SPIRV.h
+++ b/llvm/lib/Target/SPIRV/SPIRV.h
@@ -36,6 +36,7 @@ createSPIRVInstructionSelector(const SPIRVTargetMachine &TM,
                                const RegisterBankInfo &RBI);
 
 void initializeSPIRVModuleAnalysisPass(PassRegistry &);
+void initializeSPIRVAsmPrinterPass(PassRegistry &);
 void initializeSPIRVConvergenceRegionAnalysisWrapperPassPass(PassRegistry &);
 void initializeSPIRVPreLegalizerPass(PassRegistry &);
 void initializeSPIRVPreLegalizerCombinerPass(PassRegistry &);
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index f17c8a8..6a75e73 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -50,7 +50,8 @@ class SPIRVAsmPrinter : public AsmPrinter {
 public:
   explicit SPIRVAsmPrinter(TargetMachine &TM,
                            std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), ST(nullptr), TII(nullptr) {}
+      : AsmPrinter(TM, std::move(Streamer), ID), ST(nullptr), TII(nullptr) {}
+  static char ID;
   bool ModuleSectionsEmitted;
   const SPIRVSubtarget *ST;
   const SPIRVInstrInfo *TII;
@@ -635,6 +636,11 @@ bool SPIRVAsmPrinter::doInitialization(Module &M) {
   return AsmPrinter::doInitialization(M);
 }
 
+char SPIRVAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(SPIRVAsmPrinter, "spirv-asm-printer", "SPIRV Assembly Printer",
+                false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVAsmPrinter() {
   RegisterAsmPrinter<SPIRVAsmPrinter> X(getTheSPIRV32Target());
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 22fc1ca..4325023 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -1124,8 +1124,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementType(
       continue;
     Value *OpTyVal = getNormalizedPoisonValue(KnownElemTy);
     Type *OpTy = Op->getType();
-    if (Op->hasUseList() &&
-        (!Ty || AskTy || isUntypedPointerTy(Ty) || isTodoType(Op))) {
+    if (!Ty || AskTy || isUntypedPointerTy(Ty) || isTodoType(Op)) {
       Type *PrevElemTy = GR->findDeducedElementType(Op);
       GR->addDeducedElementType(Op, normalizeType(KnownElemTy));
       // check if KnownElemTy is complete
@@ -1475,36 +1474,34 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
   // Do not emit new spv_ptrcast if equivalent one already exists or when
   // spv_assign_ptr_type already targets this pointer with the same element
   // type.
-  if (Pointer->hasUseList()) {
-    for (auto User : Pointer->users()) {
-      auto *II = dyn_cast<IntrinsicInst>(User);
-      if (!II ||
-          (II->getIntrinsicID() != Intrinsic::spv_assign_ptr_type &&
-           II->getIntrinsicID() != Intrinsic::spv_ptrcast) ||
-          II->getOperand(0) != Pointer)
-        continue;
+  for (auto User : Pointer->users()) {
+    auto *II = dyn_cast<IntrinsicInst>(User);
+    if (!II ||
+        (II->getIntrinsicID() != Intrinsic::spv_assign_ptr_type &&
+         II->getIntrinsicID() != Intrinsic::spv_ptrcast) ||
+        II->getOperand(0) != Pointer)
+      continue;
 
-      // There is some spv_ptrcast/spv_assign_ptr_type already targeting this
-      // pointer.
-      FirstPtrCastOrAssignPtrType = false;
-      if (II->getOperand(1) != VMD ||
-          dyn_cast<ConstantInt>(II->getOperand(2))->getSExtValue() !=
-              AddressSpace)
-        continue;
+    // There is some spv_ptrcast/spv_assign_ptr_type already targeting this
+    // pointer.
+    FirstPtrCastOrAssignPtrType = false;
+    if (II->getOperand(1) != VMD ||
+        dyn_cast<ConstantInt>(II->getOperand(2))->getSExtValue() !=
+            AddressSpace)
+      continue;
 
-      // The spv_ptrcast/spv_assign_ptr_type targeting this pointer is of the
-      // same element type and address space.
-      if (II->getIntrinsicID() != Intrinsic::spv_ptrcast)
-        return;
+    // The spv_ptrcast/spv_assign_ptr_type targeting this pointer is of the same
+    // element type and address space.
+    if (II->getIntrinsicID() != Intrinsic::spv_ptrcast)
+      return;
 
-      // This must be a spv_ptrcast, do not emit new if this one has the same BB
-      // as I. Otherwise, search for other spv_ptrcast/spv_assign_ptr_type.
-      if (II->getParent() != I->getParent())
-        continue;
+    // This must be a spv_ptrcast, do not emit new if this one has the same BB
+    // as I. Otherwise, search for other spv_ptrcast/spv_assign_ptr_type.
+    if (II->getParent() != I->getParent())
+      continue;
 
-      I->setOperand(OperandToReplace, II);
-      return;
-    }
+    I->setOperand(OperandToReplace, II);
+    return;
   }
 
   if (isa<Instruction>(Pointer) || isa<Argument>(Pointer)) {
@@ -2493,13 +2490,10 @@ bool SPIRVEmitIntrinsics::postprocessTypes(Module &M) {
         }
       }
     }
-
-    if (Op->hasUseList()) {
-      for (User *U : Op->users()) {
-        Instruction *Inst = dyn_cast<Instruction>(U);
-        if (Inst && !isa<IntrinsicInst>(Inst))
-          ToProcess[Inst].insert(Op);
-      }
+    for (User *U : Op->users()) {
+      Instruction *Inst = dyn_cast<Instruction>(U);
+      if (Inst && !isa<IntrinsicInst>(Inst))
+        ToProcess[Inst].insert(Op);
     }
   }
   if (TodoTypeSz == 0)
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 88b1e44..ad42c73 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -55,7 +55,6 @@ static unsigned typeToAddressSpace(const Type *Ty) {
   reportFatalInternalError("Unable to convert LLVM type to SPIRVType");
 }
 
-#ifndef NDEBUG
 static bool
 storageClassRequiresExplictLayout(SPIRV::StorageClass::StorageClass SC) {
   switch (SC) {
@@ -87,7 +86,6 @@ storageClassRequiresExplictLayout(SPIRV::StorageClass::StorageClass SC) {
   }
   llvm_unreachable("Unknown SPIRV::StorageClass enum");
 }
-#endif
 
 SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize)
     : PointerSize(PointerSize), Bound(0) {}
@@ -837,13 +835,31 @@ static std::string buildSpirvTypeName(const SPIRVType *Type,
   }
   case SPIRV::OpTypeStruct: {
     std::string TypeName = "{";
-    for (uint32_t I = 2; I < Type->getNumOperands(); ++I) {
+    for (uint32_t I = 1; I < Type->getNumOperands(); ++I) {
       SPIRVType *MemberType =
           GR.getSPIRVTypeForVReg(Type->getOperand(I).getReg());
-      TypeName = '_' + buildSpirvTypeName(MemberType, MIRBuilder, GR);
+      TypeName += '_' + buildSpirvTypeName(MemberType, MIRBuilder, GR);
     }
     return TypeName + "}";
   }
+  case SPIRV::OpTypeVector: {
+    MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+    Register ElementTypeReg = Type->getOperand(1).getReg();
+    auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg);
+    uint32_t VectorSize = GR.getScalarOrVectorComponentCount(Type);
+    return (buildSpirvTypeName(ElementType, MIRBuilder, GR) + Twine("[") +
+            Twine(VectorSize) + Twine("]"))
+        .str();
+  }
+  case SPIRV::OpTypeRuntimeArray: {
+    MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+    Register ElementTypeReg = Type->getOperand(1).getReg();
+    auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg);
+    uint32_t ArraySize = 0;
+    return (buildSpirvTypeName(ElementType, MIRBuilder, GR) + Twine("[") +
+            Twine(ArraySize) + Twine("]"))
+        .str();
+  }
   default:
     llvm_unreachable("Trying to the the name of an unknown type.");
   }
@@ -885,30 +901,41 @@ Register SPIRVGlobalRegistry::getOrCreateGlobalVariableWithBinding(
   return VarReg;
 }
 
+// TODO: Double check the calls to getOpTypeArray to make sure that `ElemType`
+// is explicitly laid out when required.
 SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems,
                                                SPIRVType *ElemType,
                                                MachineIRBuilder &MIRBuilder,
+                                               bool ExplicitLayoutRequired,
                                                bool EmitIR) {
   assert((ElemType->getOpcode() != SPIRV::OpTypeVoid) &&
          "Invalid array element type");
   SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, MIRBuilder);
-
+  SPIRVType *ArrayType = nullptr;
   if (NumElems != 0) {
     Register NumElementsVReg =
         buildConstantInt(NumElems, MIRBuilder, SpvTypeInt32, EmitIR);
-    return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
+    ArrayType = createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
       return MIRBuilder.buildInstr(SPIRV::OpTypeArray)
           .addDef(createTypeVReg(MIRBuilder))
           .addUse(getSPIRVTypeID(ElemType))
           .addUse(NumElementsVReg);
     });
+  } else {
+    ArrayType = createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
+      return MIRBuilder.buildInstr(SPIRV::OpTypeRuntimeArray)
+          .addDef(createTypeVReg(MIRBuilder))
+          .addUse(getSPIRVTypeID(ElemType));
+    });
   }
 
-  return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
-    return MIRBuilder.buildInstr(SPIRV::OpTypeRuntimeArray)
-        .addDef(createTypeVReg(MIRBuilder))
-        .addUse(getSPIRVTypeID(ElemType));
-  });
+  if (ExplicitLayoutRequired && !isResourceType(ElemType)) {
+    Type *ET = const_cast<Type *>(getTypeForSPIRVType(ElemType));
+    addArrayStrideDecorations(ArrayType->defs().begin()->getReg(), ET,
+                              MIRBuilder);
+  }
+
+  return ArrayType;
 }
 
 SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty,
@@ -926,7 +953,8 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty,
 
 SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(
     const StructType *Ty, MachineIRBuilder &MIRBuilder,
-    SPIRV::AccessQualifier::AccessQualifier AccQual, bool EmitIR) {
+    SPIRV::AccessQualifier::AccessQualifier AccQual,
+    bool ExplicitLayoutRequired, bool EmitIR) {
   SmallVector<Register, 4> FieldTypes;
   constexpr unsigned MaxWordCount = UINT16_MAX;
   const size_t NumElements = Ty->getNumElements();
@@ -940,8 +968,8 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(
   }
 
   for (const auto &Elem : Ty->elements()) {
-    SPIRVType *ElemTy =
-        findSPIRVType(toTypedPointer(Elem), MIRBuilder, AccQual, EmitIR);
+    SPIRVType *ElemTy = findSPIRVType(toTypedPointer(Elem), MIRBuilder, AccQual,
+                                      ExplicitLayoutRequired, EmitIR);
     assert(ElemTy && ElemTy->getOpcode() != SPIRV::OpTypeVoid &&
            "Invalid struct element type");
     FieldTypes.push_back(getSPIRVTypeID(ElemTy));
@@ -952,18 +980,27 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(
   if (Ty->isPacked())
     buildOpDecorate(ResVReg, MIRBuilder, SPIRV::Decoration::CPacked, {});
 
-  return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
-    auto MIBStruct = MIRBuilder.buildInstr(SPIRV::OpTypeStruct).addDef(ResVReg);
-    for (size_t I = 0; I < SPIRVStructNumElements; ++I)
-      MIBStruct.addUse(FieldTypes[I]);
-    for (size_t I = SPIRVStructNumElements; I < NumElements;
-         I += MaxNumElements) {
-      auto MIBCont = MIRBuilder.buildInstr(SPIRV::OpTypeStructContinuedINTEL);
-      for (size_t J = I; J < std::min(I + MaxNumElements, NumElements); ++J)
-        MIBCont.addUse(FieldTypes[I]);
-    }
-    return MIBStruct;
-  });
+  SPIRVType *SPVType =
+      createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
+        auto MIBStruct =
+            MIRBuilder.buildInstr(SPIRV::OpTypeStruct).addDef(ResVReg);
+        for (size_t I = 0; I < SPIRVStructNumElements; ++I)
+          MIBStruct.addUse(FieldTypes[I]);
+        for (size_t I = SPIRVStructNumElements; I < NumElements;
+             I += MaxNumElements) {
+          auto MIBCont =
+              MIRBuilder.buildInstr(SPIRV::OpTypeStructContinuedINTEL);
+          for (size_t J = I; J < std::min(I + MaxNumElements, NumElements); ++J)
+            MIBCont.addUse(FieldTypes[I]);
+        }
+        return MIBStruct;
+      });
+
+  if (ExplicitLayoutRequired)
+    addStructOffsetDecorations(SPVType->defs().begin()->getReg(),
+                               const_cast<StructType *>(Ty), MIRBuilder);
+
+  return SPVType;
 }
 
 SPIRVType *SPIRVGlobalRegistry::getOrCreateSpecialType(
@@ -1013,22 +1050,26 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeFunctionWithArgs(
     const Type *Ty, SPIRVType *RetType,
     const SmallVectorImpl<SPIRVType *> &ArgTypes,
     MachineIRBuilder &MIRBuilder) {
-  if (const MachineInstr *MI = findMI(Ty, &MIRBuilder.getMF()))
+  if (const MachineInstr *MI = findMI(Ty, false, &MIRBuilder.getMF()))
     return MI;
   const MachineInstr *NewMI = getOpTypeFunction(RetType, ArgTypes, MIRBuilder);
-  add(Ty, NewMI);
+  add(Ty, false, NewMI);
   return finishCreatingSPIRVType(Ty, NewMI);
 }
 
 SPIRVType *SPIRVGlobalRegistry::findSPIRVType(
     const Type *Ty, MachineIRBuilder &MIRBuilder,
-    SPIRV::AccessQualifier::AccessQualifier AccQual, bool EmitIR) {
+    SPIRV::AccessQualifier::AccessQualifier AccQual,
+    bool ExplicitLayoutRequired, bool EmitIR) {
   Ty = adjustIntTypeByWidth(Ty);
-  if (const MachineInstr *MI = findMI(Ty, &MIRBuilder.getMF()))
+  // TODO: findMI needs to know if a layout is required.
+  if (const MachineInstr *MI =
+          findMI(Ty, ExplicitLayoutRequired, &MIRBuilder.getMF()))
     return MI;
   if (auto It = ForwardPointerTypes.find(Ty); It != ForwardPointerTypes.end())
     return It->second;
-  return restOfCreateSPIRVType(Ty, MIRBuilder, AccQual, EmitIR);
+  return restOfCreateSPIRVType(Ty, MIRBuilder, AccQual, ExplicitLayoutRequired,
+                               EmitIR);
 }
 
 Register SPIRVGlobalRegistry::getSPIRVTypeID(const SPIRVType *SpirvType) const {
@@ -1062,11 +1103,13 @@ const Type *SPIRVGlobalRegistry::adjustIntTypeByWidth(const Type *Ty) const {
 
 SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
     const Type *Ty, MachineIRBuilder &MIRBuilder,
-    SPIRV::AccessQualifier::AccessQualifier AccQual, bool EmitIR) {
+    SPIRV::AccessQualifier::AccessQualifier AccQual,
+    bool ExplicitLayoutRequired, bool EmitIR) {
   if (isSpecialOpaqueType(Ty))
     return getOrCreateSpecialType(Ty, MIRBuilder, AccQual);
 
-  if (const MachineInstr *MI = findMI(Ty, &MIRBuilder.getMF()))
+  if (const MachineInstr *MI =
+          findMI(Ty, ExplicitLayoutRequired, &MIRBuilder.getMF()))
     return MI;
 
   if (auto IType = dyn_cast<IntegerType>(Ty)) {
@@ -1079,27 +1122,31 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
   if (Ty->isVoidTy())
     return getOpTypeVoid(MIRBuilder);
   if (Ty->isVectorTy()) {
-    SPIRVType *El = findSPIRVType(cast<FixedVectorType>(Ty)->getElementType(),
-                                  MIRBuilder, AccQual, EmitIR);
+    SPIRVType *El =
+        findSPIRVType(cast<FixedVectorType>(Ty)->getElementType(), MIRBuilder,
+                      AccQual, ExplicitLayoutRequired, EmitIR);
     return getOpTypeVector(cast<FixedVectorType>(Ty)->getNumElements(), El,
                            MIRBuilder);
   }
   if (Ty->isArrayTy()) {
-    SPIRVType *El =
-        findSPIRVType(Ty->getArrayElementType(), MIRBuilder, AccQual, EmitIR);
-    return getOpTypeArray(Ty->getArrayNumElements(), El, MIRBuilder, EmitIR);
+    SPIRVType *El = findSPIRVType(Ty->getArrayElementType(), MIRBuilder,
+                                  AccQual, ExplicitLayoutRequired, EmitIR);
+    return getOpTypeArray(Ty->getArrayNumElements(), El, MIRBuilder,
+                          ExplicitLayoutRequired, EmitIR);
   }
   if (auto SType = dyn_cast<StructType>(Ty)) {
     if (SType->isOpaque())
       return getOpTypeOpaque(SType, MIRBuilder);
-    return getOpTypeStruct(SType, MIRBuilder, AccQual, EmitIR);
+    return getOpTypeStruct(SType, MIRBuilder, AccQual, ExplicitLayoutRequired,
+                           EmitIR);
   }
   if (auto FType = dyn_cast<FunctionType>(Ty)) {
-    SPIRVType *RetTy =
-        findSPIRVType(FType->getReturnType(), MIRBuilder, AccQual, EmitIR);
+    SPIRVType *RetTy = findSPIRVType(FType->getReturnType(), MIRBuilder,
+                                     AccQual, ExplicitLayoutRequired, EmitIR);
     SmallVector<SPIRVType *, 4> ParamTypes;
     for (const auto &ParamTy : FType->params())
-      ParamTypes.push_back(findSPIRVType(ParamTy, MIRBuilder, AccQual, EmitIR));
+      ParamTypes.push_back(findSPIRVType(ParamTy, MIRBuilder, AccQual,
+                                         ExplicitLayoutRequired, EmitIR));
     return getOpTypeFunction(RetTy, ParamTypes, MIRBuilder);
   }
 
@@ -1114,44 +1161,50 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
   const SPIRVSubtarget *ST =
       static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
   auto SC = addressSpaceToStorageClass(AddrSpace, *ST);
-  // Null pointer means we have a loop in type definitions, make and
-  // return corresponding OpTypeForwardPointer.
-  if (SpvElementType == nullptr) {
-    auto [It, Inserted] = ForwardPointerTypes.try_emplace(Ty);
-    if (Inserted)
-      It->second = getOpTypeForwardPointer(SC, MIRBuilder);
-    return It->second;
+
+  Type *ElemTy = ::getPointeeType(Ty);
+  if (!ElemTy) {
+    ElemTy = Type::getInt8Ty(MIRBuilder.getContext());
   }
+
   // If we have forward pointer associated with this type, use its register
   // operand to create OpTypePointer.
   if (auto It = ForwardPointerTypes.find(Ty); It != ForwardPointerTypes.end()) {
     Register Reg = getSPIRVTypeID(It->second);
+    // TODO: what does getOpTypePointer do?
     return getOpTypePointer(SC, SpvElementType, MIRBuilder, Reg);
   }
 
-  return getOrCreateSPIRVPointerType(SpvElementType, MIRBuilder, SC);
+  return getOrCreateSPIRVPointerType(ElemTy, MIRBuilder, SC);
 }
 
 SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(
     const Type *Ty, MachineIRBuilder &MIRBuilder,
-    SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR) {
+    SPIRV::AccessQualifier::AccessQualifier AccessQual,
+    bool ExplicitLayoutRequired, bool EmitIR) {
+  // TODO: Could this create a problem if one requires an explicit layout, and
+  // the next time it does not?
   if (TypesInProcessing.count(Ty) && !isPointerTyOrWrapper(Ty))
     return nullptr;
   TypesInProcessing.insert(Ty);
-  SPIRVType *SpirvType = createSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR);
+  SPIRVType *SpirvType = createSPIRVType(Ty, MIRBuilder, AccessQual,
+                                         ExplicitLayoutRequired, EmitIR);
   TypesInProcessing.erase(Ty);
   VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType;
+
+  // TODO: We could end up with two SPIR-V types pointing to the same llvm type.
+  // Is that a problem?
   SPIRVToLLVMType[SpirvType] = unifyPtrType(Ty);
 
   if (SpirvType->getOpcode() == SPIRV::OpTypeForwardPointer ||
-      findMI(Ty, &MIRBuilder.getMF()) || isSpecialOpaqueType(Ty))
+      findMI(Ty, false, &MIRBuilder.getMF()) || isSpecialOpaqueType(Ty))
     return SpirvType;
 
   if (auto *ExtTy = dyn_cast<TargetExtType>(Ty);
       ExtTy && isTypedPointerWrapper(ExtTy))
     add(ExtTy->getTypeParameter(0), ExtTy->getIntParameter(0), SpirvType);
   else if (!isPointerTy(Ty))
-    add(Ty, SpirvType);
+    add(Ty, ExplicitLayoutRequired, SpirvType);
   else if (isTypedPointerTy(Ty))
     add(cast<TypedPointerType>(Ty)->getElementType(),
         getPointerAddressSpace(Ty), SpirvType);
@@ -1183,14 +1236,15 @@ SPIRVType *SPIRVGlobalRegistry::getResultType(Register VReg,
 
 SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(
     const Type *Ty, MachineIRBuilder &MIRBuilder,
-    SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR) {
+    SPIRV::AccessQualifier::AccessQualifier AccessQual,
+    bool ExplicitLayoutRequired, bool EmitIR) {
   const MachineFunction *MF = &MIRBuilder.getMF();
   Register Reg;
   if (auto *ExtTy = dyn_cast<TargetExtType>(Ty);
       ExtTy && isTypedPointerWrapper(ExtTy))
     Reg = find(ExtTy->getTypeParameter(0), ExtTy->getIntParameter(0), MF);
   else if (!isPointerTy(Ty))
-    Reg = find(Ty = adjustIntTypeByWidth(Ty), MF);
+    Reg = find(Ty = adjustIntTypeByWidth(Ty), ExplicitLayoutRequired, MF);
   else if (isTypedPointerTy(Ty))
     Reg = find(cast<TypedPointerType>(Ty)->getElementType(),
                getPointerAddressSpace(Ty), MF);
@@ -1201,15 +1255,20 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(
     return getSPIRVTypeForVReg(Reg);
 
   TypesInProcessing.clear();
-  SPIRVType *STy = restOfCreateSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR);
+  SPIRVType *STy = restOfCreateSPIRVType(Ty, MIRBuilder, AccessQual,
+                                         ExplicitLayoutRequired, EmitIR);
   // Create normal pointer types for the corresponding OpTypeForwardPointers.
   for (auto &CU : ForwardPointerTypes) {
+    // Pointer type themselves do not require an explicit layout. The types
+    // they pointer to might, but that is taken care of when creating the type.
+    bool PtrNeedsLayout = false;
     const Type *Ty2 = CU.first;
     SPIRVType *STy2 = CU.second;
-    if ((Reg = find(Ty2, MF)).isValid())
+    if ((Reg = find(Ty2, PtrNeedsLayout, MF)).isValid())
       STy2 = getSPIRVTypeForVReg(Reg);
     else
-      STy2 = restOfCreateSPIRVType(Ty2, MIRBuilder, AccessQual, EmitIR);
+      STy2 = restOfCreateSPIRVType(Ty2, MIRBuilder, AccessQual, PtrNeedsLayout,
+                                   EmitIR);
     if (Ty == Ty2)
       STy = STy2;
   }
@@ -1238,6 +1297,19 @@ bool SPIRVGlobalRegistry::isScalarOrVectorOfType(Register VReg,
   return false;
 }
 
+bool SPIRVGlobalRegistry::isResourceType(SPIRVType *Type) const {
+  switch (Type->getOpcode()) {
+  case SPIRV::OpTypeImage:
+  case SPIRV::OpTypeSampler:
+  case SPIRV::OpTypeSampledImage:
+    return true;
+  case SPIRV::OpTypeStruct:
+    return hasBlockDecoration(Type);
+  default:
+    return false;
+  }
+  return false;
+}
 unsigned
 SPIRVGlobalRegistry::getScalarOrVectorComponentCount(Register VReg) const {
   return getScalarOrVectorComponentCount(getSPIRVTypeForVReg(VReg));
@@ -1362,16 +1434,16 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateVulkanBufferType(
   if (const MachineInstr *MI = findMI(Key, &MIRBuilder.getMF()))
     return MI;
 
-  // TODO(134119): The SPIRVType for `ElemType` will not have an explicit
-  // layout. This generates invalid SPIR-V.
+  bool ExplicitLayoutRequired = storageClassRequiresExplictLayout(SC);
+  // We need to get the SPIR-V type for the element here, so we can add the
+  // decoration to it.
   auto *T = StructType::create(ElemType);
   auto *BlockType =
-      getOrCreateSPIRVType(T, MIRBuilder, SPIRV::AccessQualifier::None, EmitIr);
+      getOrCreateSPIRVType(T, MIRBuilder, SPIRV::AccessQualifier::None,
+                           ExplicitLayoutRequired, EmitIr);
 
   buildOpDecorate(BlockType->defs().begin()->getReg(), MIRBuilder,
                   SPIRV::Decoration::Block, {});
-  buildOpMemberDecorate(BlockType->defs().begin()->getReg(), MIRBuilder,
-                        SPIRV::Decoration::Offset, 0, {0});
 
   if (!IsWritable) {
     buildOpMemberDecorate(BlockType->defs().begin()->getReg(), MIRBuilder,
@@ -1480,7 +1552,8 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeCoopMatr(
     MachineIRBuilder &MIRBuilder, const TargetExtType *ExtensionType,
     const SPIRVType *ElemType, uint32_t Scope, uint32_t Rows, uint32_t Columns,
     uint32_t Use, bool EmitIR) {
-  if (const MachineInstr *MI = findMI(ExtensionType, &MIRBuilder.getMF()))
+  if (const MachineInstr *MI =
+          findMI(ExtensionType, false, &MIRBuilder.getMF()))
     return MI;
   const MachineInstr *NewMI =
       createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
@@ -1493,26 +1566,26 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeCoopMatr(
             .addUse(buildConstantInt(Columns, MIRBuilder, SpvTypeInt32, EmitIR))
             .addUse(buildConstantInt(Use, MIRBuilder, SpvTypeInt32, EmitIR));
       });
-  add(ExtensionType, NewMI);
+  add(ExtensionType, false, NewMI);
   return NewMI;
 }
 
 SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeByOpcode(
     const Type *Ty, MachineIRBuilder &MIRBuilder, unsigned Opcode) {
-  if (const MachineInstr *MI = findMI(Ty, &MIRBuilder.getMF()))
+  if (const MachineInstr *MI = findMI(Ty, false, &MIRBuilder.getMF()))
     return MI;
   const MachineInstr *NewMI =
       createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
         return MIRBuilder.buildInstr(Opcode).addDef(createTypeVReg(MIRBuilder));
       });
-  add(Ty, NewMI);
+  add(Ty, false, NewMI);
   return NewMI;
 }
 
 SPIRVType *SPIRVGlobalRegistry::getOrCreateUnknownType(
     const Type *Ty, MachineIRBuilder &MIRBuilder, unsigned Opcode,
     const ArrayRef<MCOperand> Operands) {
-  if (const MachineInstr *MI = findMI(Ty, &MIRBuilder.getMF()))
+  if (const MachineInstr *MI = findMI(Ty, false, &MIRBuilder.getMF()))
     return MI;
   Register ResVReg = createTypeVReg(MIRBuilder);
   const MachineInstr *NewMI =
@@ -1529,7 +1602,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateUnknownType(
         }
         return MIB;
       });
-  add(Ty, NewMI);
+  add(Ty, false, NewMI);
   return NewMI;
 }
 
@@ -1545,7 +1618,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName(
   if (hasBuiltinTypePrefix(TypeStr))
     return getOrCreateSPIRVType(SPIRV::parseBuiltinTypeNameToTargetExtType(
                                     TypeStr.str(), MIRBuilder.getContext()),
-                                MIRBuilder, AQ, true);
+                                MIRBuilder, AQ, false, true);
 
   // Parse type name in either "typeN" or "type vector[N]" format, where
   // N is the number of elements of the vector.
@@ -1556,7 +1629,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName(
     // Unable to recognize SPIRV type name
     return nullptr;
 
-  auto SpirvTy = getOrCreateSPIRVType(Ty, MIRBuilder, AQ, true);
+  auto SpirvTy = getOrCreateSPIRVType(Ty, MIRBuilder, AQ, false, true);
 
   // Handle "type*" or  "type* vector[N]".
   if (TypeStr.starts_with("*")) {
@@ -1585,7 +1658,7 @@ SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(unsigned BitWidth,
                                                  MachineIRBuilder &MIRBuilder) {
   return getOrCreateSPIRVType(
       IntegerType::get(MIRBuilder.getMF().getFunction().getContext(), BitWidth),
-      MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
+      MIRBuilder, SPIRV::AccessQualifier::ReadWrite, false, true);
 }
 
 SPIRVType *SPIRVGlobalRegistry::finishCreatingSPIRVType(const Type *LLVMTy,
@@ -1601,7 +1674,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(unsigned BitWidth,
                                                      const SPIRVInstrInfo &TII,
                                                      unsigned SPIRVOPcode,
                                                      Type *Ty) {
-  if (const MachineInstr *MI = findMI(Ty, CurMF))
+  if (const MachineInstr *MI = findMI(Ty, false, CurMF))
     return MI;
   MachineBasicBlock &DepMBB = I.getMF()->front();
   MachineIRBuilder MIRBuilder(DepMBB, DepMBB.getFirstNonPHI());
@@ -1613,7 +1686,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(unsigned BitWidth,
             .addImm(BitWidth)
             .addImm(0);
       });
-  add(Ty, NewMI);
+  add(Ty, false, NewMI);
   return finishCreatingSPIRVType(Ty, NewMI);
 }
 
@@ -1654,14 +1727,14 @@ SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder,
                                               bool EmitIR) {
   return getOrCreateSPIRVType(
       IntegerType::get(MIRBuilder.getMF().getFunction().getContext(), 1),
-      MIRBuilder, SPIRV::AccessQualifier::ReadWrite, EmitIR);
+      MIRBuilder, SPIRV::AccessQualifier::ReadWrite, false, EmitIR);
 }
 
 SPIRVType *
 SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineInstr &I,
                                               const SPIRVInstrInfo &TII) {
   Type *Ty = IntegerType::get(CurMF->getFunction().getContext(), 1);
-  if (const MachineInstr *MI = findMI(Ty, CurMF))
+  if (const MachineInstr *MI = findMI(Ty, false, CurMF))
     return MI;
   MachineBasicBlock &DepMBB = I.getMF()->front();
   MachineIRBuilder MIRBuilder(DepMBB, DepMBB.getFirstNonPHI());
@@ -1671,7 +1744,7 @@ SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineInstr &I,
                        MIRBuilder.getDL(), TII.get(SPIRV::OpTypeBool))
             .addDef(createTypeVReg(CurMF->getRegInfo()));
       });
-  add(Ty, NewMI);
+  add(Ty, false, NewMI);
   return finishCreatingSPIRVType(Ty, NewMI);
 }
 
@@ -1681,7 +1754,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
   return getOrCreateSPIRVType(
       FixedVectorType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)),
                            NumElements),
-      MIRBuilder, SPIRV::AccessQualifier::ReadWrite, EmitIR);
+      MIRBuilder, SPIRV::AccessQualifier::ReadWrite, false, EmitIR);
 }
 
 SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
@@ -1689,7 +1762,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
     const SPIRVInstrInfo &TII) {
   Type *Ty = FixedVectorType::get(
       const_cast<Type *>(getTypeForSPIRVType(BaseType)), NumElements);
-  if (const MachineInstr *MI = findMI(Ty, CurMF))
+  if (const MachineInstr *MI = findMI(Ty, false, CurMF))
     return MI;
   MachineInstr *DepMI = const_cast<MachineInstr *>(BaseType);
   MachineIRBuilder MIRBuilder(*DepMI->getParent(), DepMI->getIterator());
@@ -1701,30 +1774,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
             .addUse(getSPIRVTypeID(BaseType))
             .addImm(NumElements);
       });
-  add(Ty, NewMI);
-  return finishCreatingSPIRVType(Ty, NewMI);
-}
-
-SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVArrayType(
-    SPIRVType *BaseType, unsigned NumElements, MachineInstr &I,
-    const SPIRVInstrInfo &TII) {
-  Type *Ty = ArrayType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)),
-                            NumElements);
-  if (const MachineInstr *MI = findMI(Ty, CurMF))
-    return MI;
-  SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, I, TII);
-  Register Len = getOrCreateConstInt(NumElements, I, SpvTypeInt32, TII);
-  MachineBasicBlock &DepMBB = I.getMF()->front();
-  MachineIRBuilder MIRBuilder(DepMBB, getInsertPtValidEnd(&DepMBB));
-  const MachineInstr *NewMI =
-      createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
-        return BuildMI(MIRBuilder.getMBB(), *MIRBuilder.getInsertPt(),
-                       MIRBuilder.getDL(), TII.get(SPIRV::OpTypeArray))
-            .addDef(createTypeVReg(CurMF->getRegInfo()))
-            .addUse(getSPIRVTypeID(BaseType))
-            .addUse(Len);
-      });
-  add(Ty, NewMI);
+  add(Ty, false, NewMI);
   return finishCreatingSPIRVType(Ty, NewMI);
 }
 
@@ -1738,8 +1788,11 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVPointerType(
 SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVPointerType(
     const Type *BaseType, MachineIRBuilder &MIRBuilder,
     SPIRV::StorageClass::StorageClass SC) {
+  // TODO: Need to check if EmitIr should always be true.
   SPIRVType *SpirvBaseType = getOrCreateSPIRVType(
-      BaseType, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
+      BaseType, MIRBuilder, SPIRV::AccessQualifier::ReadWrite,
+      storageClassRequiresExplictLayout(SC), true);
+  assert(SpirvBaseType);
   return getOrCreateSPIRVPointerTypeInternal(SpirvBaseType, MIRBuilder, SC);
 }
 
@@ -2006,3 +2059,33 @@ void SPIRVGlobalRegistry::updateAssignType(CallInst *AssignCI, Value *Arg,
   addDeducedElementType(AssignCI, ElemTy);
   addDeducedElementType(Arg, ElemTy);
 }
+
+void SPIRVGlobalRegistry::addStructOffsetDecorations(
+    Register Reg, StructType *Ty, MachineIRBuilder &MIRBuilder) {
+  DataLayout DL;
+  ArrayRef<TypeSize> Offsets = DL.getStructLayout(Ty)->getMemberOffsets();
+  for (uint32_t I = 0; I < Ty->getNumElements(); ++I) {
+    buildOpMemberDecorate(Reg, MIRBuilder, SPIRV::Decoration::Offset, I,
+                          {static_cast<uint32_t>(Offsets[I])});
+  }
+}
+
+void SPIRVGlobalRegistry::addArrayStrideDecorations(
+    Register Reg, Type *ElementType, MachineIRBuilder &MIRBuilder) {
+  uint32_t SizeInBytes = DataLayout().getTypeSizeInBits(ElementType) / 8;
+  buildOpDecorate(Reg, MIRBuilder, SPIRV::Decoration::ArrayStride,
+                  {SizeInBytes});
+}
+
+bool SPIRVGlobalRegistry::hasBlockDecoration(SPIRVType *Type) const {
+  Register Def = getSPIRVTypeID(Type);
+  for (const MachineInstr &Use :
+       Type->getMF()->getRegInfo().use_instructions(Def)) {
+    if (Use.getOpcode() != SPIRV::OpDecorate)
+      continue;
+
+    if (Use.getOperand(1).getImm() == SPIRV::Decoration::Block)
+      return true;
+  }
+  return false;
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index b05896f..7338e80 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -90,14 +90,14 @@ class SPIRVGlobalRegistry : public SPIRVIRMapping {
   // Add a new OpTypeXXX instruction without checking for duplicates.
   SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
                              SPIRV::AccessQualifier::AccessQualifier AQ,
-                             bool EmitIR);
+                             bool ExplicitLayoutRequired, bool EmitIR);
   SPIRVType *findSPIRVType(const Type *Ty, MachineIRBuilder &MIRBuilder,
                            SPIRV::AccessQualifier::AccessQualifier accessQual,
-                           bool EmitIR);
+                           bool ExplicitLayoutRequired, bool EmitIR);
   SPIRVType *
   restOfCreateSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
                         SPIRV::AccessQualifier::AccessQualifier AccessQual,
-                        bool EmitIR);
+                        bool ExplicitLayoutRequired, bool EmitIR);
 
   // Internal function creating the an OpType at the correct position in the
   // function by tweaking the passed "MIRBuilder" insertion point and restoring
@@ -298,10 +298,19 @@ public:
   // EmitIR controls if we emit GMIR or SPV constants (e.g. for array sizes)
   // because this method may be called from InstructionSelector and we don't
   // want to emit extra IR instructions there.
+  SPIRVType *getOrCreateSPIRVType(const Type *Type, MachineInstr &I,
+                                  SPIRV::AccessQualifier::AccessQualifier AQ,
+                                  bool EmitIR) {
+    MachineIRBuilder MIRBuilder(I);
+    return getOrCreateSPIRVType(Type, MIRBuilder, AQ, EmitIR);
+  }
+
   SPIRVType *getOrCreateSPIRVType(const Type *Type,
                                   MachineIRBuilder &MIRBuilder,
                                   SPIRV::AccessQualifier::AccessQualifier AQ,
-                                  bool EmitIR);
+                                  bool EmitIR) {
+    return getOrCreateSPIRVType(Type, MIRBuilder, AQ, false, EmitIR);
+  }
 
   const Type *getTypeForSPIRVType(const SPIRVType *Ty) const {
     auto Res = SPIRVToLLVMType.find(Ty);
@@ -364,6 +373,10 @@ public:
   // opcode (e.g. OpTypeBool, or OpTypeVector %x 4, where %x is OpTypeBool).
   bool isScalarOrVectorOfType(Register VReg, unsigned TypeOpcode) const;
 
+  // Returns true if `Type` is a resource type. This could be an image type
+  // or a struct for a buffer decorated with the block decoration.
+  bool isResourceType(SPIRVType *Type) const;
+
   // Return number of elements in a vector if the argument is associated with
   // a vector type. Return 1 for a scalar type, and 0 for a missing type.
   unsigned getScalarOrVectorComponentCount(Register VReg) const;
@@ -414,6 +427,11 @@ private:
   const Type *adjustIntTypeByWidth(const Type *Ty) const;
   unsigned adjustOpTypeIntWidth(unsigned Width) const;
 
+  SPIRVType *getOrCreateSPIRVType(const Type *Type,
+                                  MachineIRBuilder &MIRBuilder,
+                                  SPIRV::AccessQualifier::AccessQualifier AQ,
+                                  bool ExplicitLayoutRequired, bool EmitIR);
+
   SPIRVType *getOpTypeInt(unsigned Width, MachineIRBuilder &MIRBuilder,
                           bool IsSigned = false);
 
@@ -425,14 +443,15 @@ private:
                              MachineIRBuilder &MIRBuilder);
 
   SPIRVType *getOpTypeArray(uint32_t NumElems, SPIRVType *ElemType,
-                            MachineIRBuilder &MIRBuilder, bool EmitIR);
+                            MachineIRBuilder &MIRBuilder,
+                            bool ExplicitLayoutRequired, bool EmitIR);
 
   SPIRVType *getOpTypeOpaque(const StructType *Ty,
                              MachineIRBuilder &MIRBuilder);
 
   SPIRVType *getOpTypeStruct(const StructType *Ty, MachineIRBuilder &MIRBuilder,
                              SPIRV::AccessQualifier::AccessQualifier AccQual,
-                             bool EmitIR);
+                             bool ExplicitLayoutRequired, bool EmitIR);
 
   SPIRVType *getOpTypePointer(SPIRV::StorageClass::StorageClass SC,
                               SPIRVType *ElemType, MachineIRBuilder &MIRBuilder,
@@ -475,6 +494,12 @@ private:
                                       MachineIRBuilder &MIRBuilder,
                                       SPIRV::StorageClass::StorageClass SC);
 
+  void addStructOffsetDecorations(Register Reg, StructType *Ty,
+                                  MachineIRBuilder &MIRBuilder);
+  void addArrayStrideDecorations(Register Reg, Type *ElementType,
+                                 MachineIRBuilder &MIRBuilder);
+  bool hasBlockDecoration(SPIRVType *Type) const;
+
 public:
   Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder,
                             SPIRVType *SpvType, bool EmitIR,
@@ -545,9 +570,6 @@ public:
   SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType,
                                         unsigned NumElements, MachineInstr &I,
                                         const SPIRVInstrInfo &TII);
-  SPIRVType *getOrCreateSPIRVArrayType(SPIRVType *BaseType,
-                                       unsigned NumElements, MachineInstr &I,
-                                       const SPIRVInstrInfo &TII);
 
   // Returns a pointer to a SPIR-V pointer type with the given base type and
   // storage class. The base type will be translated to a SPIR-V type, and the
diff --git a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h
index 9c9c099..a329fd5e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h
+++ b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h
@@ -66,6 +66,7 @@ enum SpecialTypeKind {
   STK_Value,
   STK_MachineInstr,
   STK_VkBuffer,
+  STK_ExplictLayoutType,
   STK_Last = -1
 };
 
@@ -150,6 +151,11 @@ inline IRHandle irhandle_vkbuffer(const Type *ElementType,
                          SpecialTypeKind::STK_VkBuffer);
 }
 
+inline IRHandle irhandle_explict_layout_type(const Type *Ty) {
+  const Type *WrpTy = unifyPtrType(Ty);
+  return irhandle_ptr(WrpTy, Ty->getTypeID(), STK_ExplictLayoutType);
+}
+
 inline IRHandle handle(const Type *Ty) {
   const Type *WrpTy = unifyPtrType(Ty);
   return irhandle_ptr(WrpTy, Ty->getTypeID(), STK_Type);
@@ -163,6 +169,10 @@ inline IRHandle handle(const MachineInstr *KeyMI) {
   return irhandle_ptr(KeyMI, SPIRV::to_hash(KeyMI), STK_MachineInstr);
 }
 
+inline bool type_has_layout_decoration(const Type *T) {
+  return (isa<StructType>(T) || isa<ArrayType>(T));
+}
+
 } // namespace SPIRV
 
 // Bi-directional mappings between LLVM entities and (v-reg, machine function)
@@ -238,14 +248,49 @@ public:
     return findMI(SPIRV::irhandle_pointee(PointeeTy, AddressSpace), MF);
   }
 
-  template <typename T> bool add(const T *Obj, const MachineInstr *MI) {
+  bool add(const Value *V, const MachineInstr *MI) {
+    return add(SPIRV::handle(V), MI);
+  }
+
+  bool add(const Type *T, bool RequiresExplicitLayout, const MachineInstr *MI) {
+    if (RequiresExplicitLayout && SPIRV::type_has_layout_decoration(T)) {
+      return add(SPIRV::irhandle_explict_layout_type(T), MI);
+    }
+    return add(SPIRV::handle(T), MI);
+  }
+
+  bool add(const MachineInstr *Obj, const MachineInstr *MI) {
     return add(SPIRV::handle(Obj), MI);
   }
-  template <typename T> Register find(const T *Obj, const MachineFunction *MF) {
-    return find(SPIRV::handle(Obj), MF);
+
+  Register find(const Value *V, const MachineFunction *MF) {
+    return find(SPIRV::handle(V), MF);
+  }
+
+  Register find(const Type *T, bool RequiresExplicitLayout,
+                const MachineFunction *MF) {
+    if (RequiresExplicitLayout && SPIRV::type_has_layout_decoration(T))
+      return find(SPIRV::irhandle_explict_layout_type(T), MF);
+    return find(SPIRV::handle(T), MF);
+  }
+
+  Register find(const MachineInstr *MI, const MachineFunction *MF) {
+    return find(SPIRV::handle(MI), MF);
+  }
+
+  const MachineInstr *findMI(const Value *Obj, const MachineFunction *MF) {
+    return findMI(SPIRV::handle(Obj), MF);
+  }
+
+  const MachineInstr *findMI(const Type *T, bool RequiresExplicitLayout,
+                             const MachineFunction *MF) {
+    if (RequiresExplicitLayout && SPIRV::type_has_layout_decoration(T))
+      return findMI(SPIRV::irhandle_explict_layout_type(T), MF);
+    return findMI(SPIRV::handle(T), MF);
   }
-  template <typename T>
-  const MachineInstr *findMI(const T *Obj, const MachineFunction *MF) {
+
+  const MachineInstr *findMI(const MachineInstr *Obj,
+                             const MachineFunction *MF) {
     return findMI(SPIRV::handle(Obj), MF);
   }
 };
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index 216c3e2..8a87342 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -25,6 +25,42 @@
 
 using namespace llvm;
 
+// Returns true of the types logically match, as defined in
+// https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpCopyLogical.
+static bool typesLogicallyMatch(const SPIRVType *Ty1, const SPIRVType *Ty2,
+                                SPIRVGlobalRegistry &GR) {
+  if (Ty1->getOpcode() != Ty2->getOpcode())
+    return false;
+
+  if (Ty1->getNumOperands() != Ty2->getNumOperands())
+    return false;
+
+  if (Ty1->getOpcode() == SPIRV::OpTypeArray) {
+    // Array must have the same size.
+    if (Ty1->getOperand(2).getReg() != Ty2->getOperand(2).getReg())
+      return false;
+
+    SPIRVType *ElemType1 = GR.getSPIRVTypeForVReg(Ty1->getOperand(1).getReg());
+    SPIRVType *ElemType2 = GR.getSPIRVTypeForVReg(Ty2->getOperand(1).getReg());
+    return ElemType1 == ElemType2 ||
+           typesLogicallyMatch(ElemType1, ElemType2, GR);
+  }
+
+  if (Ty1->getOpcode() == SPIRV::OpTypeStruct) {
+    for (unsigned I = 1; I < Ty1->getNumOperands(); I++) {
+      SPIRVType *ElemType1 =
+          GR.getSPIRVTypeForVReg(Ty1->getOperand(I).getReg());
+      SPIRVType *ElemType2 =
+          GR.getSPIRVTypeForVReg(Ty2->getOperand(I).getReg());
+      if (ElemType1 != ElemType2 &&
+          !typesLogicallyMatch(ElemType1, ElemType2, GR))
+        return false;
+    }
+    return true;
+  }
+  return false;
+}
+
 unsigned SPIRVTargetLowering::getNumRegistersForCallingConv(
     LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
   // This code avoids CallLowering fail inside getVectorTypeBreakdown
@@ -374,6 +410,9 @@ void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const {
         // implies that %Op is a pointer to <ResType>
       case SPIRV::OpLoad:
         // OpLoad <ResType>, ptr %Op implies that %Op is a pointer to <ResType>
+        if (enforcePtrTypeCompatibility(MI, 2, 0))
+          break;
+
         validatePtrTypes(STI, MRI, GR, MI, 2,
                          GR.getSPIRVTypeForVReg(MI.getOperand(0).getReg()));
         break;
@@ -531,3 +570,58 @@ void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const {
   ProcessedMF.insert(&MF);
   TargetLowering::finalizeLowering(MF);
 }
+
+// Modifies either operand PtrOpIdx or OpIdx so that the pointee type of
+// PtrOpIdx matches the type for operand OpIdx. Returns true if they already
+// match or if the instruction was modified to make them match.
+bool SPIRVTargetLowering::enforcePtrTypeCompatibility(
+    MachineInstr &I, unsigned int PtrOpIdx, unsigned int OpIdx) const {
+  SPIRVGlobalRegistry &GR = *STI.getSPIRVGlobalRegistry();
+  SPIRVType *PtrType = GR.getResultType(I.getOperand(PtrOpIdx).getReg());
+  SPIRVType *PointeeType = GR.getPointeeType(PtrType);
+  SPIRVType *OpType = GR.getResultType(I.getOperand(OpIdx).getReg());
+
+  if (PointeeType == OpType)
+    return true;
+
+  if (typesLogicallyMatch(PointeeType, OpType, GR)) {
+    // Apply OpCopyLogical to OpIdx.
+    if (I.getOperand(OpIdx).isDef() &&
+        insertLogicalCopyOnResult(I, PointeeType)) {
+      return true;
+    }
+
+    llvm_unreachable("Unable to add OpCopyLogical yet.");
+    return false;
+  }
+
+  return false;
+}
+
+bool SPIRVTargetLowering::insertLogicalCopyOnResult(
+    MachineInstr &I, SPIRVType *NewResultType) const {
+  MachineRegisterInfo *MRI = &I.getMF()->getRegInfo();
+  SPIRVGlobalRegistry &GR = *STI.getSPIRVGlobalRegistry();
+
+  Register NewResultReg =
+      createVirtualRegister(NewResultType, &GR, MRI, *I.getMF());
+  Register NewTypeReg = GR.getSPIRVTypeID(NewResultType);
+
+  assert(std::distance(I.defs().begin(), I.defs().end()) == 1 &&
+         "Expected only one def");
+  MachineOperand &OldResult = *I.defs().begin();
+  Register OldResultReg = OldResult.getReg();
+  MachineOperand &OldType = *I.uses().begin();
+  Register OldTypeReg = OldType.getReg();
+
+  OldResult.setReg(NewResultReg);
+  OldType.setReg(NewTypeReg);
+
+  MachineIRBuilder MIB(*I.getNextNode());
+  return MIB.buildInstr(SPIRV::OpCopyLogical)
+      .addDef(OldResultReg)
+      .addUse(OldTypeReg)
+      .addUse(NewResultReg)
+      .constrainAllUses(*STI.getInstrInfo(), *STI.getRegisterInfo(),
+                        *STI.getRegBankInfo());
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
index eb78299..9025e6e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
@@ -71,6 +71,11 @@ public:
                                       EVT ConditionVT) const override {
     return ConditionVT.getSimpleVT();
   }
+
+  bool enforcePtrTypeCompatibility(MachineInstr &I, unsigned PtrOpIdx,
+                                   unsigned OpIdx) const;
+  bool insertLogicalCopyOnResult(MachineInstr &I,
+                                 SPIRVType *NewResultType) const;
 };
 } // namespace llvm
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index 2127148..f90b7af 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/Reg2Mem.h"
 #include "llvm/Transforms/Utils.h"
 #include <optional>
@@ -46,6 +47,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() {
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeGlobalISel(PR);
   initializeSPIRVModuleAnalysisPass(PR);
+  initializeSPIRVAsmPrinterPass(PR);
   initializeSPIRVConvergenceRegionAnalysisWrapperPassPass(PR);
   initializeSPIRVStructurizerPass(PR);
   initializeSPIRVPreLegalizerCombinerPass(PR);
@@ -190,6 +192,12 @@ void SPIRVPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
 
   if (TM.getSubtargetImpl()->isVulkanEnv()) {
+    // Vulkan does not allow address space casts. This pass is run to remove
+    // address space casts that can be removed.
+    // If an address space cast is not removed while targeting Vulkan, lowering
+    // will fail during MIR lowering.
+    addPass(createInferAddressSpacesPass());
+
     // 1.  Simplify loop for subsequent transformations. After this steps, loops
     // have the following properties:
     //  - loops have a single entry edge (pre-header to loop header).
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
index 4bb8d8d1..911c9a1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
@@ -48,6 +48,16 @@ public:
       return TTI::PSK_Software; // Arbitrary bit-width INT is not core SPIR-V.
     return TTI::PSK_FastHardware;
   }
+
+  unsigned getFlatAddressSpace() const override {
+    if (ST->isVulkanEnv())
+      return 0;
+    // FIXME: Clang has 2 distinct address space maps. One where
+    // default=4=Generic, and one with default=0=Function. This depends on the
+    // environment. For OpenCL, we don't need to run the InferAddrSpace pass, so
+    // we can return -1, but we might want to fix this.
+    return -1;
+  }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/Sparc/Sparc.h b/llvm/lib/Target/Sparc/Sparc.h
index 60f20e6..492889e 100644
--- a/llvm/lib/Target/Sparc/Sparc.h
+++ b/llvm/lib/Target/Sparc/Sparc.h
@@ -29,6 +29,7 @@ class SparcTargetMachine;
 FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
 FunctionPass *createSparcDelaySlotFillerPass();
 
+void initializeSparcAsmPrinterPass(PassRegistry &);
 void initializeSparcDAGToDAGISelLegacyPass(PassRegistry &);
 void initializeErrataWorkaroundPass(PassRegistry &);
 } // namespace llvm
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 3cf26ce..a30cf5a 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -47,7 +47,7 @@ class SparcAsmPrinter : public AsmPrinter {
 public:
   explicit SparcAsmPrinter(TargetMachine &TM,
                            std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)) {}
+      : AsmPrinter(TM, std::move(Streamer), ID) {}
 
   StringRef getPassName() const override { return "Sparc Assembly Printer"; }
 
@@ -73,6 +73,9 @@ public:
 
 private:
   void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
+
+public:
+  static char ID;
 };
 } // end of anonymous namespace
 
@@ -503,6 +506,11 @@ bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   return false;
 }
 
+char SparcAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(SparcAsmPrinter, "sparc-asm-printer", "Sparc Assembly Printer",
+                false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcAsmPrinter() {
   RegisterAsmPrinter<SparcAsmPrinter> X(getTheSparcTarget());
diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index abbd56a..d4d8cbb 100644
--- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -28,6 +28,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTarget() {
   RegisterTargetMachine<SparcelTargetMachine> Z(getTheSparcelTarget());
 
   PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeSparcAsmPrinterPass(PR);
   initializeSparcDAGToDAGISelLegacyPass(PR);
   initializeErrataWorkaroundPass(PR);
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h
index 4d6ec76..a0cf881 100644
--- a/llvm/lib/Target/SystemZ/SystemZ.h
+++ b/llvm/lib/Target/SystemZ/SystemZ.h
@@ -198,6 +198,7 @@ FunctionPass *createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZTDCPass();
 
+void initializeSystemZAsmPrinterPass(PassRegistry &);
 void initializeSystemZCopyPhysRegsPass(PassRegistry &);
 void initializeSystemZDAGToDAGISelLegacyPass(PassRegistry &);
 void initializeSystemZElimComparePass(PassRegistry &);
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index c1ffc28..57911ac 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -1742,6 +1742,11 @@ void SystemZAsmPrinter::emitFunctionEntryLabel() {
   AsmPrinter::emitFunctionEntryLabel();
 }
 
+char SystemZAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(SystemZAsmPrinter, "systemz-asm-printer",
+                "SystemZ Assembly Printer", false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmPrinter() {
   RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget());
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
index 47e7f67..cb101e4 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -24,6 +24,9 @@ class Module;
 class raw_ostream;
 
 class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
+public:
+  static char ID;
+
 private:
   MCSymbol *CurrentFnPPA1Sym;     // PPA1 Symbol.
   MCSymbol *CurrentFnEPMarkerSym; // Entry Point Marker.
@@ -97,7 +100,7 @@ private:
 
 public:
   SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), CurrentFnPPA1Sym(nullptr),
+      : AsmPrinter(TM, std::move(Streamer), ID), CurrentFnPPA1Sym(nullptr),
         CurrentFnEPMarkerSym(nullptr), PPA2Sym(nullptr),
         ADATable(TM.getPointerSize(0)) {}
 
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index f8b0fdc..0f790ca 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -40,6 +40,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
   // Register the target.
   RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget());
   auto &PR = *PassRegistry::getPassRegistry();
+  initializeSystemZAsmPrinterPass(PR);
   initializeSystemZElimComparePass(PR);
   initializeSystemZShortenInstPass(PR);
   initializeSystemZLongBranchPass(PR);
diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h
index ee76c51..ef0484e 100644
--- a/llvm/lib/Target/VE/VE.h
+++ b/llvm/lib/Target/VE/VE.h
@@ -29,6 +29,7 @@ class VETargetMachine;
 
 FunctionPass *createVEISelDag(VETargetMachine &TM);
 FunctionPass *createLVLGenPass();
+void initializeVEAsmPrinterPass(PassRegistry &);
 void initializeVEDAGToDAGISelLegacyPass(PassRegistry &);
 
 void LowerVEMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index 79d5840..ee347cd 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -41,7 +41,7 @@ class VEAsmPrinter : public AsmPrinter {
 
 public:
   explicit VEAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)) {}
+      : AsmPrinter(TM, std::move(Streamer), ID) {}
 
   StringRef getPassName() const override { return "VE Assembly Printer"; }
 
@@ -62,6 +62,8 @@ public:
                        const char *ExtraCode, raw_ostream &O) override;
   bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                              const char *ExtraCode, raw_ostream &O) override;
+
+  static char ID;
 };
 } // end of anonymous namespace
 
@@ -419,6 +421,11 @@ bool VEAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
   return false;
 }
 
+char VEAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(VEAsmPrinter, "ve-asm-printer", "VE Assembly Printer", false,
+                false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmPrinter() {
   RegisterAsmPrinter<VEAsmPrinter> X(getTheVETarget());
diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp
index a7ee4ff..664a54c 100644
--- a/llvm/lib/Target/VE/VETargetMachine.cpp
+++ b/llvm/lib/Target/VE/VETargetMachine.cpp
@@ -30,6 +30,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETarget() {
   RegisterTargetMachine<VETargetMachine> X(getTheVETarget());
 
   PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeVEAsmPrinterPass(PR);
   initializeVEDAGToDAGISelLegacyPass(PR);
 }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h
index 8f142fa..17481d7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -64,6 +64,7 @@ void initializeOptimizeReturnedPass(PassRegistry &);
 void initializeWebAssemblyRefTypeMem2LocalPass(PassRegistry &);
 void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
 void initializeWebAssemblyArgumentMovePass(PassRegistry &);
+void initializeWebAssemblyAsmPrinterPass(PassRegistry &);
 void initializeWebAssemblyCleanCodeAfterTrapPass(PassRegistry &);
 void initializeWebAssemblyCFGSortPass(PassRegistry &);
 void initializeWebAssemblyCFGStackifyPass(PassRegistry &);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 22803ff..c61ed3c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -19,6 +19,7 @@
 #include "MCTargetDesc/WebAssemblyTargetStreamer.h"
 #include "TargetInfo/WebAssemblyTargetInfo.h"
 #include "Utils/WebAssemblyTypeUtilities.h"
+#include "WebAssembly.h"
 #include "WebAssemblyMCInstLower.h"
 #include "WebAssemblyMachineFunctionInfo.h"
 #include "WebAssemblyRegisterInfo.h"
@@ -752,6 +753,11 @@ bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, ExtraCode, OS);
 }
 
+char WebAssemblyAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(WebAssemblyAsmPrinter, "webassembly-asm-printer",
+                "WebAssembly Assmebly Printer", false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyAsmPrinter() {
   RegisterAsmPrinter<WebAssemblyAsmPrinter> X(getTheWebAssemblyTarget32());
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
index 6a544ab..46063bb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -19,6 +19,10 @@ namespace llvm {
 class WebAssemblyTargetStreamer;
 
 class LLVM_LIBRARY_VISIBILITY WebAssemblyAsmPrinter final : public AsmPrinter {
+public:
+  static char ID;
+
+private:
   const WebAssemblySubtarget *Subtarget;
   const MachineRegisterInfo *MRI;
   WebAssemblyFunctionInfo *MFI;
@@ -27,8 +31,8 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyAsmPrinter final : public AsmPrinter {
 public:
   explicit WebAssemblyAsmPrinter(TargetMachine &TM,
                                  std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr), MRI(nullptr),
-        MFI(nullptr) {}
+      : AsmPrinter(TM, std::move(Streamer), ID), Subtarget(nullptr),
+        MRI(nullptr), MFI(nullptr) {}
 
   StringRef getPassName() const override {
     return "WebAssembly Assembly Printer";
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index cc96d7b..adb446b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -69,6 +69,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
   initializeOptimizeReturnedPass(PR);
   initializeWebAssemblyRefTypeMem2LocalPass(PR);
   initializeWebAssemblyArgumentMovePass(PR);
+  initializeWebAssemblyAsmPrinterPass(PR);
   initializeWebAssemblySetP2AlignOperandsPass(PR);
   initializeWebAssemblyReplacePhysRegsPass(PR);
   initializeWebAssemblyOptimizeLiveIntervalsPass(PR);
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index e6c0612..ba53ffd 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -176,6 +176,7 @@ void initializeFPSPass(PassRegistry &);
 void initializeFixupBWInstPassPass(PassRegistry &);
 void initializeFixupLEAPassPass(PassRegistry &);
 void initializeX86ArgumentStackSlotPassPass(PassRegistry &);
+void initializeX86AsmPrinterPass(PassRegistry &);
 void initializeX86FixupInstTuningPassPass(PassRegistry &);
 void initializeX86FixupVectorConstantsPassPass(PassRegistry &);
 void initializeWinEHStatePassPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 29ec14e..5f5bfc7 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -17,6 +17,7 @@
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "MCTargetDesc/X86TargetStreamer.h"
 #include "TargetInfo/X86TargetInfo.h"
+#include "X86.h"
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
@@ -53,7 +54,7 @@ using namespace llvm;
 
 X86AsmPrinter::X86AsmPrinter(TargetMachine &TM,
                              std::unique_ptr<MCStreamer> Streamer)
-    : AsmPrinter(TM, std::move(Streamer)), FM(*this) {}
+    : AsmPrinter(TM, std::move(Streamer), ID), FM(*this) {}
 
 //===----------------------------------------------------------------------===//
 // Primitive Helper Functions.
@@ -1086,6 +1087,11 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
   }
 }
 
+char X86AsmPrinter::ID = 0;
+
+INITIALIZE_PASS(X86AsmPrinter, "x86-asm-printer", "X86 Assembly Printer", false,
+                false)
+
 //===----------------------------------------------------------------------===//
 // Target Registry Stuff
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index 8dd7fa4..61d8f45 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -25,6 +25,10 @@ class X86Subtarget;
 class TargetMachine;
 
 class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
+public:
+  static char ID;
+
+private:
   const X86Subtarget *Subtarget = nullptr;
   FaultMaps FM;
   std::unique_ptr<MCCodeEmitter> CodeEmitter;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 975b94c..5fff9c3 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -102,6 +102,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
   initializeX86ReturnThunksPass(PR);
   initializeX86DAGToDAGISelLegacyPass(PR);
   initializeX86ArgumentStackSlotPassPass(PR);
+  initializeX86AsmPrinterPass(PR);
   initializeX86FixupInstTuningPassPass(PR);
   initializeX86FixupVectorConstantsPassPass(PR);
   initializeX86DynAllocaExpanderPass(PR);
diff --git a/llvm/lib/Target/XCore/CMakeLists.txt b/llvm/lib/Target/XCore/CMakeLists.txt
index 447f5c5..f411c65 100644
--- a/llvm/lib/Target/XCore/CMakeLists.txt
+++ b/llvm/lib/Target/XCore/CMakeLists.txt
@@ -8,6 +8,7 @@ tablegen(LLVM XCoreGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM XCoreGenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM XCoreGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM XCoreGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM XCoreGenSDNodeInfo.inc -gen-sd-node-info)
 tablegen(LLVM XCoreGenSubtargetInfo.inc -gen-subtarget)
 
 add_public_tablegen_target(XCoreCommonTableGen)
diff --git a/llvm/lib/Target/XCore/XCore.h b/llvm/lib/Target/XCore/XCore.h
index ad50f05..213bdbf 100644
--- a/llvm/lib/Target/XCore/XCore.h
+++ b/llvm/lib/Target/XCore/XCore.h
@@ -31,6 +31,7 @@ namespace llvm {
   FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM,
                                    CodeGenOptLevel OptLevel);
   ModulePass *createXCoreLowerThreadLocalPass();
+  void initializeXCoreAsmPrinterPass(PassRegistry &);
   void initializeXCoreDAGToDAGISelLegacyPass(PassRegistry &);
 
 } // end namespace llvm;
diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
index a1f7608..2aaeecf 100644
--- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -49,9 +49,11 @@ namespace {
     XCoreTargetStreamer &getTargetStreamer();
 
   public:
+    static char ID;
+
     explicit XCoreAsmPrinter(TargetMachine &TM,
                              std::unique_ptr<MCStreamer> Streamer)
-        : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(*this) {}
+        : AsmPrinter(TM, std::move(Streamer), ID), MCInstLowering(*this) {}
 
     StringRef getPassName() const override { return "XCore Assembly Printer"; }
 
@@ -288,6 +290,11 @@ void XCoreAsmPrinter::emitInstruction(const MachineInstr *MI) {
   EmitToStreamer(*OutStreamer, TmpInst);
 }
 
+char XCoreAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(XCoreAsmPrinter, "xcore-asm-printer", "XCore Assembly Printer",
+                false, false)
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreAsmPrinter() {
   RegisterAsmPrinter<XCoreAsmPrinter> X(getTheXCoreTarget());
diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
index ac19923..1c6e294 100644
--- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -40,33 +40,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "xcore-lower"
 
-const char *XCoreTargetLowering::
-getTargetNodeName(unsigned Opcode) const
-{
-  switch ((XCoreISD::NodeType)Opcode)
-  {
-    case XCoreISD::FIRST_NUMBER      : break;
-    case XCoreISD::BL                : return "XCoreISD::BL";
-    case XCoreISD::PCRelativeWrapper : return "XCoreISD::PCRelativeWrapper";
-    case XCoreISD::DPRelativeWrapper : return "XCoreISD::DPRelativeWrapper";
-    case XCoreISD::CPRelativeWrapper : return "XCoreISD::CPRelativeWrapper";
-    case XCoreISD::LDWSP             : return "XCoreISD::LDWSP";
-    case XCoreISD::STWSP             : return "XCoreISD::STWSP";
-    case XCoreISD::RETSP             : return "XCoreISD::RETSP";
-    case XCoreISD::LADD              : return "XCoreISD::LADD";
-    case XCoreISD::LSUB              : return "XCoreISD::LSUB";
-    case XCoreISD::LMUL              : return "XCoreISD::LMUL";
-    case XCoreISD::MACCU             : return "XCoreISD::MACCU";
-    case XCoreISD::MACCS             : return "XCoreISD::MACCS";
-    case XCoreISD::CRC8              : return "XCoreISD::CRC8";
-    case XCoreISD::BR_JT             : return "XCoreISD::BR_JT";
-    case XCoreISD::BR_JT32           : return "XCoreISD::BR_JT32";
-    case XCoreISD::FRAME_TO_ARGS_OFFSET : return "XCoreISD::FRAME_TO_ARGS_OFFSET";
-    case XCoreISD::EH_RETURN         : return "XCoreISD::EH_RETURN";
-  }
-  return nullptr;
-}
-
 XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
                                          const XCoreSubtarget &Subtarget)
     : TargetLowering(TM), TM(TM), Subtarget(Subtarget) {
diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.h b/llvm/lib/Target/XCore/XCoreISelLowering.h
index 1e036ea..bad6588 100644
--- a/llvm/lib/Target/XCore/XCoreISelLowering.h
+++ b/llvm/lib/Target/XCore/XCoreISelLowering.h
@@ -23,65 +23,6 @@ namespace llvm {
   // Forward delcarations
   class XCoreSubtarget;
 
-  namespace XCoreISD {
-    enum NodeType : unsigned {
-      // Start the numbering where the builtin ops and target ops leave off.
-      FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
-      // Branch and link (call)
-      BL,
-
-      // pc relative address
-      PCRelativeWrapper,
-
-      // dp relative address
-      DPRelativeWrapper,
-
-      // cp relative address
-      CPRelativeWrapper,
-
-      // Load word from stack
-      LDWSP,
-
-      // Store word to stack
-      STWSP,
-
-      // Corresponds to retsp instruction
-      RETSP,
-
-      // Corresponds to LADD instruction
-      LADD,
-
-      // Corresponds to LSUB instruction
-      LSUB,
-
-      // Corresponds to LMUL instruction
-      LMUL,
-
-      // Corresponds to MACCU instruction
-      MACCU,
-
-      // Corresponds to MACCS instruction
-      MACCS,
-
-      // Corresponds to CRC8 instruction
-      CRC8,
-
-      // Jumptable branch.
-      BR_JT,
-
-      // Jumptable branch using long branches for each entry.
-      BR_JT32,
-
-      // Offset from frame pointer to the first (possible) on-stack argument
-      FRAME_TO_ARGS_OFFSET,
-
-      // Exception handler return. The stack is restored to the first
-      // followed by a jump to the second argument.
-      EH_RETURN,
-    };
-  }
-
   //===--------------------------------------------------------------------===//
   // TargetLowering Implementation
   //===--------------------------------------------------------------------===//
@@ -109,10 +50,6 @@ namespace llvm {
     void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
                             SelectionDAG &DAG) const override;
 
-    /// getTargetNodeName - This method returns the name of a target specific
-    //  DAG node.
-    const char *getTargetNodeName(unsigned Opcode) const override;
-
     MachineBasicBlock *
     EmitInstrWithCustomInserter(MachineInstr &MI,
                                 MachineBasicBlock *MBB) const override;
diff --git a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index 0d09707..bc34ab4 100644
--- a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -10,11 +10,19 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "XCoreSelectionDAGInfo.h"
 #include "XCoreTargetMachine.h"
+
+#define GET_SDNODE_DESC
+#include "XCoreGenSDNodeInfo.inc"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "xcore-selectiondag-info"
 
+XCoreSelectionDAGInfo::XCoreSelectionDAGInfo()
+    : SelectionDAGGenTargetInfo(XCoreGenSDNodeInfo) {}
+
 SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
diff --git a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
index 2abf526..4a28482 100644
--- a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -15,10 +15,15 @@
 
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 
+#define GET_SDNODE_ENUM
+#include "XCoreGenSDNodeInfo.inc"
+
 namespace llvm {
 
-class XCoreSelectionDAGInfo : public SelectionDAGTargetInfo {
+class XCoreSelectionDAGInfo : public SelectionDAGGenTargetInfo {
 public:
+  XCoreSelectionDAGInfo();
+
   SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   SDValue Op3, Align Alignment, bool isVolatile,
@@ -27,6 +32,6 @@ public:
                                   MachinePointerInfo SrcPtrInfo) const override;
 };
 
-}
+} // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
index 4672174..3627b81a 100644
--- a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -105,6 +105,7 @@ void XCorePassConfig::addPreEmitPass() {
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTarget() {
   RegisterTargetMachine<XCoreTargetMachine> X(getTheXCoreTarget());
   PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeXCoreAsmPrinterPass(PR);
   initializeXCoreDAGToDAGISelLegacyPass(PR);
   initializeXCoreLowerThreadLocalPass(PR);
 }
diff --git a/llvm/lib/Target/Xtensa/Xtensa.h b/llvm/lib/Target/Xtensa/Xtensa.h
index da44e30..4d57bfd 100644
--- a/llvm/lib/Target/Xtensa/Xtensa.h
+++ b/llvm/lib/Target/Xtensa/Xtensa.h
@@ -19,10 +19,12 @@
 #include "llvm/Support/CodeGen.h"
 
 namespace llvm {
-class XtensaTargetMachine;
 class FunctionPass;
+class PassRegistry;
+class XtensaTargetMachine;
 
 FunctionPass *createXtensaISelDag(XtensaTargetMachine &TM,
                                   CodeGenOptLevel OptLevel);
+void initializeXtensaAsmPrinterPass(PassRegistry &);
 } // namespace llvm
 #endif // LLVM_LIB_TARGET_XTENSA_XTENSA_H
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
index a622ea2..9182ea2 100644
--- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
@@ -312,6 +312,11 @@ void XtensaAsmPrinter::lowerToMCInst(const MachineInstr *MI,
   }
 }
 
+char XtensaAsmPrinter::ID = 0;
+
+INITIALIZE_PASS(XtensaAsmPrinter, "xtensa-asm-printer",
+                "Xtensa Assembly Printer", false, false)
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXtensaAsmPrinter() {
   RegisterAsmPrinter<XtensaAsmPrinter> A(getTheXtensaTarget());
 }
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h
index 1137309..5ab2f79 100644
--- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h
@@ -29,9 +29,11 @@ class LLVM_LIBRARY_VISIBILITY XtensaAsmPrinter : public AsmPrinter {
   const MCSubtargetInfo *STI;
 
 public:
+  static char ID;
+
   explicit XtensaAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), STI(TM.getMCSubtargetInfo()) {}
+      : AsmPrinter(TM, std::move(Streamer), ID), STI(TM.getMCSubtargetInfo()) {}
 
   StringRef getPassName() const override { return "Xtensa Assembly Printer"; }
   void emitInstruction(const MachineInstr *MI) override;
diff --git a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp
index 634ada3..8d2dca6 100644
--- a/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaTargetMachine.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/PassRegistry.h"
 #include "llvm/Transforms/Scalar.h"
 #include <optional>
 
@@ -27,6 +28,8 @@ using namespace llvm;
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXtensaTarget() {
   // Register the target.
   RegisterTargetMachine<XtensaTargetMachine> A(getTheXtensaTarget());
+  PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeXtensaAsmPrinterPass(PR);
 }
 
 static std::string computeDataLayout(const Triple &TT, StringRef CPU,
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 74363f8..6a559ff 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -299,6 +299,8 @@ StringRef Triple::getOSTypeName(OSType Kind) {
   case Linux: return "linux";
   case Lv2: return "lv2";
   case MacOSX: return "macosx";
+  case Managarm:
+    return "managarm";
   case Mesa3D: return "mesa3d";
   case NVCL: return "nvcl";
   case NaCl: return "nacl";
@@ -384,6 +386,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) {
     return "pauthtest";
   case LLVM:
     return "llvm";
+  case Mlibc:
+    return "mlibc";
   }
 
   llvm_unreachable("Invalid EnvironmentType!");
@@ -678,6 +682,7 @@ static Triple::OSType parseOS(StringRef OSName) {
     .StartsWith("linux", Triple::Linux)
     .StartsWith("lv2", Triple::Lv2)
     .StartsWith("macos", Triple::MacOSX)
+    .StartsWith("managarm", Triple::Managarm)
     .StartsWith("netbsd", Triple::NetBSD)
     .StartsWith("openbsd", Triple::OpenBSD)
     .StartsWith("solaris", Triple::Solaris)
@@ -766,6 +771,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
       .StartsWith("ohos", Triple::OpenHOS)
       .StartsWith("pauthtest", Triple::PAuthTest)
       .StartsWith("llvm", Triple::LLVM)
+      .StartsWith("mlibc", Triple::Mlibc)
       .Default(Triple::UnknownEnvironment);
 }
 
diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
index 40164a3..5a87cf8 100644
--- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
+++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
@@ -231,45 +231,55 @@ PreservedAnalyses
 }
 
 static constexpr std::pair<StringLiteral, StringLiteral> ReplaceMap[]{
-  {"aligned_alloc",             "__hipstdpar_aligned_alloc"},
-  {"calloc",                    "__hipstdpar_calloc"},
-  {"free",                      "__hipstdpar_free"},
-  {"malloc",                    "__hipstdpar_malloc"},
-  {"memalign",                  "__hipstdpar_aligned_alloc"},
-  {"posix_memalign",            "__hipstdpar_posix_aligned_alloc"},
-  {"realloc",                   "__hipstdpar_realloc"},
-  {"reallocarray",              "__hipstdpar_realloc_array"},
-  {"_ZdaPv",                    "__hipstdpar_operator_delete"},
-  {"_ZdaPvm",                   "__hipstdpar_operator_delete_sized"},
-  {"_ZdaPvSt11align_val_t",     "__hipstdpar_operator_delete_aligned"},
-  {"_ZdaPvmSt11align_val_t",    "__hipstdpar_operator_delete_aligned_sized"},
-  {"_ZdlPv",                    "__hipstdpar_operator_delete"},
-  {"_ZdlPvm",                   "__hipstdpar_operator_delete_sized"},
-  {"_ZdlPvSt11align_val_t",     "__hipstdpar_operator_delete_aligned"},
-  {"_ZdlPvmSt11align_val_t",    "__hipstdpar_operator_delete_aligned_sized"},
-  {"_Znam",                     "__hipstdpar_operator_new"},
-  {"_ZnamRKSt9nothrow_t",       "__hipstdpar_operator_new_nothrow"},
-  {"_ZnamSt11align_val_t",      "__hipstdpar_operator_new_aligned"},
-  {"_ZnamSt11align_val_tRKSt9nothrow_t",
-                                "__hipstdpar_operator_new_aligned_nothrow"},
-
-  {"_Znwm",                     "__hipstdpar_operator_new"},
-  {"_ZnwmRKSt9nothrow_t",       "__hipstdpar_operator_new_nothrow"},
-  {"_ZnwmSt11align_val_t",      "__hipstdpar_operator_new_aligned"},
-  {"_ZnwmSt11align_val_tRKSt9nothrow_t",
-                                "__hipstdpar_operator_new_aligned_nothrow"},
-  {"__builtin_calloc",          "__hipstdpar_calloc"},
-  {"__builtin_free",            "__hipstdpar_free"},
-  {"__builtin_malloc",          "__hipstdpar_malloc"},
-  {"__builtin_operator_delete", "__hipstdpar_operator_delete"},
-  {"__builtin_operator_new",    "__hipstdpar_operator_new"},
-  {"__builtin_realloc",         "__hipstdpar_realloc"},
-  {"__libc_calloc",             "__hipstdpar_calloc"},
-  {"__libc_free",               "__hipstdpar_free"},
-  {"__libc_malloc",             "__hipstdpar_malloc"},
-  {"__libc_memalign",           "__hipstdpar_aligned_alloc"},
-  {"__libc_realloc",            "__hipstdpar_realloc"}
-};
+    {"aligned_alloc", "__hipstdpar_aligned_alloc"},
+    {"calloc", "__hipstdpar_calloc"},
+    {"free", "__hipstdpar_free"},
+    {"malloc", "__hipstdpar_malloc"},
+    {"memalign", "__hipstdpar_aligned_alloc"},
+    {"mmap", "__hipstdpar_mmap"},
+    {"munmap", "__hipstdpar_munmap"},
+    {"posix_memalign", "__hipstdpar_posix_aligned_alloc"},
+    {"realloc", "__hipstdpar_realloc"},
+    {"reallocarray", "__hipstdpar_realloc_array"},
+    {"_ZdaPv", "__hipstdpar_operator_delete"},
+    {"_ZdaPvm", "__hipstdpar_operator_delete_sized"},
+    {"_ZdaPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
+    {"_ZdaPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
+    {"_ZdlPv", "__hipstdpar_operator_delete"},
+    {"_ZdlPvm", "__hipstdpar_operator_delete_sized"},
+    {"_ZdlPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
+    {"_ZdlPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
+    {"_Znam", "__hipstdpar_operator_new"},
+    {"_ZnamRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
+    {"_ZnamSt11align_val_t", "__hipstdpar_operator_new_aligned"},
+    {"_ZnamSt11align_val_tRKSt9nothrow_t",
+     "__hipstdpar_operator_new_aligned_nothrow"},
+
+    {"_Znwm", "__hipstdpar_operator_new"},
+    {"_ZnwmRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
+    {"_ZnwmSt11align_val_t", "__hipstdpar_operator_new_aligned"},
+    {"_ZnwmSt11align_val_tRKSt9nothrow_t",
+     "__hipstdpar_operator_new_aligned_nothrow"},
+    {"__builtin_calloc", "__hipstdpar_calloc"},
+    {"__builtin_free", "__hipstdpar_free"},
+    {"__builtin_malloc", "__hipstdpar_malloc"},
+    {"__builtin_operator_delete", "__hipstdpar_operator_delete"},
+    {"__builtin_operator_new", "__hipstdpar_operator_new"},
+    {"__builtin_realloc", "__hipstdpar_realloc"},
+    {"__libc_calloc", "__hipstdpar_calloc"},
+    {"__libc_free", "__hipstdpar_free"},
+    {"__libc_malloc", "__hipstdpar_malloc"},
+    {"__libc_memalign", "__hipstdpar_aligned_alloc"},
+    {"__libc_realloc", "__hipstdpar_realloc"}};
+
+static constexpr std::pair<StringLiteral, StringLiteral> HiddenMap[]{
+    // hidden_malloc and hidden_free are only kept for backwards compatibility /
+    // legacy purposes, and we should remove them in the future
+    {"__hipstdpar_hidden_malloc", "__libc_malloc"},
+    {"__hipstdpar_hidden_free", "__libc_free"},
+    {"__hipstdpar_hidden_memalign", "__libc_memalign"},
+    {"__hipstdpar_hidden_mmap", "mmap"},
+    {"__hipstdpar_hidden_munmap", "munmap"}};
 
 PreservedAnalyses
 HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
@@ -299,19 +309,14 @@ HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
     }
   }
 
-  if (auto F = M.getFunction("__hipstdpar_hidden_malloc")) {
-    auto LibcMalloc = M.getOrInsertFunction(
-        "__libc_malloc", F->getFunctionType(), F->getAttributes());
-    F->replaceAllUsesWith(LibcMalloc.getCallee());
+  for (auto &&HR : HiddenMap) {
+    if (auto F = M.getFunction(HR.first)) {
+      auto R = M.getOrInsertFunction(HR.second, F->getFunctionType(),
+                                     F->getAttributes());
+      F->replaceAllUsesWith(R.getCallee());
 
-    eraseFromModule(*F);
-  }
-  if (auto F = M.getFunction("__hipstdpar_hidden_free")) {
-    auto LibcFree = M.getOrInsertFunction("__libc_free", F->getFunctionType(),
-                                          F->getAttributes());
-    F->replaceAllUsesWith(LibcFree.getCallee());
-
-    eraseFromModule(*F);
+      eraseFromModule(*F);
+    }
   }
 
   return PreservedAnalyses::none();
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 40881fd..e432f0c 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -3285,7 +3285,7 @@ InformationCache::FunctionInfo::~FunctionInfo() {
     It.getSecond()->~InstructionVectorTy();
 }
 
-const ArrayRef<Function *>
+ArrayRef<Function *>
 InformationCache::getIndirectlyCallableFunctions(Attributor &A) const {
   assert(A.isClosedWorldModule() && "Cannot see all indirect callees!");
   return IndirectlyCallableFunctions;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3e78b20..3d35bf7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2300,6 +2300,18 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         return BitOp;
     }
 
+    // fshl(X, X, Neg(Y)) --> fshr(X, X, Y)
+    // fshr(X, X, Neg(Y)) --> fshl(X, X, Y)
+    // if BitWidth is a power-of-2
+    Value *Y;
+    if (Op0 == Op1 && isPowerOf2_32(BitWidth) &&
+        match(II->getArgOperand(2), m_Neg(m_Value(Y)))) {
+      Module *Mod = II->getModule();
+      Function *OppositeShift = Intrinsic::getOrInsertDeclaration(
+          Mod, IID == Intrinsic::fshl ? Intrinsic::fshr : Intrinsic::fshl, Ty);
+      return CallInst::Create(OppositeShift, {Op0, Op1, Y});
+    }
+
     // fshl(X, 0, Y) --> shl(X, and(Y, BitWidth - 1)) if bitwidth is a
     // power-of-2
     if (IID == Intrinsic::fshl && isPowerOf2_32(BitWidth) &&
@@ -3799,6 +3811,21 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::frexp: {
+    Value *X;
+    // The first result is idempotent with the added complication of the struct
+    // return, and the second result is zero because the value is already
+    // normalized.
+    if (match(II->getArgOperand(0), m_ExtractValue<0>(m_Value(X)))) {
+      if (match(X, m_Intrinsic<Intrinsic::frexp>(m_Value()))) {
+        X = Builder.CreateInsertValue(
+            X, Constant::getNullValue(II->getType()->getStructElementType(1)),
+            1);
+        return replaceInstUsesWith(*II, X);
+      }
+    }
+    break;
+  }
   default: {
     // Handle target specific intrinsics
     std::optional<Instruction *> V = targetInstCombineIntrinsic(*II);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index b5c1ee0..a48854a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1894,14 +1894,9 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
         // Try to use shuffle-of-operand in place of an operand:
         // bo X, Y --> bo (shuf X), Y
         // bo X, Y --> bo X, (shuf Y)
-
-        Value *OtherOp = MatchShufAsOp0 ? Y : X;
-        if (!OtherOp->hasUseList())
-          return nullptr;
-
         BinaryOperator::BinaryOps Opcode = BO->getOpcode();
         Value *ShufOp = MatchShufAsOp0 ? X : Y;
-
+        Value *OtherOp = MatchShufAsOp0 ? Y : X;
         for (User *U : OtherOp->users()) {
           ArrayRef<int> Mask;
           auto Shuf = m_Shuffle(m_Specific(ShufOp), m_Value(), m_Mask(Mask));
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 206d41e..d30c609 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1831,7 +1831,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
     // Handle some cases that can't be fully simplified, but where we know that
     // the two instructions will fold into one.
     auto WillFold = [&]() {
-      if (!InVal->hasUseList() || !InVal->hasOneUser())
+      if (!InVal->hasOneUser())
         return false;
 
       // icmp of ucmp/scmp with constant will fold to icmp.
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index e522696..9c6660d 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -86,6 +86,7 @@ const char SanCovPCsSectionName[] = "sancov_pcs";
 const char SanCovCFsSectionName[] = "sancov_cfs";
 const char SanCovCallbackGateSectionName[] = "sancov_gate";
 
+const char SanCovStackDepthCallbackName[] = "__sanitizer_cov_stack_depth";
 const char SanCovLowestStackName[] = "__sancov_lowest_stack";
 const char SanCovCallbackGateName[] = "__sancov_should_track";
 
@@ -118,6 +119,11 @@ static cl::opt<bool>
                          cl::Hidden);
 
 static cl::opt<bool>
+    ClSancovDropCtors("sanitizer-coverage-drop-ctors",
+                      cl::desc("do not emit module ctors for global counters"),
+                      cl::Hidden);
+
+static cl::opt<bool>
     ClInlineBoolFlag("sanitizer-coverage-inline-bool-flag",
                      cl::desc("sets a boolean flag for every edge"),
                      cl::Hidden);
@@ -152,6 +158,12 @@ static cl::opt<bool> ClStackDepth("sanitizer-coverage-stack-depth",
                                   cl::desc("max stack depth tracing"),
                                   cl::Hidden);
 
+static cl::opt<int> ClStackDepthCallbackMin(
+    "sanitizer-coverage-stack-depth-callback-min",
+    cl::desc("max stack depth tracing should use callback and only when "
+             "stack depth more than specified"),
+    cl::Hidden);
+
 static cl::opt<bool>
     ClCollectCF("sanitizer-coverage-control-flow",
                 cl::desc("collect control flow for each function"), cl::Hidden);
@@ -202,6 +214,8 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
   Options.PCTable |= ClCreatePCTable;
   Options.NoPrune |= !ClPruneBlocks;
   Options.StackDepth |= ClStackDepth;
+  Options.StackDepthCallbackMin = std::max(Options.StackDepthCallbackMin,
+                                           ClStackDepthCallbackMin.getValue());
   Options.TraceLoads |= ClLoadTracing;
   Options.TraceStores |= ClStoreTracing;
   Options.GatedCallbacks |= ClGatedCallbacks;
@@ -271,6 +285,7 @@ private:
   DomTreeCallback DTCallback;
   PostDomTreeCallback PDTCallback;
 
+  FunctionCallee SanCovStackDepthCallback;
   FunctionCallee SanCovTracePCIndir;
   FunctionCallee SanCovTracePC, SanCovTracePCGuard;
   std::array<FunctionCallee, 4> SanCovTraceCmpFunction;
@@ -288,11 +303,11 @@ private:
   LLVMContext *C;
   const DataLayout *DL;
 
-  GlobalVariable *FunctionGuardArray;  // for trace-pc-guard.
-  GlobalVariable *Function8bitCounterArray;  // for inline-8bit-counters.
-  GlobalVariable *FunctionBoolArray;         // for inline-bool-flag.
-  GlobalVariable *FunctionPCsArray;  // for pc-table.
-  GlobalVariable *FunctionCFsArray;  // for control flow table
+  GlobalVariable *FunctionGuardArray;       // for trace-pc-guard.
+  GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
+  GlobalVariable *FunctionBoolArray;        // for inline-bool-flag.
+  GlobalVariable *FunctionPCsArray;         // for pc-table.
+  GlobalVariable *FunctionCFsArray;         // for control flow table
   SmallVector<GlobalValue *, 20> GlobalsToAppendToUsed;
   SmallVector<GlobalValue *, 20> GlobalsToAppendToCompilerUsed;
 
@@ -336,13 +351,11 @@ ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section,
   GlobalValue::LinkageTypes Linkage = TargetTriple.isOSBinFormatCOFF()
                                           ? GlobalVariable::ExternalLinkage
                                           : GlobalVariable::ExternalWeakLinkage;
-  GlobalVariable *SecStart =
-      new GlobalVariable(M, Ty, false, Linkage, nullptr,
-                         getSectionStart(Section));
+  GlobalVariable *SecStart = new GlobalVariable(M, Ty, false, Linkage, nullptr,
+                                                getSectionStart(Section));
   SecStart->setVisibility(GlobalValue::HiddenVisibility);
-  GlobalVariable *SecEnd =
-      new GlobalVariable(M, Ty, false, Linkage, nullptr,
-                         getSectionEnd(Section));
+  GlobalVariable *SecEnd = new GlobalVariable(M, Ty, false, Linkage, nullptr,
+                                              getSectionEnd(Section));
   SecEnd->setVisibility(GlobalValue::HiddenVisibility);
   IRBuilder<> IRB(M.getContext());
   if (!TargetTriple.isOSBinFormatCOFF())
@@ -358,6 +371,8 @@ ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section,
 Function *ModuleSanitizerCoverage::CreateInitCallsForSections(
     Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty,
     const char *Section) {
+  if (ClSancovDropCtors)
+    return nullptr;
   auto SecStartEnd = CreateSecStartEnd(M, Section, Ty);
   auto SecStart = SecStartEnd.first;
   auto SecEnd = SecStartEnd.second;
@@ -447,25 +462,16 @@ bool ModuleSanitizerCoverage::instrumentModule() {
 
   // Loads.
   SanCovLoadFunction[0] = M.getOrInsertFunction(SanCovLoad1, VoidTy, PtrTy);
-  SanCovLoadFunction[1] =
-      M.getOrInsertFunction(SanCovLoad2, VoidTy, PtrTy);
-  SanCovLoadFunction[2] =
-      M.getOrInsertFunction(SanCovLoad4, VoidTy, PtrTy);
-  SanCovLoadFunction[3] =
-      M.getOrInsertFunction(SanCovLoad8, VoidTy, PtrTy);
-  SanCovLoadFunction[4] =
-      M.getOrInsertFunction(SanCovLoad16, VoidTy, PtrTy);
+  SanCovLoadFunction[1] = M.getOrInsertFunction(SanCovLoad2, VoidTy, PtrTy);
+  SanCovLoadFunction[2] = M.getOrInsertFunction(SanCovLoad4, VoidTy, PtrTy);
+  SanCovLoadFunction[3] = M.getOrInsertFunction(SanCovLoad8, VoidTy, PtrTy);
+  SanCovLoadFunction[4] = M.getOrInsertFunction(SanCovLoad16, VoidTy, PtrTy);
   // Stores.
-  SanCovStoreFunction[0] =
-      M.getOrInsertFunction(SanCovStore1, VoidTy, PtrTy);
-  SanCovStoreFunction[1] =
-      M.getOrInsertFunction(SanCovStore2, VoidTy, PtrTy);
-  SanCovStoreFunction[2] =
-      M.getOrInsertFunction(SanCovStore4, VoidTy, PtrTy);
-  SanCovStoreFunction[3] =
-      M.getOrInsertFunction(SanCovStore8, VoidTy, PtrTy);
-  SanCovStoreFunction[4] =
-      M.getOrInsertFunction(SanCovStore16, VoidTy, PtrTy);
+  SanCovStoreFunction[0] = M.getOrInsertFunction(SanCovStore1, VoidTy, PtrTy);
+  SanCovStoreFunction[1] = M.getOrInsertFunction(SanCovStore2, VoidTy, PtrTy);
+  SanCovStoreFunction[2] = M.getOrInsertFunction(SanCovStore4, VoidTy, PtrTy);
+  SanCovStoreFunction[3] = M.getOrInsertFunction(SanCovStore8, VoidTy, PtrTy);
+  SanCovStoreFunction[4] = M.getOrInsertFunction(SanCovStore16, VoidTy, PtrTy);
 
   {
     AttributeList AL;
@@ -514,6 +520,9 @@ bool ModuleSanitizerCoverage::instrumentModule() {
   SanCovTracePCGuard =
       M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, PtrTy);
 
+  SanCovStackDepthCallback =
+      M.getOrInsertFunction(SanCovStackDepthCallbackName, VoidTy);
+
   for (auto &F : M)
     instrumentFunction(F);
 
@@ -534,16 +543,16 @@ bool ModuleSanitizerCoverage::instrumentModule() {
   }
   if (Ctor && Options.PCTable) {
     auto SecStartEnd = CreateSecStartEnd(M, SanCovPCsSectionName, IntptrTy);
-    FunctionCallee InitFunction = declareSanitizerInitFunction(
-        M, SanCovPCsInitName, {PtrTy, PtrTy});
+    FunctionCallee InitFunction =
+        declareSanitizerInitFunction(M, SanCovPCsInitName, {PtrTy, PtrTy});
     IRBuilder<> IRBCtor(Ctor->getEntryBlock().getTerminator());
     IRBCtor.CreateCall(InitFunction, {SecStartEnd.first, SecStartEnd.second});
   }
 
   if (Ctor && Options.CollectControlFlow) {
     auto SecStartEnd = CreateSecStartEnd(M, SanCovCFsSectionName, IntptrTy);
-    FunctionCallee InitFunction = declareSanitizerInitFunction(
-        M, SanCovCFsInitName, {PtrTy, PtrTy});
+    FunctionCallee InitFunction =
+        declareSanitizerInitFunction(M, SanCovCFsInitName, {PtrTy, PtrTy});
     IRBuilder<> IRBCtor(Ctor->getEntryBlock().getTerminator());
     IRBCtor.CreateCall(InitFunction, {SecStartEnd.first, SecStartEnd.second});
   }
@@ -600,8 +609,8 @@ static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB,
 
   // Do not instrument full dominators, or full post-dominators with multiple
   // predecessors.
-  return !isFullDominator(BB, DT)
-    && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor());
+  return !isFullDominator(BB, DT) &&
+         !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor());
 }
 
 // Returns true iff From->To is a backedge.
@@ -776,16 +785,16 @@ ModuleSanitizerCoverage::CreatePCArray(Function &F,
   for (size_t i = 0; i < N; i++) {
     if (&F.getEntryBlock() == AllBlocks[i]) {
       PCs.push_back((Constant *)IRB.CreatePointerCast(&F, PtrTy));
-      PCs.push_back((Constant *)IRB.CreateIntToPtr(
-          ConstantInt::get(IntptrTy, 1), PtrTy));
+      PCs.push_back(
+          (Constant *)IRB.CreateIntToPtr(ConstantInt::get(IntptrTy, 1), PtrTy));
     } else {
       PCs.push_back((Constant *)IRB.CreatePointerCast(
           BlockAddress::get(AllBlocks[i]), PtrTy));
       PCs.push_back(Constant::getNullValue(PtrTy));
     }
   }
-  auto *PCArray = CreateFunctionLocalArrayInSection(N * 2, F, PtrTy,
-                                                    SanCovPCsSectionName);
+  auto *PCArray =
+      CreateFunctionLocalArrayInSection(N * 2, F, PtrTy, SanCovPCsSectionName);
   PCArray->setInitializer(
       ConstantArray::get(ArrayType::get(PtrTy, N * 2), PCs));
   PCArray->setConstant(true);
@@ -840,7 +849,8 @@ bool ModuleSanitizerCoverage::InjectCoverage(Function &F,
                                              ArrayRef<BasicBlock *> AllBlocks,
                                              Value *&FunctionGateCmp,
                                              bool IsLeafFunc) {
-  if (AllBlocks.empty()) return false;
+  if (AllBlocks.empty())
+    return false;
   CreateFunctionLocalArrays(F, AllBlocks);
   for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
     InjectCoverageAtBlock(F, *AllBlocks[i], i, FunctionGateCmp, IsLeafFunc);
@@ -923,13 +933,14 @@ void ModuleSanitizerCoverage::InjectTraceForDiv(
   for (auto *BO : DivTraceTargets) {
     InstrumentationIRBuilder IRB(BO);
     Value *A1 = BO->getOperand(1);
-    if (isa<ConstantInt>(A1)) continue;
+    if (isa<ConstantInt>(A1))
+      continue;
     if (!A1->getType()->isIntegerTy())
       continue;
     uint64_t TypeSize = DL->getTypeStoreSizeInBits(A1->getType());
-    int CallbackIdx = TypeSize == 32 ? 0 :
-        TypeSize == 64 ? 1 : -1;
-    if (CallbackIdx < 0) continue;
+    int CallbackIdx = TypeSize == 32 ? 0 : TypeSize == 64 ? 1 : -1;
+    if (CallbackIdx < 0)
+      continue;
     auto Ty = Type::getIntNTy(*C, TypeSize);
     IRB.CreateCall(SanCovTraceDivFunction[CallbackIdx],
                    {IRB.CreateIntCast(A1, Ty, true)});
@@ -987,17 +998,20 @@ void ModuleSanitizerCoverage::InjectTraceForCmp(
       if (!A0->getType()->isIntegerTy())
         continue;
       uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType());
-      int CallbackIdx = TypeSize == 8 ? 0 :
-                        TypeSize == 16 ? 1 :
-                        TypeSize == 32 ? 2 :
-                        TypeSize == 64 ? 3 : -1;
-      if (CallbackIdx < 0) continue;
+      int CallbackIdx = TypeSize == 8    ? 0
+                        : TypeSize == 16 ? 1
+                        : TypeSize == 32 ? 2
+                        : TypeSize == 64 ? 3
+                                         : -1;
+      if (CallbackIdx < 0)
+        continue;
       // __sanitizer_cov_trace_cmp((type_size << 32) | predicate, A0, A1);
       auto CallbackFunc = SanCovTraceCmpFunction[CallbackIdx];
       bool FirstIsConst = isa<ConstantInt>(A0);
       bool SecondIsConst = isa<ConstantInt>(A1);
       // If both are const, then we don't need such a comparison.
-      if (FirstIsConst && SecondIsConst) continue;
+      if (FirstIsConst && SecondIsConst)
+        continue;
       // If only one is const, then make it the first callback argument.
       if (FirstIsConst || SecondIsConst) {
         CallbackFunc = SanCovTraceConstCmpFunction[CallbackIdx];
@@ -1078,22 +1092,65 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
     Store->setNoSanitizeMetadata();
   }
   if (Options.StackDepth && IsEntryBB && !IsLeafFunc) {
-    // Check stack depth.  If it's the deepest so far, record it.
     Module *M = F.getParent();
-    auto FrameAddrPtr = IRB.CreateIntrinsic(
-        Intrinsic::frameaddress,
-        IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()),
-        {Constant::getNullValue(Int32Ty)});
-    auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy);
-    auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack);
-    auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack);
-    auto ThenTerm = SplitBlockAndInsertIfThen(
-        IsStackLower, &*IP, false,
-        MDBuilder(IRB.getContext()).createUnlikelyBranchWeights());
-    IRBuilder<> ThenIRB(ThenTerm);
-    auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack);
-    LowestStack->setNoSanitizeMetadata();
-    Store->setNoSanitizeMetadata();
+    const DataLayout &DL = M->getDataLayout();
+
+    if (Options.StackDepthCallbackMin) {
+      // In callback mode, only add call when stack depth reaches minimum.
+      int EstimatedStackSize = 0;
+      // If dynamic alloca found, always add call.
+      bool HasDynamicAlloc = false;
+      // Find an insertion point after last "alloca".
+      llvm::Instruction *InsertBefore = nullptr;
+
+      // Examine all allocas in the basic block. since we're too early
+      // to have results from Intrinsic::frameaddress, we have to manually
+      // estimate the stack size.
+      for (auto &I : BB) {
+        if (auto *AI = dyn_cast<AllocaInst>(&I)) {
+          // Move potential insertion point past the "alloca".
+          InsertBefore = AI->getNextNode();
+
+          // Make an estimate on the stack usage.
+          if (AI->isStaticAlloca()) {
+            uint32_t Bytes = DL.getTypeAllocSize(AI->getAllocatedType());
+            if (AI->isArrayAllocation()) {
+              if (const ConstantInt *arraySize =
+                      dyn_cast<ConstantInt>(AI->getArraySize())) {
+                Bytes *= arraySize->getZExtValue();
+              } else {
+                HasDynamicAlloc = true;
+              }
+            }
+            EstimatedStackSize += Bytes;
+          } else {
+            HasDynamicAlloc = true;
+          }
+        }
+      }
+
+      if (HasDynamicAlloc ||
+          EstimatedStackSize >= Options.StackDepthCallbackMin) {
+        if (InsertBefore)
+          IRB.SetInsertPoint(InsertBefore);
+        IRB.CreateCall(SanCovStackDepthCallback)->setCannotMerge();
+      }
+    } else {
+      // Check stack depth.  If it's the deepest so far, record it.
+      auto FrameAddrPtr = IRB.CreateIntrinsic(
+          Intrinsic::frameaddress, IRB.getPtrTy(DL.getAllocaAddrSpace()),
+          {Constant::getNullValue(Int32Ty)});
+      auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy);
+      auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack);
+      auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack);
+      auto ThenTerm = SplitBlockAndInsertIfThen(
+          IsStackLower, &*IP, false,
+          MDBuilder(IRB.getContext()).createUnlikelyBranchWeights());
+      IRBuilder<> ThenIRB(ThenTerm);
+      auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack);
+      LowestStack->setNoSanitizeMetadata();
+      Store->setNoSanitizeMetadata();
+    }
   }
 }
 
@@ -1136,13 +1193,13 @@ void ModuleSanitizerCoverage::createFunctionControlFlow(Function &F) {
     if (&BB == &F.getEntryBlock())
       CFs.push_back((Constant *)IRB.CreatePointerCast(&F, PtrTy));
     else
-      CFs.push_back((Constant *)IRB.CreatePointerCast(BlockAddress::get(&BB),
-                                                      PtrTy));
+      CFs.push_back(
+          (Constant *)IRB.CreatePointerCast(BlockAddress::get(&BB), PtrTy));
 
     for (auto SuccBB : successors(&BB)) {
       assert(SuccBB != &F.getEntryBlock());
-      CFs.push_back((Constant *)IRB.CreatePointerCast(BlockAddress::get(SuccBB),
-                                                      PtrTy));
+      CFs.push_back(
+          (Constant *)IRB.CreatePointerCast(BlockAddress::get(SuccBB), PtrTy));
     }
 
     CFs.push_back((Constant *)Constant::getNullValue(PtrTy));
@@ -1156,8 +1213,7 @@ void ModuleSanitizerCoverage::createFunctionControlFlow(Function &F) {
         } else {
           auto CalledF = CB->getCalledFunction();
           if (CalledF && !CalledF->isIntrinsic())
-            CFs.push_back(
-                (Constant *)IRB.CreatePointerCast(CalledF, PtrTy));
+            CFs.push_back((Constant *)IRB.CreatePointerCast(CalledF, PtrTy));
         }
       }
     }
@@ -1165,8 +1221,8 @@ void ModuleSanitizerCoverage::createFunctionControlFlow(Function &F) {
     CFs.push_back((Constant *)Constant::getNullValue(PtrTy));
   }
 
-  FunctionCFsArray = CreateFunctionLocalArrayInSection(
-      CFs.size(), F, PtrTy, SanCovCFsSectionName);
+  FunctionCFsArray = CreateFunctionLocalArrayInSection(CFs.size(), F, PtrTy,
+                                                       SanCovCFsSectionName);
   FunctionCFsArray->setInitializer(
       ConstantArray::get(ArrayType::get(PtrTy, CFs.size()), CFs));
   FunctionCFsArray->setConstant(true);
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index ba598d8..1d5d7cc 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -2011,6 +2011,14 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB,
   }
 }
 
+static void remapSourceAtoms(ValueToValueMapTy &VM, BasicBlock::iterator Begin,
+                             BasicBlock::iterator End) {
+  if (VM.AtomMap.empty())
+    return;
+  for (auto It = Begin; It != End; ++It)
+    RemapSourceAtom(&*It, VM);
+}
+
 /// Clone instructions in range [BI, BE) to NewBB.  For PHI nodes, we only clone
 /// arguments that come from PredBB.  Return the map from the variables in the
 /// source basic block to the variables in the newly created basic block.
@@ -2075,6 +2083,8 @@ void JumpThreadingPass::cloneInstructions(ValueToValueMapTy &ValueMapping,
     PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB);
     NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB);
     ValueMapping[PN] = NewPN;
+    if (const DebugLoc &DL = PN->getDebugLoc())
+      mapAtomInstance(DL, ValueMapping);
   }
 
   // Clone noalias scope declarations in the threaded block. When threading a
@@ -2103,6 +2113,8 @@ void JumpThreadingPass::cloneInstructions(ValueToValueMapTy &ValueMapping,
     adaptNoAliasScopes(New, ClonedScopes, Context);
 
     CloneAndRemapDbgInfo(New, &*BI);
+    if (const DebugLoc &DL = New->getDebugLoc())
+      mapAtomInstance(DL, ValueMapping);
 
     if (RetargetDbgValueIfPossible(New))
       continue;
@@ -2330,6 +2342,9 @@ void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
        {DominatorTree::Insert, PredPredBB, NewBB},
        {DominatorTree::Delete, PredPredBB, PredBB}});
 
+  // Remap source location atoms beacuse we're duplicating control flow.
+  remapSourceAtoms(ValueMapping, NewBB->begin(), NewBB->end());
+
   updateSSA(PredBB, NewBB, ValueMapping);
 
   // Clean up things like PHI nodes with single operands, dead instructions,
@@ -2684,6 +2699,9 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
   // PredBB block.  Evaluate PHI nodes in BB.
   ValueToValueMapTy ValueMapping;
 
+  // Remember the position before the inserted instructions.
+  auto RItBeforeInsertPt = std::next(OldPredBranch->getReverseIterator());
+
   BasicBlock::iterator BI = BB->begin();
   for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
     ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
@@ -2703,6 +2721,8 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
 
     // Remap debug variable operands.
     remapDebugVariable(ValueMapping, New);
+    if (const DebugLoc &DL = New->getDebugLoc())
+      mapAtomInstance(DL, ValueMapping);
 
     // If this instruction can be simplified after the operands are updated,
     // just use the simplified value instead.  This frequently happens due to
@@ -2741,6 +2761,10 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
   addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
                                   ValueMapping);
 
+  // KeyInstructions: Remap the cloned instructions' atoms only.
+  remapSourceAtoms(ValueMapping, std::prev(RItBeforeInsertPt)->getIterator(),
+                   OldPredBranch->getIterator());
+
   updateSSA(BB, PredBB, ValueMapping);
 
   // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index cb7a9ef..97f9829 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -439,8 +439,7 @@ static bool LinearizeExprTree(Instruction *I,
     for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands.
       Value *Op = I->getOperand(OpIdx);
       LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
-      assert((!Op->hasUseList() || !Op->use_empty()) &&
-             "No uses, so how did we get to it?!");
+      assert(!Op->use_empty() && "No uses, so how did we get to it?!");
 
       // If this is a binary operation of the right kind with only one use then
       // add its operands to the expression.
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index eaf57e7..2412146 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1827,10 +1827,16 @@ static DebugLoc inlineDebugLoc(DebugLoc OrigDL, DILocation *InlinedAt,
 /// to encode location where these instructions are inlined.
 static void fixupLineNumbers(Function *Fn, Function::iterator FI,
                              Instruction *TheCall, bool CalleeHasDebugInfo) {
-  const DebugLoc &TheCallDL = TheCall->getDebugLoc();
-  if (!TheCallDL)
+  if (!TheCall->getDebugLoc())
     return;
 
+  // Don't propagate the source location atom from the call to inlined nodebug
+  // instructions, and avoid putting it in the InlinedAt field of inlined
+  // not-nodebug instructions. FIXME: Possibly worth transferring/generating
+  // an atom for the returned value, otherwise we miss stepping on inlined
+  // nodebug functions (which is different to existing behaviour).
+  DebugLoc TheCallDL = TheCall->getDebugLoc().get()->getWithoutAtom();
+
   auto &Ctx = Fn->getContext();
   DILocation *InlinedAtNode = TheCallDL;
 
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index c5c4968..73d0740 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -657,6 +657,9 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
       // Otherwise, create a duplicate of the instruction.
       Instruction *C = Inst->clone();
+      if (const DebugLoc &DL = C->getDebugLoc())
+        mapAtomInstance(DL, ValueMap);
+
       C->insertBefore(LoopEntryBranch->getIterator());
 
       ++NumInstrsDuplicated;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 89709e1..0f7e15b 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3590,7 +3590,7 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
     // instructions into EdgeBB.  We know that there will be no uses of the
     // cloned instructions outside of EdgeBB.
     BasicBlock::iterator InsertPt = EdgeBB->getFirstInsertionPt();
-    DenseMap<Value *, Value *> TranslateMap; // Track translated values.
+    ValueToValueMapTy TranslateMap; // Track translated values.
     TranslateMap[Cond] = CB;
 
     // RemoveDIs: track instructions that we optimise away while folding, so
@@ -3610,11 +3610,11 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
         N->setName(BBI->getName() + ".c");
 
       // Update operands due to translation.
-      for (Use &Op : N->operands()) {
-        DenseMap<Value *, Value *>::iterator PI = TranslateMap.find(Op);
-        if (PI != TranslateMap.end())
-          Op = PI->second;
-      }
+      // Key Instructions: Remap all the atom groups.
+      if (const DebugLoc &DL = BBI->getDebugLoc())
+        mapAtomInstance(DL, TranslateMap);
+      RemapInstruction(N, TranslateMap,
+                       RF_IgnoreMissingLocals | RF_NoModuleLevelChanges);
 
       // Check for trivial simplification.
       if (Value *V = simplifyInstruction(N, {DL, nullptr, nullptr, AC})) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 981ff7f..1b06c8b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -249,6 +249,12 @@ public:
         new VPInstruction(Ptr, Offset, GEPNoWrapFlags::inBounds(), DL, Name));
   }
 
+  VPInstruction *createScalarPhi(ArrayRef<VPValue *> IncomingValues,
+                                 DebugLoc DL, const Twine &Name = "") {
+    return tryInsertInstruction(
+        new VPInstruction(Instruction::PHI, IncomingValues, DL, Name));
+  }
+
   /// Convert the input value \p Current to the corresponding value of an
   /// induction with \p Start and \p Step values, using \p Start + \p Current *
   /// \p Step.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7093d37..eba8b16 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2093,17 +2093,16 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
     // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
     VPValue *MaxEVL = &Plan.getVF();
     // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
+    VPBuilder Builder(LoopRegion->getPreheaderVPBB());
     if (unsigned VFSize =
             TypeInfo.inferScalarType(MaxEVL)->getScalarSizeInBits();
         VFSize != 32) {
-      VPBuilder Builder(LoopRegion->getPreheaderVPBB());
       MaxEVL = Builder.createScalarCast(
           VFSize > 32 ? Instruction::Trunc : Instruction::ZExt, MaxEVL,
           Type::getInt32Ty(Ctx), DebugLoc());
     }
-    PrevEVL = new VPInstruction(Instruction::PHI, {MaxEVL, &EVL}, DebugLoc(),
-                                "prev.evl");
-    PrevEVL->insertBefore(*Header, Header->getFirstNonPhi());
+    Builder.setInsertPoint(Header, Header->getFirstNonPhi());
+    PrevEVL = Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
   }
 
   for (VPUser *U : to_vector(Plan.getVF().users())) {
@@ -2433,10 +2432,10 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
         auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
         StringRef Name =
             isa<VPCanonicalIVPHIRecipe>(PhiR) ? "index" : "evl.based.iv";
-        auto *ScalarR = new VPInstruction(
-            Instruction::PHI, {PhiR->getStartValue(), PhiR->getBackedgeValue()},
+        VPBuilder Builder(PhiR);
+        auto *ScalarR = Builder.createScalarPhi(
+            {PhiR->getStartValue(), PhiR->getBackedgeValue()},
             PhiR->getDebugLoc(), Name);
-        ScalarR->insertBefore(PhiR);
         PhiR->replaceAllUsesWith(ScalarR);
         ToRemove.push_back(PhiR);
         continue;
diff --git a/llvm/test/Analysis/CostModel/ARM/memcpy.ll b/llvm/test/Analysis/CostModel/ARM/memcpy.ll
index 1a66b2b..f397397 100644
--- a/llvm/test/Analysis/CostModel/ARM/memcpy.ll
+++ b/llvm/test/Analysis/CostModel/ARM/memcpy.ll
@@ -11,6 +11,22 @@ target triple = "thumbv7m-arm-unknown-eabi"
 ; Align 1, 1
 ;;;;;;;;;;;;
 
+define void @memcpy_0(ptr %d, ptr %s) {
+;
+; with/without strict-align:
+;
+; ldrb r1, [r1]
+; strb r1, [r0]
+;
+; COMMON-LABEL: 'memcpy_0'
+; COMMON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr %d, ptr %s, i32 1, i1 false)
+; COMMON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+entry:
+  call void @llvm.memcpy.p0.p0.i32(ptr %d, ptr %s, i32 1, i1 false)
+  ret void
+}
+
 define void @memcpy_1(ptr %d, ptr %s) {
 ;
 ; with/without strict-align:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll
index 8c7df4b..0d1b082 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll
@@ -106,13 +106,12 @@ exit:
   ret void
 }
 
-define void @unknown_dep_not_known_safe_due_to_backedge_taken_count(ptr %A) {
-; CHECK-LABEL: 'unknown_dep_not_known_safe_due_to_backedge_taken_count'
+define void @backward_dep_known_distance_less_than_btc(ptr %A) {
+; CHECK-LABEL: 'backward_dep_known_distance_less_than_btc'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 8160 bits
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        BackwardVectorizable:
 ; CHECK-NEXT:            %l = load i32, ptr %gep, align 4 ->
 ; CHECK-NEXT:            store i32 %add, ptr %gep.mul.2, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
new file mode 100644
index 0000000..c5f31de
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<access-info>" %s 2>&1 | FileCheck %s
+
+@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
+
+; Generated from the following C code:
+; #define LEN 256 * 256
+; float a[LEN];
+;
+; void different_strides() {
+;   for (int i = 0; i < LEN - 1024 - 255; i++) {
+;   #pragma clang loop interleave(disable)
+;   #pragma clang loop unroll(disable)
+;     for (int j = 0; j < 256; j++)
+;       a[i + j + 1024] += a[j * 4 + i];
+;   }
+; }
+; The load and store have different strides(4 and 16 bytes respectively) but the store
+; is always at safe positive distance away from the load, thus BackwardVectorizable
+define void @different_strides_backward_vectorizable() {
+; CHECK-LABEL: 'different_strides_backward_vectorizable'
+; CHECK-NEXT:    inner.body:
+; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 2048 bits
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        BackwardVectorizable:
+; CHECK-NEXT:            %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:        Forward:
+; CHECK-NEXT:            %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:    outer.header:
+; CHECK-NEXT:      Report: loop is not the innermost loop
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
+  %0 = add nuw nsw i64 %i, 1024
+  br label %inner.body
+
+inner.body:
+  %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+  %1 = shl nuw nsw i64 %j, 2
+  %2 = add nuw nsw i64 %1, %i
+  %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+  %3 = load float, ptr %arrayidx, align 4
+  %4 = add nuw nsw i64 %0, %j
+  %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+  %5 = load float, ptr %arrayidx8, align 4
+  %add9 = fadd fast float %5, %3
+  store float %add9, ptr %arrayidx8, align 4
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond.not = icmp eq i64 %j.next, 256
+  br i1 %exitcond.not, label %outer.latch, label %inner.body
+
+outer.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %outerexitcond.not = icmp eq i64 %i.next, 64257
+  br i1 %outerexitcond.not, label %exit, label %outer.header
+
+exit:
+  ret void
+}
+
+
+; Generated from following C code:
+; void different_stride_and_not_vectorizable(){
+;    for(int i = 0; i < LEN2; i++){
+;        for(int j = 0 ; j < LEN; j++){
+;            a[i + j + LEN] += a[i + 4*j];
+;        }
+;    }
+; }
+; The load and store have different strides, but the store and load are not at a
+; safe distance away from each other, thus not safe for vectorization.
+define void @different_stride_and_not_vectorizable() {
+; CHECK-LABEL: 'different_stride_and_not_vectorizable'
+; CHECK-NEXT:    inner.body:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:        Forward:
+; CHECK-NEXT:            %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:    outer.header:
+; CHECK-NEXT:      Report: loop is not the innermost loop
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
+  %0 = add nuw nsw i64 %i, 256
+  br label %inner.body
+
+inner.body:
+  %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+  %1 = shl nuw nsw i64 %j, 2
+  %2 = add nuw nsw i64 %1, %i
+  %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+  %3 = load float, ptr %arrayidx, align 4
+  %4 = add nuw nsw i64 %0, %j
+  %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+  %5 = load float, ptr %arrayidx8, align 4
+  %add9 = fadd fast float %5, %3
+  store float %add9, ptr %arrayidx8, align 4
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond.not = icmp eq i64 %j.next, 256
+  br i1 %exitcond.not, label %outer.latch, label %inner.body
+
+outer.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond29.not = icmp eq i64 %i.next, 65536
+  br i1 %exitcond29.not, label %exit, label %outer.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll
index 416742a..d263749 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll
@@ -45,10 +45,9 @@ exit:
 define void @different_non_constant_strides_known_backward_distance_larger_than_trip_count(ptr %A) {
 ; CHECK-LABEL: 'different_non_constant_strides_known_backward_distance_larger_than_trip_count'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 4096 bits
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        BackwardVectorizable:
 ; CHECK-NEXT:            %l = load i32, ptr %gep, align 4 ->
 ; CHECK-NEXT:            store i32 %add, ptr %gep.mul.2, align 4
 ; CHECK-EMPTY:
@@ -83,10 +82,9 @@ exit:
 define void @different_non_constant_strides_known_backward_min_distance_16(ptr %A) {
 ; CHECK-LABEL: 'different_non_constant_strides_known_backward_min_distance_16'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 64 bits
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        BackwardVectorizable:
 ; CHECK-NEXT:            %l = load i32, ptr %gep, align 4 ->
 ; CHECK-NEXT:            store i32 %add, ptr %gep.mul.2, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/MemorySSA/nondeterminism.ll b/llvm/test/Analysis/MemorySSA/nondeterminism.ll
index 11b9703..90902e3 100644
--- a/llvm/test/Analysis/MemorySSA/nondeterminism.ll
+++ b/llvm/test/Analysis/MemorySSA/nondeterminism.ll
@@ -1,6 +1,7 @@
 ; RUN: opt -passes=simplifycfg -S --preserve-ll-uselistorder %s | FileCheck %s
 ; REQUIRES: x86-registered-target
 ; CHECK-LABEL: @n
+; CHECK: uselistorder i16 0, { 3, 2, 4, 1, 5, 0, 6 }
 
 ; Note: test was added in an effort to ensure determinism when updating memoryssa. See PR42574.
 ; If the uselistorder check becomes no longer relevant, the test can be disabled or removed.
diff --git a/llvm/test/Bindings/llvm-c/atomics.ll b/llvm/test/Bindings/llvm-c/atomics.ll
index 588bd24..c9f50cc 100644
--- a/llvm/test/Bindings/llvm-c/atomics.ll
+++ b/llvm/test/Bindings/llvm-c/atomics.ll
@@ -54,6 +54,8 @@ define void @atomic_rmw_ops(ptr %p, i32 %i, float %f) {
   %a.fsub      = atomicrmw fsub      ptr %p, float %f acq_rel, align 8
   %a.fmax      = atomicrmw fmax      ptr %p, float %f acq_rel, align 8
   %a.fmin      = atomicrmw fmin      ptr %p, float %f acq_rel, align 8
+  %a.fmaximum  = atomicrmw fmaximum  ptr %p, float %f acq_rel, align 8
+  %a.fminimum  = atomicrmw fminimum  ptr %p, float %f acq_rel, align 8
 
   %a.uinc_wrap = atomicrmw uinc_wrap ptr %p, i32 %i acq_rel, align 8
   %a.udec_wrap = atomicrmw udec_wrap ptr %p, i32 %i acq_rel, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 493e8ce..f81d7f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -17,8 +17,7 @@ define i8 @v_ashr_i8(i8 %value, i8 %amount) {
 ; GFX8-LABEL: v_ashr_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i8:
@@ -49,8 +48,8 @@ define i8 @v_ashr_i8_7(i8 %value) {
 ; GFX8-LABEL: v_ashr_i8_7:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 7
+; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i8_7:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir
index a9fe80e..2b911b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-abs.mir
@@ -144,11 +144,9 @@ body: |
     ; VI: liveins: $vgpr0
     ; VI-NEXT: {{  $}}
     ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-    ; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-    ; VI-NEXT: [[ABS:%[0-9]+]]:_(s16) = G_ABS [[ASHR]]
+    ; VI-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+    ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; VI-NEXT: [[ABS:%[0-9]+]]:_(s16) = G_ABS [[TRUNC]]
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ABS]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
     ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
index f4aaab7..53905a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir
@@ -319,12 +319,10 @@ body: |
     ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
     ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
-    ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C1]](s16)
-    ; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C1]](s16)
-    ; VI-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[ASHR]], [[AND]](s16)
-    ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR1]](s16)
+    ; VI-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+    ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[TRUNC1]], [[AND]](s16)
+    ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
     ;
     ; GFX9PLUS-LABEL: name: test_ashr_i8_i8
@@ -374,12 +372,10 @@ body: |
     ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 127
     ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
-    ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 9
-    ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C1]](s16)
-    ; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C1]](s16)
-    ; VI-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[ASHR]], [[AND]](s16)
-    ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR1]](s16)
+    ; VI-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 7
+    ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[TRUNC1]], [[AND]](s16)
+    ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
     ;
     ; GFX9PLUS-LABEL: name: test_ashr_s7_s7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir
index 40c48e1..7a18843 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir
@@ -419,30 +419,26 @@ body: |
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 1
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
-    ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16)
+    ; GFX9-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
     ;
     ; GFX8-LABEL: name: test_sext_inreg_s16_1
     ; GFX8: liveins: $vgpr0
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GFX8-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
-    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-    ; GFX8-NEXT: S_ENDPGM 0, implicit [[ASHR]](s16)
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 1
+    ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
     ;
     ; GFX6-LABEL: name: test_sext_inreg_s16_1
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 1
-    ; GFX6-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
-    ; GFX6-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16)
+    ; GFX6-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s16) = G_TRUNC %0
     %2:_(s16) = G_SEXT_INREG %1, 1
-    S_ENDPGM 0, implicit %2
+    %3:_(s32) = G_ANYEXT %2
+    $vgpr0 = COPY %3
 
 ...
 
@@ -457,30 +453,94 @@ body: |
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 15
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
-    ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16)
+    ; GFX9-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
     ;
     ; GFX8-LABEL: name: test_sext_inreg_s16_15
     ; GFX8: liveins: $vgpr0
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GFX8-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
-    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-    ; GFX8-NEXT: S_ENDPGM 0, implicit [[ASHR]](s16)
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 15
+    ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
     ;
     ; GFX6-LABEL: name: test_sext_inreg_s16_15
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX6-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 15
-    ; GFX6-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
-    ; GFX6-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16)
+    ; GFX6-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s16) = G_TRUNC %0
     %2:_(s16) = G_SEXT_INREG %1, 15
-    S_ENDPGM 0, implicit %2
+    %3:_(s32) = G_ANYEXT %2
+    $vgpr0 = COPY %3
+
+...
+
+---
+name: test_sext_inreg_s8_1
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GFX9-LABEL: name: test_sext_inreg_s8_1
+    ; GFX9: liveins: $vgpr0
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 1
+    ; GFX9-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
+    ;
+    ; GFX8-LABEL: name: test_sext_inreg_s8_1
+    ; GFX8: liveins: $vgpr0
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 1
+    ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
+    ;
+    ; GFX6-LABEL: name: test_sext_inreg_s8_1
+    ; GFX6: liveins: $vgpr0
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX6-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 1
+    ; GFX6-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s8) = G_TRUNC %0
+    %2:_(s8) = G_SEXT_INREG %1, 1
+    %3:_(s32) = G_ANYEXT %2
+    $vgpr0 = COPY %3
+
+...
+
+---
+name: test_sext_inreg_s16_7
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GFX9-LABEL: name: test_sext_inreg_s16_7
+    ; GFX9: liveins: $vgpr0
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 7
+    ; GFX9-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
+    ;
+    ; GFX8-LABEL: name: test_sext_inreg_s16_7
+    ; GFX8: liveins: $vgpr0
+    ; GFX8-NEXT: {{  $}}
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 7
+    ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
+    ;
+    ; GFX6-LABEL: name: test_sext_inreg_s16_7
+    ; GFX6: liveins: $vgpr0
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX6-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 7
+    ; GFX6-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s8) = G_TRUNC %0
+    %2:_(s8) = G_SEXT_INREG %1, 7
+    %3:_(s32) = G_ANYEXT %2
+    $vgpr0 = COPY %3
 
 ...
 
@@ -821,19 +881,15 @@ body: |
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
     ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
-    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SHL1]], [[C1]](s16)
-    ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR]](s16)
-    ; GFX8-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR1]](s16)
-    ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
-    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]]
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST]], 1
+    ; GFX8-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 1
+    ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG]], [[C1]]
+    ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG1]], [[C1]]
+    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
     ;
@@ -907,38 +963,31 @@ body: |
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
     ; GFX8-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>)
     ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
     ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX8-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
-    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SHL1]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR2:%[0-9]+]]:_(s16) = G_ASHR [[SHL2]], [[C1]](s16)
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST]], 1
+    ; GFX8-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 1
+    ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST1]], 1
     ; GFX8-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX8-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
     ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
     ; GFX8-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
-    ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR]](s16)
-    ; GFX8-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR1]](s16)
-    ; GFX8-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
-    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG]], [[C1]]
+    ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG1]], [[C1]]
+    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX8-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR2]](s16)
-    ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C2]]
-    ; GFX8-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s32)
-    ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL4]]
+    ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG2]], [[C1]]
+    ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C1]]
+    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32)
+    ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
     ; GFX8-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
-    ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C2]]
-    ; GFX8-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
-    ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL5]]
+    ; GFX8-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C1]]
+    ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C]](s32)
+    ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL2]]
     ; GFX8-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
     ; GFX8-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[BITCAST6]](<2 x s16>)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
@@ -1101,32 +1150,24 @@ body: |
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
     ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
     ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX8-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
     ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX8-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
-    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SHL1]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR2:%[0-9]+]]:_(s16) = G_ASHR [[SHL2]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR3:%[0-9]+]]:_(s16) = G_ASHR [[SHL3]], [[C1]](s16)
-    ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR]](s16)
-    ; GFX8-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR1]](s16)
-    ; GFX8-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
-    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST]], 1
+    ; GFX8-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 1
+    ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST1]], 1
+    ; GFX8-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR1]], 1
+    ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG]], [[C1]]
+    ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG1]], [[C1]]
+    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX8-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR2]](s16)
-    ; GFX8-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR3]](s16)
-    ; GFX8-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32)
-    ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]]
+    ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG2]], [[C1]]
+    ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG3]], [[C1]]
+    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32)
+    ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
     ; GFX8-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; GFX8-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
@@ -1188,45 +1229,33 @@ body: |
     ; GFX8: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF
     ; GFX8-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>)
     ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
     ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX8-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
     ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX8-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
     ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
-    ; GFX8-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
     ; GFX8-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
-    ; GFX8-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
-    ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
-    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SHL1]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR2:%[0-9]+]]:_(s16) = G_ASHR [[SHL2]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR3:%[0-9]+]]:_(s16) = G_ASHR [[SHL3]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR4:%[0-9]+]]:_(s16) = G_ASHR [[SHL4]], [[C1]](s16)
-    ; GFX8-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C1]](s16)
-    ; GFX8-NEXT: [[ASHR5:%[0-9]+]]:_(s16) = G_ASHR [[SHL5]], [[C1]](s16)
-    ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR]](s16)
-    ; GFX8-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR1]](s16)
-    ; GFX8-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
-    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]]
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST]], 1
+    ; GFX8-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 1
+    ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST1]], 1
+    ; GFX8-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR1]], 1
+    ; GFX8-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST2]], 1
+    ; GFX8-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR2]], 1
+    ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG]], [[C1]]
+    ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG1]], [[C1]]
+    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX8-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX8-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR2]](s16)
-    ; GFX8-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR3]](s16)
-    ; GFX8-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32)
-    ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]]
+    ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG2]], [[C1]]
+    ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG3]], [[C1]]
+    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32)
+    ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
     ; GFX8-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
-    ; GFX8-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR4]](s16)
-    ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ASHR5]](s16)
-    ; GFX8-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C]](s32)
-    ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]]
+    ; GFX8-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG4]], [[C1]]
+    ; GFX8-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[SEXT_INREG5]], [[C1]]
+    ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32)
+    ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]]
     ; GFX8-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
     ; GFX8-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; GFX8-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext.mir
index 847ffc8..a813286 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext.mir
@@ -240,11 +240,9 @@ body: |
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[ASHR]](s16)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16)
     %0:_(s32) = COPY $vgpr0
     %1:_(s8) = G_TRUNC %0
     %2:_(s16) = G_SEXT %1
@@ -653,66 +651,65 @@ body: |
     ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
     ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
     ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16)
-    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C3]](s16)
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 7
-    ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[ASHR]], [[C4]](s16)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR2]], 8
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 7
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[TRUNC]], [[C3]](s16)
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C5]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
     ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C5]]
-    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16)
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]]
     ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C5]]
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
     ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
-    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C5]]
-    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C3]](s16)
-    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
-    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C3]](s16)
-    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]]
-    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C3]](s16)
-    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]]
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C4]]
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]]
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C5]](s16)
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]]
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C5]](s16)
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]]
     ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
     ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
-    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL5]]
+    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
     ; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
     ; CHECK-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C1]](s32)
-    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL6]]
+    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C1]](s32)
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]]
     ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32)
-    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C3]](s16)
-    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL7]]
-    ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C3]](s16)
-    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL8]]
-    ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C3]](s16)
-    ; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]]
-    ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[AND15:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C5]]
-    ; CHECK-NEXT: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C3]](s16)
-    ; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]]
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C5]](s16)
+    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]]
+    ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C5]](s16)
+    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]]
+    ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C5]](s16)
+    ; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]]
+    ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[AND15:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C4]]
+    ; CHECK-NEXT: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C5]](s16)
+    ; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]]
     ; CHECK-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
     ; CHECK-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
-    ; CHECK-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C1]](s32)
-    ; CHECK-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL11]]
+    ; CHECK-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C1]](s32)
+    ; CHECK-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]]
     ; CHECK-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16)
     ; CHECK-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
-    ; CHECK-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C1]](s32)
-    ; CHECK-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL12]]
+    ; CHECK-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C1]](s32)
+    ; CHECK-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]]
     ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32)
     ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s704) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64), [[MV1]](s64), [[MV1]](s64), [[MV1]](s64), [[MV1]](s64), [[MV1]](s64), [[MV1]](s64), [[MV1]](s64), [[MV1]](s64), [[MV1]](s64)
     ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s88) = G_TRUNC [[MV2]](s704)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
index 34daf8e..52b002f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
@@ -18,6 +18,7 @@ body: |
     ; SI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; SI-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[COPY1]]
     ; SI-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+    ;
     ; VI-LABEL: name: test_smax_s32
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -25,6 +26,7 @@ body: |
     ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; VI-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[COPY1]]
     ; VI-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+    ;
     ; GFX9-LABEL: name: test_smax_s32
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -52,6 +54,7 @@ body: |
     ; SI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]]
     ; SI-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+    ;
     ; VI-LABEL: name: test_smax_s64
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -60,6 +63,7 @@ body: |
     ; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]]
     ; VI-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+    ;
     ; GFX9-LABEL: name: test_smax_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -89,6 +93,7 @@ body: |
     ; SI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16
     ; SI-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
     ; SI-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+    ;
     ; VI-LABEL: name: test_smax_s16
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -99,6 +104,7 @@ body: |
     ; VI-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-LABEL: name: test_smax_s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -133,21 +139,20 @@ body: |
     ; SI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
     ; SI-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
     ; SI-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+    ;
     ; VI-LABEL: name: test_smax_s8
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
     ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-    ; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-    ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; VI-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SHL1]], [[C]](s16)
-    ; VI-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[ASHR]], [[ASHR1]]
+    ; VI-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+    ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; VI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
+    ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32)
+    ; VI-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-LABEL: name: test_smax_s8
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -184,6 +189,7 @@ body: |
     ; SI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
     ; SI-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
     ; SI-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+    ;
     ; VI-LABEL: name: test_smax_s17
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -193,6 +199,7 @@ body: |
     ; VI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
     ; VI-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
     ; VI-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+    ;
     ; GFX9-LABEL: name: test_smax_s17
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -228,6 +235,7 @@ body: |
     ; SI-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV3]]
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; VI-LABEL: name: test_smax_v2s32
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -239,6 +247,7 @@ body: |
     ; VI-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV3]]
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; GFX9-LABEL: name: test_smax_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -274,6 +283,7 @@ body: |
     ; SI-NEXT: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV2]], [[UV5]]
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32), [[SMAX2]](s32)
     ; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; VI-LABEL: name: test_smax_v3s32
     ; VI: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; VI-NEXT: {{  $}}
@@ -286,6 +296,7 @@ body: |
     ; VI-NEXT: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV2]], [[UV5]]
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32), [[SMAX2]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; GFX9-LABEL: name: test_smax_v3s32
     ; GFX9: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
@@ -333,6 +344,7 @@ body: |
     ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; SI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ;
     ; VI-LABEL: name: test_smax_v2s16
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -355,6 +367,7 @@ body: |
     ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
     ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ;
     ; GFX9-LABEL: name: test_smax_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -399,6 +412,7 @@ body: |
     ; SI-NEXT: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG4]], [[SEXT_INREG5]]
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32), [[SMAX2]](s32)
     ; SI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; VI-LABEL: name: test_smax_v3s16
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -427,6 +441,7 @@ body: |
     ; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX2]](s16)
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
     ; VI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; GFX9-LABEL: name: test_smax_v3s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -500,6 +515,7 @@ body: |
     ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ;
     ; VI-LABEL: name: test_smax_v4s16
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -540,6 +556,7 @@ body: |
     ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ;
     ; GFX9-LABEL: name: test_smax_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
index 90bb012..a59b608 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
@@ -18,6 +18,7 @@ body: |
     ; SI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; SI-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[COPY1]]
     ; SI-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+    ;
     ; VI-LABEL: name: test_smin_s32
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -25,6 +26,7 @@ body: |
     ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; VI-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[COPY1]]
     ; VI-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+    ;
     ; GFX9-LABEL: name: test_smin_s32
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -52,6 +54,7 @@ body: |
     ; SI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]]
     ; SI-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+    ;
     ; VI-LABEL: name: test_smin_s64
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -60,6 +63,7 @@ body: |
     ; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]]
     ; VI-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+    ;
     ; GFX9-LABEL: name: test_smin_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -89,6 +93,7 @@ body: |
     ; SI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16
     ; SI-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
     ; SI-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+    ;
     ; VI-LABEL: name: test_smin_s16
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -99,6 +104,7 @@ body: |
     ; VI-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-LABEL: name: test_smin_s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -133,21 +139,20 @@ body: |
     ; SI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
     ; SI-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
     ; SI-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+    ;
     ; VI-LABEL: name: test_smin_s8
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
     ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-    ; VI-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-    ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; VI-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SHL1]], [[C]](s16)
-    ; VI-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[ASHR]], [[ASHR1]]
+    ; VI-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+    ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; VI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
+    ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32)
+    ; VI-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-LABEL: name: test_smin_s8
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -184,6 +189,7 @@ body: |
     ; SI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
     ; SI-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
     ; SI-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+    ;
     ; VI-LABEL: name: test_smin_s17
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -193,6 +199,7 @@ body: |
     ; VI-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
     ; VI-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
     ; VI-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+    ;
     ; GFX9-LABEL: name: test_smin_s17
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -228,6 +235,7 @@ body: |
     ; SI-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV3]]
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; VI-LABEL: name: test_smin_v2s32
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -239,6 +247,7 @@ body: |
     ; VI-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV3]]
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; GFX9-LABEL: name: test_smin_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -274,6 +283,7 @@ body: |
     ; SI-NEXT: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV2]], [[UV5]]
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32), [[SMIN2]](s32)
     ; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; VI-LABEL: name: test_smin_v3s32
     ; VI: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; VI-NEXT: {{  $}}
@@ -286,6 +296,7 @@ body: |
     ; VI-NEXT: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV2]], [[UV5]]
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32), [[SMIN2]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; GFX9-LABEL: name: test_smin_v3s32
     ; GFX9: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
@@ -333,6 +344,7 @@ body: |
     ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; SI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ;
     ; VI-LABEL: name: test_smin_v2s16
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -355,6 +367,7 @@ body: |
     ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
     ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ;
     ; GFX9-LABEL: name: test_smin_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -399,6 +412,7 @@ body: |
     ; SI-NEXT: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG4]], [[SEXT_INREG5]]
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32), [[SMIN2]](s32)
     ; SI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; VI-LABEL: name: test_smin_v3s16
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -427,6 +441,7 @@ body: |
     ; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN2]](s16)
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
     ; VI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; GFX9-LABEL: name: test_smin_v3s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -500,6 +515,7 @@ body: |
     ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ;
     ; VI-LABEL: name: test_smin_v4s16
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -540,6 +556,7 @@ body: |
     ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ;
     ; GFX9-LABEL: name: test_smin_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir
index 51fffb7..cd69104 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir
@@ -115,18 +115,16 @@ body: |
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
+    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32)
+    ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]]
     ; GFX8-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SHL1]], [[C]](s16)
-    ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[ASHR]], [[ASHR1]]
-    ; GFX8-NEXT: [[ASHR2:%[0-9]+]]:_(s16) = G_ASHR [[MUL]], [[C]](s16)
-    ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR2]](s16)
-    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ANYEXT]], 8
-    ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32)
+    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[MUL]], [[C]](s16)
+    ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
+    ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ANYEXT]], 8
+    ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG2]](s32)
     ;
     ; GFX9-LABEL: name: test_smulh_s8
     ; GFX9: liveins: $vgpr0, $vgpr1
@@ -227,28 +225,24 @@ body: |
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 8
+    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32)
+    ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]]
     ; GFX8-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
-    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C]](s16)
-    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
-    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SHL1]], [[C]](s16)
-    ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[ASHR]], [[ASHR1]]
-    ; GFX8-NEXT: [[ASHR2:%[0-9]+]]:_(s16) = G_ASHR [[MUL]], [[C]](s16)
-    ; GFX8-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C]](s16)
-    ; GFX8-NEXT: [[ASHR3:%[0-9]+]]:_(s16) = G_ASHR [[SHL2]], [[C]](s16)
-    ; GFX8-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
-    ; GFX8-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C]](s16)
-    ; GFX8-NEXT: [[ASHR4:%[0-9]+]]:_(s16) = G_ASHR [[SHL3]], [[C]](s16)
-    ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s16) = G_MUL [[ASHR3]], [[ASHR4]]
-    ; GFX8-NEXT: [[ASHR5:%[0-9]+]]:_(s16) = G_ASHR [[MUL1]], [[C]](s16)
+    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[MUL]], [[C]](s16)
+    ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
+    ; GFX8-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG2]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 8
+    ; GFX8-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG3]](s32)
+    ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s16) = G_MUL [[TRUNC2]], [[TRUNC3]]
+    ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[MUL1]], [[C]](s16)
     ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[ASHR2]], [[C1]]
-    ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[ASHR5]], [[C1]]
-    ; GFX8-NEXT: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C]](s16)
-    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL4]]
+    ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[ASHR]], [[C1]]
+    ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[ASHR1]], [[C1]]
+    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C]](s16)
+    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]]
     ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
     ; GFX8-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
     ;
@@ -322,54 +316,46 @@ body: |
     ; GFX8-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32)
     ; GFX8-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C1]](s32)
     ; GFX8-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32)
-    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+    ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
+    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32)
+    ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[TRUNC]], [[TRUNC1]]
     ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16)
-    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SHL]], [[C3]](s16)
-    ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16)
-    ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SHL1]], [[C3]](s16)
-    ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[ASHR]], [[ASHR1]]
-    ; GFX8-NEXT: [[ASHR2:%[0-9]+]]:_(s16) = G_ASHR [[MUL]], [[C3]](s16)
-    ; GFX8-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16)
-    ; GFX8-NEXT: [[ASHR3:%[0-9]+]]:_(s16) = G_ASHR [[SHL2]], [[C3]](s16)
-    ; GFX8-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
-    ; GFX8-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16)
-    ; GFX8-NEXT: [[ASHR4:%[0-9]+]]:_(s16) = G_ASHR [[SHL3]], [[C3]](s16)
-    ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s16) = G_MUL [[ASHR3]], [[ASHR4]]
-    ; GFX8-NEXT: [[ASHR5:%[0-9]+]]:_(s16) = G_ASHR [[MUL1]], [[C3]](s16)
-    ; GFX8-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX8-NEXT: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C3]](s16)
-    ; GFX8-NEXT: [[ASHR6:%[0-9]+]]:_(s16) = G_ASHR [[SHL4]], [[C3]](s16)
-    ; GFX8-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
-    ; GFX8-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C3]](s16)
-    ; GFX8-NEXT: [[ASHR7:%[0-9]+]]:_(s16) = G_ASHR [[SHL5]], [[C3]](s16)
-    ; GFX8-NEXT: [[MUL2:%[0-9]+]]:_(s16) = G_MUL [[ASHR6]], [[ASHR7]]
-    ; GFX8-NEXT: [[ASHR8:%[0-9]+]]:_(s16) = G_ASHR [[MUL2]], [[C3]](s16)
-    ; GFX8-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
-    ; GFX8-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[TRUNC6]], [[C3]](s16)
-    ; GFX8-NEXT: [[ASHR9:%[0-9]+]]:_(s16) = G_ASHR [[SHL6]], [[C3]](s16)
-    ; GFX8-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
-    ; GFX8-NEXT: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[TRUNC7]], [[C3]](s16)
-    ; GFX8-NEXT: [[ASHR10:%[0-9]+]]:_(s16) = G_ASHR [[SHL7]], [[C3]](s16)
-    ; GFX8-NEXT: [[MUL3:%[0-9]+]]:_(s16) = G_MUL [[ASHR9]], [[ASHR10]]
-    ; GFX8-NEXT: [[ASHR11:%[0-9]+]]:_(s16) = G_ASHR [[MUL3]], [[C3]](s16)
-    ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR2]](s16)
+    ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[MUL]], [[C3]](s16)
+    ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 8
+    ; GFX8-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG2]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR3]], 8
+    ; GFX8-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG3]](s32)
+    ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s16) = G_MUL [[TRUNC2]], [[TRUNC3]]
+    ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[MUL1]], [[C3]](s16)
+    ; GFX8-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR1]], 8
+    ; GFX8-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG4]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR4]], 8
+    ; GFX8-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG5]](s32)
+    ; GFX8-NEXT: [[MUL2:%[0-9]+]]:_(s16) = G_MUL [[TRUNC4]], [[TRUNC5]]
+    ; GFX8-NEXT: [[ASHR2:%[0-9]+]]:_(s16) = G_ASHR [[MUL2]], [[C3]](s16)
+    ; GFX8-NEXT: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR2]], 8
+    ; GFX8-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG6]](s32)
+    ; GFX8-NEXT: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR5]], 8
+    ; GFX8-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG7]](s32)
+    ; GFX8-NEXT: [[MUL3:%[0-9]+]]:_(s16) = G_MUL [[TRUNC6]], [[TRUNC7]]
+    ; GFX8-NEXT: [[ASHR3:%[0-9]+]]:_(s16) = G_ASHR [[MUL3]], [[C3]](s16)
+    ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
     ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
     ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C4]]
-    ; GFX8-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR5]](s16)
+    ; GFX8-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR1]](s16)
     ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C4]]
-    ; GFX8-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
-    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL8]]
-    ; GFX8-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR8]](s16)
+    ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; GFX8-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR2]](s16)
     ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C4]]
-    ; GFX8-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
-    ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL9]]
-    ; GFX8-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR11]](s16)
+    ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; GFX8-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR3]](s16)
     ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C4]]
-    ; GFX8-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
-    ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL10]]
+    ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
     ; GFX8-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
     ; GFX9-LABEL: name: test_smulh_v4s8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 18a222e..7ec27f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -197,35 +197,13 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
 }
 
 define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
-; GFX6-LABEL: abs_sgpr_v2i8:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sext_i32_i8 s0, s0
-; GFX6-NEXT:    s_sext_i32_i8 s1, s1
-; GFX6-NEXT:    s_abs_i32 s0, s0
-; GFX6-NEXT:    s_abs_i32 s1, s1
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: abs_sgpr_v2i8:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_abs_i32 s0, s0
-; GFX8-NEXT:    s_abs_i32 s1, s1
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: abs_sgpr_v2i8:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10-NEXT:    s_sext_i32_i8 s1, s1
-; GFX10-NEXT:    s_abs_i32 s0, s0
-; GFX10-NEXT:    s_abs_i32 s1, s1
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX-LABEL: abs_sgpr_v2i8:
+; GFX:       ; %bb.0:
+; GFX-NEXT:    s_sext_i32_i8 s0, s0
+; GFX-NEXT:    s_sext_i32_i8 s1, s1
+; GFX-NEXT:    s_abs_i32 s0, s0
+; GFX-NEXT:    s_abs_i32 s1, s1
+; GFX-NEXT:    ; return to shader part epilog
   %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
   ret <2 x i8> %res
 }
@@ -245,13 +223,11 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
 ;
 ; GFX8-LABEL: abs_vgpr_v2i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -272,44 +248,15 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
 }
 
 define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
-; GFX6-LABEL: abs_sgpr_v3i8:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sext_i32_i8 s0, s0
-; GFX6-NEXT:    s_sext_i32_i8 s1, s1
-; GFX6-NEXT:    s_sext_i32_i8 s2, s2
-; GFX6-NEXT:    s_abs_i32 s0, s0
-; GFX6-NEXT:    s_abs_i32 s1, s1
-; GFX6-NEXT:    s_abs_i32 s2, s2
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: abs_sgpr_v3i8:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
-; GFX8-NEXT:    s_ashr_i32 s2, s2, 8
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_abs_i32 s0, s0
-; GFX8-NEXT:    s_abs_i32 s1, s1
-; GFX8-NEXT:    s_abs_i32 s2, s2
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: abs_sgpr_v3i8:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10-NEXT:    s_sext_i32_i8 s1, s1
-; GFX10-NEXT:    s_sext_i32_i8 s2, s2
-; GFX10-NEXT:    s_abs_i32 s0, s0
-; GFX10-NEXT:    s_abs_i32 s1, s1
-; GFX10-NEXT:    s_abs_i32 s2, s2
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX-LABEL: abs_sgpr_v3i8:
+; GFX:       ; %bb.0:
+; GFX-NEXT:    s_sext_i32_i8 s0, s0
+; GFX-NEXT:    s_sext_i32_i8 s1, s1
+; GFX-NEXT:    s_sext_i32_i8 s2, s2
+; GFX-NEXT:    s_abs_i32 s0, s0
+; GFX-NEXT:    s_abs_i32 s1, s1
+; GFX-NEXT:    s_abs_i32 s2, s2
+; GFX-NEXT:    ; return to shader part epilog
   %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
   ret <3 x i8> %res
 }
@@ -333,16 +280,13 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8>  %arg) {
 ;
 ; GFX8-LABEL: abs_vgpr_v3i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-NEXT:    v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_sub_u16_sdwa v4, v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_sdwa v4, v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/selected-inst-flags.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/selected-inst-flags.mir
new file mode 100644
index 0000000..c87284f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/selected-inst-flags.mir
@@ -0,0 +1,28 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-- -run-pass=instruction-select -o - %s | FileCheck %s
+
+# Checks MI Flags are preserved on selected instructions.
+
+---
+name:            s_or_i32_disjoint
+tracksRegLiveness: true
+regBankSelected: true
+legalized: true
+body:             |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: s_or_i32_disjoint
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+    ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY]], [[COPY1]], implicit-def dead $scc
+    ; CHECK-NEXT: $sgpr0 = COPY [[S_OR_B32_]]
+    ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+    %0:sgpr(s32) = COPY $sgpr0
+    %1:sgpr(s32) = COPY $sgpr1
+    %2:sgpr(s32) = disjoint G_OR %0, %1
+    $sgpr0 = COPY %2
+    SI_RETURN_TO_EPILOG implicit $sgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index 8300e25..1319701 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -511,24 +511,11 @@ define amdgpu_ps <16 x i32> @s_sext_inreg_v16i32_3(<16 x i32> inreg %value) {
 }
 
 define i16 @v_sext_inreg_i16_4(i16 %value) {
-; GFX6-LABEL: v_sext_inreg_i16_4:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 12
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_sext_inreg_i16_4:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 4, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_sext_inreg_i16_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 12
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_sext_inreg_i16_4:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 12
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i16_4:
 ; GFX10PLUS:       ; %bb.0:
@@ -541,24 +528,11 @@ define i16 @v_sext_inreg_i16_4(i16 %value) {
 }
 
 define i16 @v_sext_inreg_i16_15(i16 %value) {
-; GFX6-LABEL: v_sext_inreg_i16_15:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_sext_inreg_i16_15:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 15, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_sext_inreg_i16_15:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_sext_inreg_i16_15:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i16_15:
 ; GFX10PLUS:       ; %bb.0:
@@ -643,12 +617,10 @@ define <2 x i16> @v_sext_inreg_v2i16_8(<2 x i16> %value) {
 ; GFX8-LABEL: v_sext_inreg_v2i16_8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 8, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, sext(v0), v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX8-NEXT:    v_and_b32_sdwa v2, sext(v0), v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_v2i16_8:
@@ -680,12 +652,11 @@ define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) {
 ; GFX8-LABEL: v_sext_inreg_v2i16_15:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, 15
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 15, v0
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v1
-; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_bfe_i32 v1, v0, 0, 1
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_v2i16_15:
@@ -798,17 +769,16 @@ define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) {
 ; GFX8-LABEL: v_sext_inreg_v4i16_3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, 3
-; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 3, v0
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 3, v1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 3, v2
-; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 3, v4
-; GFX8-NEXT:    v_ashrrev_i16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 13
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 13
+; GFX8-NEXT:    v_bfe_i32 v3, v1, 0, 13
+; GFX8-NEXT:    v_bfe_i32 v1, v1, 16, 13
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_v4i16_3:
@@ -982,27 +952,26 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
 ; GFX8-LABEL: v_sext_inreg_v8i16_11:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, 11
-; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 11, v0
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 11, v1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 11, v4
-; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 11, v2
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 11, v6
-; GFX8-NEXT:    v_ashrrev_i16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 11, v3
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 11, v7
-; GFX8-NEXT:    v_ashrrev_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 11, v8
-; GFX8-NEXT:    v_ashrrev_i16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_bfe_i32 v4, v0, 0, 5
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 5
+; GFX8-NEXT:    v_bfe_i32 v5, v1, 0, 5
+; GFX8-NEXT:    v_bfe_i32 v1, v1, 16, 5
+; GFX8-NEXT:    v_bfe_i32 v6, v2, 0, 5
+; GFX8-NEXT:    v_bfe_i32 v2, v2, 16, 5
+; GFX8-NEXT:    v_bfe_i32 v7, v3, 0, 5
+; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 5
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_v8i16_11:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index b19a5a4..8788dc2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -112618,575 +112618,570 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB51_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v16
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v14
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v17 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v32, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v23
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v33, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v16 :: v_dual_lshlrev_b32 v35, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v32, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v17, v38, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v39, v33, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v48, v35, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v16, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v16
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v102, v14, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v16, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v32.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, 0x400000, v33
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v36, v37, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v39, v49, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v34, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v16, v16, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v70, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v39, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v38, v32, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v35.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v35, 16, v19
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v36, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_lshlrev_b32 v83, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v18, v37, v32 :: v_dual_add_f32 v37, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v34, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v35, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v19
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v48, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v48
-; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v35, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v33.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v37, v39, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v48, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_cndmask_b32 v34, v34, v38
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v32, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v34, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v34, 0x400000, v35
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v34.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v36, v39, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v21
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v49, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v37, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v49
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v36
-; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v49, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v37, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v37
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v34, 0x400000, v37
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v39 :: v_dual_lshlrev_b32 v39, 16, v20
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v38, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_lshlrev_b32 v87, 16, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v38, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v38, 0x40c00000, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v21
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v35.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v36, v38, v36 :: v_dual_add_f32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v87, 0x40c00000, v87 :: v_dual_lshlrev_b32 v96, 16, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v34, v35, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v37, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v39, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v39
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v48, 0x40c00000, v48 :: v_dual_add_f32 v49, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v34, v34, v35 :: v_dual_lshlrev_b32 v21, 16, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v39, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v38, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v48, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v48
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v21, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v21
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v48, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v21, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v22
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v21, v37, v49 :: v_dual_and_b32 v22, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v97, 0x400000, v87
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v37, v38, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v38
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v48
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v39, v48, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v49, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v51, 0x40c00000, v22 :: v_dual_lshlrev_b32 v48, 16, v23
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v22, v39, v50 :: v_dual_and_b32 v23, 0xffff0000, v23
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v51, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v51
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v38, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v21, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v39, v49, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v49
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v50, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v22, v37, v38 :: v_dual_lshlrev_b32 v71, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v49, 0x40c00000, v51
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v39, v50, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v50
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_lshlrev_b32 v51, 16, v24
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v48, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v48
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v51
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v39, v48, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v49, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v22.h
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v39, v49, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v49
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v50, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v49, 0x40c00000, v52 :: v_dual_lshlrev_b32 v52, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v34, 16, v21
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v24, v38, v39 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v48, v50, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v50
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v51, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v49, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v52
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v48, v51, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v51
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v38
-; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v51, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v23
-; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v38, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v21.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v39, v39, v49, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v48, 0x40c00000, v48 :: v_dual_cndmask_b32 v23, v37, v50
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v39.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v49, v48, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v48
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v24
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v52, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v52
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v53
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v39, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v50, v49, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v49
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v52, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v52
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v48, v50, v52, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT:    v_add3_u32 v49, v49, v48, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v52, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v39, 0x40c00000, v39 :: v_dual_cndmask_b32 v38, v38, v50
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v53, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v39, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v39
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v38.h
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v24, v49, v51 :: v_dual_lshlrev_b32 v49, 16, v25
-; GFX11-TRUE16-NEXT:    v_add3_u32 v48, v48, v53, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v26
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v51, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v53, 0x40c00000, v53 :: v_dual_add_f32 v54, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    v_add3_u32 v49, v50, v51, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v51
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v53, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v53
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v52, v53, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v54, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v50, v39, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v49, 0x40c00000, v49 :: v_dual_cndmask_b32 v48, v48, v51
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v25
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v48.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v50, v52, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v54, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v26
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v49, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v54
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_cndmask_b32 v26, v50, v51
+; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v52, v54, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v54
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v55, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v50, v54, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v48
-; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v51, v49, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v49
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v50, v50, v52, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v49, v51, v48 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v64
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v50, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v52, v55, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v55
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v53, 16, 1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v50.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v55, 0x400000, v53
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v26, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v54, 0x400000, v26
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-TRUE16-NEXT:    v_add3_u32 v52, v52, v53, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v51, v26, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v51, v54, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v27.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v28, v50, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v52, v53, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v53
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v54, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v26.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v53, 0x40c00000, v53 :: v_dual_add_f32 v50, 0x40c00000, v50
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v27
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v52, v55, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v65, 0x40c00000, v28
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v50, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v64, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v54, 0x400000, v64
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v55, 0x400000, v50
-; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v51, v50, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v52, v52, v64, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v53
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v52, v52, v54, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v54, v53, 16, 1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v52.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v29
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v53, v55, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v33, 16, v26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v37.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v52, v54, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v29
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v54, v54, v53, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v51, v55, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v65, 16, 1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v55, 0x400000, v65
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v51, v65, 0x7fff
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_cndmask_b32 v51, v51, v55
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v53, v66, 16, 1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v51.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v54, v64, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v53, v53, v66, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v66
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v30
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v53, v53, v64 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v52, 0x40c00000, v52 :: v_dual_add_f32 v67, 0x40c00000, v30
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v52, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v65, 0x400000, v52
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v55, v52, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v53.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v51, v52, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v53, v55, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v55
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v54, 0x40c00000, v65 :: v_dual_lshlrev_b32 v65, 16, v30
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v50.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v54, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v65, 0x40c00000, v65
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v53, v64, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v52, v53, v64, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v53, 0x400000, v64
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v66
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v31
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v30, v55, v65 :: v_dual_and_b32 v31, 0xffff0000, v31
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v67, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v65, 0x400000, v67
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v66, 0x40c00000, v53 :: v_dual_add_f32 v31, 0x40c00000, v31
-; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v55, v67, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v67, 0x400000, v31
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v55, v55, v65 :: v_dual_add_f32 v54, 0x40c00000, v54
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v64, v54, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v31
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v30, v52, v53 :: v_dual_and_b32 v31, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v52, v55, v54, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v53, 0x400000, v54
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v64, v54, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v54, v64, v53, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v64, v31, 16, 1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v55.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v64, v31, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v55
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v65, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v54, 0x400000, v65
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v66, 0x40c00000, v66 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v53, v55, v65, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v64, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v66, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v68, 0x400000, v66
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v69, 0x40c00000, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v64, v67, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v54, v55, v64, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v55, 0x400000, v64
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v66
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v65, v66, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v31, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-TRUE16-NEXT:    v_add3_u32 v65, v65, v66, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v64, v55, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v67, 0x400000, v69
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v67, 0x40c00000, v67 :: v_dual_add_f32 v66, 0x40c00000, v68
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v55, v64, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v65, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v67, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v55, v64, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v65, v67, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v67
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v69
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v66, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v31.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v65, v68, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v69, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v55, v64, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v66
+; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v65, v66, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v66, v67, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v69, 0x40c00000, v69
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v55, v55, v64, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v68, 0x40c00000, v68
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v55.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v68, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v65, v68, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v65, 0x400000, v68
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v68, 0x40c00000, v70
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, v64, v65 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v66, v67, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v65, 0x400000, v67
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v66, v69, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v67, v68, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v64, v64, v65, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v65, v66, v69, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v66, 0x400000, v69
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
-; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v64, v55, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v68, 0x400000, v55
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v65, v65, v69, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v65, v65, v67, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v65.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v70, 0x40c00000, v1 :: v_dual_cndmask_b32 v1, v64, v68
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v66, 0x40c00000, v66 :: v_dual_add_f32 v65, 0x40c00000, v65
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v64, v70, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v68, 0x400000, v70
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v69, v70, 16, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v67, v66, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v65, v65, v66, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v66, v67, v68, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v67, 0x400000, v68
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v68, 0x400000, v70
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v66, v66, v67, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v67, v69, v70, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v69, v3, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v69, 0x400000, v66
-; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v64, v70, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v71, 0x40c00000, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v67, v67, v66, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v70, 0x400000, v65
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v64, v64, v68, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v66, v71, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v68, v65, 16, 1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v64.h
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, v67, v69 :: v_dual_lshlrev_b32 v67, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v66, v66, v71, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v71, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v68, v69, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v69, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v68, v69, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v68, v70, v71, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v69, 0x400000, v71
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v4, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_add3_u32 v68, v68, v65, 0x7fff
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v66, v66, v69 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v67
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v66.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v68, v68, v70, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v3, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v69, v67, 16, 1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v71, 0x400000, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v70, v3, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v69, v67, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v80, 0x400000, v67
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_cndmask_b32 v3, v70, v71
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v5
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v66, 0x40c00000, v66
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v67, v69, v80, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v69, v4, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v66, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v80, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v71, v80, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v68, v68, v69, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v70, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v70, 0x400000, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v81, 0x400000, v66
-; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v69, v4, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v70, v66, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v69, v80 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v71, 0x40c00000, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v66, v70, v81, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v5, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v69, v71, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v81, 0x400000, v5
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v69, v70, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v71, v80, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v70, 0x400000, v80
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v71, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v80, v81, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v69, v69, v70, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v71, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v71, 0x400000, v5
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v70, v5, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v69, v71, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v82, 0x400000, v71
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v70, v81, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v71, v6, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v80, 0x40c00000, v80
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v66
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v69, v69, v82, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add3_u32 v71, v71, v6, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v82, 0x400000, v6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v80, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v83, 0x400000, v80
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v6, v71, v82 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v8
-; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v70, v80, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v81, 0x40c00000, v81
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v80, v7, 16, 1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v70, v70, v83, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v71, v81, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v80, v80, v7, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v83, 0x400000, v7
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_add3_u32 v71, v71, v81, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v84, 0x400000, v81
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v5, 16, v69
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v80, v83, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v70, v71, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v80, v81, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v71, 0x400000, v81
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v80, v6, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v81, v8, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v82, 0x40c00000, v82
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v67
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v68
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v71, v71, v84, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add3_u32 v81, v81, v8, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v84, 0x400000, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v80, v82, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v85, 0x400000, v82
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v81, v84 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v10
-; GFX11-TRUE16-NEXT:    v_add3_u32 v80, v80, v82, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v81, v82, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v5.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v70, v70, v71, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v71, v80, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v80, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v71, v80, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v71, v81, v82, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v80, 0x400000, v82
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v81, v7, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v83, 0x40c00000, v83
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v82, v9, 16, 1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v80, v80, v85, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v81, v83, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v82, v82, v9, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v85, 0x400000, v9
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT:    v_add3_u32 v81, v81, v83, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, 0x400000, v83
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v70
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v82, v85, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v82, v83, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v71, v71, v80, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v80, v81, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v81, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v80, v82, v83, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v81, 0x400000, v83
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v82, v8, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v83, v83
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v84, 0x40c00000, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v83, v10, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v81, v81, v86, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v82, v84, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, 0x400000, v10
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    v_add3_u32 v83, v83, v10, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v84
-; GFX11-TRUE16-NEXT:    v_add3_u32 v82, v82, v84, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v65, 16, v67
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v83, v86, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v83, v84, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v80, v80, v81, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v81, v82, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v82, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v81, v82, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v81, v83, v84, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v82, 0x400000, v84
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v83, v9, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v85, 0x40c00000, v85
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v84, v11, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v96, 0x400000, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v82, v82, v87, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v83, v85, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v85
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v84, v85, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v81, v81, v82, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v82, v83, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v83, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v82, v83, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v82, v84, v85, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v83, 0x400000, v85
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v84, v10, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-TRUE16-NEXT:    v_add3_u32 v84, v84, v11, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT:    v_add3_u32 v83, v83, v85, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v64, 16, v68
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v55, 16, v69
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v30
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v83, v83, v87, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v85, v86, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v81
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v82, v82, v83, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v83, v84, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v84, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v7, 16, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v83, v84, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v83, v85, v86, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v84, 0x400000, v86
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v85, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v86, 0x40c00000, v96
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v96, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v10.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v85, v85, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v83, v83, v84, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v84, v87, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v85, 0x40c00000, v85
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v29
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v28
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v84, v96, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v99, v86, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v82
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v84, v86, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v86
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v96, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v84, v84, v87, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v85, v96, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT:    v_add3_u32 v87, v99, v86, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v96, 0x400000, v86
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v6, 16, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v11.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v84, v84, v97, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v3, 16, v66
-; GFX11-TRUE16-NEXT:    v_add3_u32 v84, v84, v86, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_add3_u32 v86, v96, v12, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v96, v85, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v27
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v84, v84, v87, vcc_lo
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v12
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v52, 16, v55
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v51, 16, v64
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v50, 16, v65
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v86, v87, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v97, v13, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v86, v96, v85, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v96, 0x40c00000, v98
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v100, 0x400000, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v85
-; GFX11-TRUE16-NEXT:    v_add3_u32 v97, v97, v13, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v83
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v86, v87, v96, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 16, v15
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v25
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v23
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_add_f32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v99, v96, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v101, 0x400000, v96
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v96, v96
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v103, v98, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v112, v15, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v99, v99, v96, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v113, 0x400000, v98
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v96, v99, v101, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add3_u32 v99, v102, v14, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v101, 0x400000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v5, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v70
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v96, 0x40c00000, v96 :: v_dual_add_f32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v5, 16, v6
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v101, v96, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v102, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v113, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v114, 0x400000, v96
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v17, 16, v69
+; GFX11-TRUE16-NEXT:    v_add3_u32 v101, v101, v96, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v102, v102, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v66.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v27, 16, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v51.h
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v29
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v31, v31, 16, v66
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v98, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v97, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v27, 16, v51
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v32, 16, v50
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v38.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v85, v98, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v87, 0x40c00000, v98
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v98, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v32, 16, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v34.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v99, v87, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v98, v98, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v103, 0x400000, v87
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v99, v99, v87, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v100, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v112, 0x400000, v14
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT:    v_add3_u32 v102, v103, v98, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v103, v112, v15, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v112, 0x400000, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v96
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v99, v101, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v100, v100, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v100, v112, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v83.h
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v14.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v103, v112, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v98, v102, v113, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v14.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v102, v113, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v15.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v96, v101, v114, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v96
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v87, v99, v103, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v99, 0x400000, v13
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v30, v53, 16, v54
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v22
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v98
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v97, v100, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v15.h
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v48, 16, v49
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v39, 16, v50
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v38, 16, v51
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v85, v86, v87, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v12.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v13.h
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v13, 16, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v12, 16, v96
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v85
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v22, v37, 16, v52
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v87, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v86, 16, v84
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v83, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v82
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v81
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v8, 16, v80
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v7, 16, v71
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v31, v31, 16, v70
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v26, 16, v66
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v21, 16, v53
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v35, 16, v36
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v34, 16, v37
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v33, 16, v38
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v32, 16, v39
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v16, 16, v48
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v3, 16, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v87
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v98, v99, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v4, 16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v13.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v85, v97, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v86
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v12.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v84
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v3, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v8.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v80
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v4, 16, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v7.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v71
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v3, 16, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v4, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v22, 16, v68
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v64.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v17, 16, v65
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v54.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v3, 16, v67
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v22, 16, v64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v52.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v30
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v30, v17, 16, v53
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v49.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v22, 16, v52
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v39.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v25
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v17, 16, v48
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v36.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v32, 16, v34
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v22, 16, v39
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v22, v33, 16, v37
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v33.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v32.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v17, 16, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v33, 16, v36
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v37, 16, v38
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v39, 16, v16
 ; GFX11-TRUE16-NEXT:  .LBB51_2: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index b040e77..c0577b1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -9418,78 +9418,80 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v8, v10, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v13, v1, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v6, 16, v2
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v3, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v3
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v13, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v6, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v3, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_cndmask_b32 v1, v7, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v10, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v6, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v12, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v13, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v14, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v11, v12, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v12, v15, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v3, 16, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v9, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v0, 16, v3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v1, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v7, 16, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v6, 16, v5
 ; GFX11-TRUE16-NEXT:  .LBB47_2: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
index 1db0ccc..27d32fc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
@@ -121,11 +121,11 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) {
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_2
-; GCN-NEXT:  ; %bb.1:
+; GCN-NEXT:  ; %bb.1: ; %cmp.true
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:  .LBB1_2:
+; GCN-NEXT:  .LBB1_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index edeb780..cc32c19 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -14943,149 +14943,156 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB47_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v1, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v6
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v9, 16, 1
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v9, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v0, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v11, v14, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v1, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v16, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v15, v9, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, v12, v13 :: v_dual_and_b32 v13, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v13, v15, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v10, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v14, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v2, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v10, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v9
-; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v8, 16, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v12, v14, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v13, 16, 1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v3, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v11, v15, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v12, v13, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v3, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_add_f32 v12, 0x40c00000, v15
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v14, v16, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v13, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v13
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v12, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v5, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v12
-; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v13, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v12, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v14, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v2, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v14, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v15, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v4, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v15, v4, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v15, v18, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v11, v18, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v21
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_cndmask_b32 v13, v16, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v18, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v16, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v21 :: v_dual_cndmask_b32 v11, v16, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v20, v5, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v5
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v16, v19, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v6, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v20, v15, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v18, 16, 1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v7, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v6, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v7
-; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v18, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v5.h
-; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v7, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v16, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v22, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v23, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v11, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v20, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v24, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v21, v22, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v22, v25, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v7.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v14, v17, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v4.h
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v6, 16, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v5, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v4, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v23, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v19, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v6.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v0, 16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v15, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v1, 16, v6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v2, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v1, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v11, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v12, 16, v8
 ; GFX11-TRUE16-NEXT:  .LBB47_2: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 9d2601e..5f21bdc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -31415,288 +31415,300 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB47_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v26, 16, v7
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v1 :: v_dual_lshlrev_b32 v34, 16, v15
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v1, v16, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v0 :: v_dual_lshlrev_b32 v19, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v0, v16, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v16
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v17, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v19, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-TRUE16-NEXT:    v_add3_u32 v1, v1, v16, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v19
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v16, v1, v22 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v23, v17, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v24, v19, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v0, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v15, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v15
-; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v0, v0, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v17
-; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v15, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v20, v21, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v20, 0x40c00000, v2 :: v_dual_cndmask_b32 v19, v23, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v0, v0, v22
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v1, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_add3_u32 v1, v1, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v1, v23, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v22, v16, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v19.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v20, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v18, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v2, v21, v16
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v18, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v19, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v18
-; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v20, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v18, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v19
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_lshlrev_b32 v22, 16, v4
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v19, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v17.h
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v18, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v20, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v4
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v22, 16, 1
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v3, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v20, v22, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v22
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v16.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v21, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v3
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v3, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v21, v23, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v22, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v22
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
-; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v22, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v20, v24, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v23
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v4, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v23, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v23
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v22, v23, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v4, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v22, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v3, 16, v19
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v4, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v20, v24, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v23, 16, 1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
-; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v23, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v21, v21, v25 :: v_dual_add_f32 v24, 0x40c00000, v24
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v21, v22 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_add_f32 v5, 0x40c00000, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v5, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v24, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v23, v24, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v5, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v23, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v5
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v5, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v22, v25 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v25, 16, v7
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v24, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v24
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v6, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v20, v26, vcc_lo
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v22, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v5.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v25, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v25
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v24, v25, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v24, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v24, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v6, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, 0x400000, v25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v6, v23, v26 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v25, 16, 1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v7, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
-; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v25, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v7
-; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v24, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v6, v23, v24 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v26
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v25, v26, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v25, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v5, 16, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v24, v27, vcc_lo
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_lshlrev_b32 v27, 16, v9
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v26, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, 0x400000, v26
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v8, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v23, v28, vcc_lo
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v7, v24, v25 :: v_dual_and_b32 v8, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v27, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v26, v27, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v24, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v26, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v24, v26, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v25, v8, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, 0x400000, v27
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v25, v28 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v27, 16, 1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v9, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v25, v27, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v24, v29, vcc_lo
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, 0x400000, v9
-; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v26, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v25, v26 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v27, v28, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v28
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v27, v28, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v27, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v27, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v22
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v26, v29, vcc_lo
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v29, 16, v11
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v28, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, 0x400000, v28
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v27, v10, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v25, v30, vcc_lo
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v26, v27 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v29, 0x40c00000, v29 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v28, v29, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v29
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v28, v29, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v28, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v26, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v27, v28, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, 0x400000, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v26, v28, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v27, v27, v10, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, 0x400000, v29
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v27, v30 :: v_dual_add_f32 v11, 0x40c00000, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v27, v29, 16, 1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v28, v11, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
-; GFX11-TRUE16-NEXT:    v_add3_u32 v27, v27, v29, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v26, v31, vcc_lo
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, 0x400000, v11
-; GFX11-TRUE16-NEXT:    v_add3_u32 v28, v28, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v27, v28 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v10.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v29, v30, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, 0x400000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v26
+; GFX11-TRUE16-NEXT:    v_add3_u32 v27, v29, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v29, v11, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v6, 16, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v27, v28, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v28, v29, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, 0x400000, v11
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v25
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v7, 16, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v28, v31, vcc_lo
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_lshlrev_b32 v31, 16, v13
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v28, v30, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v30
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v29, v12, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v27, v32, vcc_lo
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v11, v28, v29 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v31, 0x40c00000, v31 :: v_dual_add_f32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v11.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v31, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v28, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v27
+; GFX11-TRUE16-NEXT:    v_add3_u32 v29, v30, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v28, v28, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v31, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v5, 16, v11
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v30, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v29, v32, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    v_add3_u32 v28, v28, v30, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v29, v29, v12, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v12, v29, v32 :: v_dual_add_f32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v29, v31, 16, 1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v13, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v28, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v31, v31, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v28, v33, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT:    v_add3_u32 v30, v30, v13, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-TRUE16-NEXT:    v_add3_u32 v29, v29, v31, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v28, v34, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v13, v31, v35 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v30
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v13.h
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v32, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v30, v35, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v12.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v31, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v36, v32, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v29
+; GFX11-TRUE16-NEXT:    v_add3_u32 v31, v31, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v5, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v16, 16, v21
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v4, 16, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v7.h
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v32, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
-; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v34, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v14, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v11, 16, v27
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v4, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v21, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v37, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v38, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v37, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT:    v_add3_u32 v30, v30, v14, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v12, 16, v28
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v26
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v8, 16, v24
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v37, v48, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v20
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v17, 16, v18
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v16, 16, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v15.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v36, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v34.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v31, v39, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v22
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v30, v49, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v13.h
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v13, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v32
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v29, v33, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v14, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v30, 16, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, 0x400000, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v36, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v1, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v14.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v35, v31, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v31
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v28, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v1, 16, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v28
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v3, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v8.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v1, 16, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v3, 16, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v16, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v1, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v18, 16, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v3, 16, v19
 ; GFX11-TRUE16-NEXT:  .LBB47_2: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 42b2f9a..8fa9b3c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -6719,46 +6719,47 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v2, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v2
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v2, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
 ; GFX11-TRUE16-NEXT:  .LBB47_2: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 852114f..4ae7c88 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -6675,66 +6675,64 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v2, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v4, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v5, 16, v2
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v9, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v6, v8 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v12, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v9, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v8, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v0, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v10, v4, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v6, v11, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v6, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v8, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v2, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v10, v13, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v7, v9, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v0, 16, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v5, 16, v3
 ; GFX11-TRUE16-NEXT:  .LBB26_2: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/asm-printer-check-vcc.mir b/llvm/test/CodeGen/AMDGPU/asm-printer-check-vcc.mir
index c589c10..813b223 100644
--- a/llvm/test/CodeGen/AMDGPU/asm-printer-check-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/asm-printer-check-vcc.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-after=livedebugvalues -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=amdgpu-asm-printer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: foo:
 # GCN: s_getpc_b64 vcc
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index 8ea5c3e..f153b30 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -4,7 +4,6 @@
 # Test coalescing situations which can use av_* registers to handle
 # copies between VGPRs and AGPRs.
 
-
 # Should coalesce %0 and %1 into subregisters of the av_64 common
 # class
 ---
@@ -517,3 +516,2259 @@ body:             |
     SI_RETURN
 
 ...
+
+# Should coalesce %0 and %1 into subregisters of the av_64 common
+# class
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    %0.sub1:vreg_64 = COPY $vgpr1
+    undef %2.sub0:areg_64 = COPY %0.sub0
+    %2.sub1:areg_64 = COPY %0.sub1
+    INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, killed %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    %0.sub1:vreg_64 = COPY $vgpr1
+    undef %2.sub0:areg_64_align2 = COPY %0.sub0
+    %2.sub1:areg_64_align2 = COPY %0.sub1
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 =COPY $vgpr0
+    %0.sub1:vreg_96 = COPY $vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %3.sub0:areg_96 = COPY %0.sub0
+    %3.sub1:areg_96 = COPY %0.sub1
+    %3.sub2:areg_96 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 =COPY $vgpr0
+    %0.sub1:vreg_96 = COPY $vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %3.sub0:areg_96_align2 = COPY %0.sub0
+    %3.sub1:areg_96_align2 = COPY %0.sub1
+    %3.sub2:areg_96_align2 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_sub
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
+    %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
+    %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
+    INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %2
+    SI_RETURN
+
+...
+
+
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_128 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1
+    %0.sub1:vreg_128 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0
+    %2.sub2_sub3:areg_128_align2 = COPY %0.sub1
+    INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr8, $sgpr9
+
+    ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_sub
+    ; CHECK: liveins: $sgpr8, $sgpr9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sreg_64 = COPY $sgpr8
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:sreg_64 = COPY $sgpr8
+    %0.sub1:sreg_64 = COPY $sgpr9
+    undef %2.sub0:areg_64_align2 = COPY %0.sub0
+    %2.sub1:areg_64_align2 = COPY %0.sub1
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 =COPY $vgpr0
+    %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96 = COPY %0.sub0
+    %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 =COPY $vgpr0
+    %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96_align2 = COPY %0.sub0
+    %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
+    %2.sub2:areg_96 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+    %2.sub2:areg_96_align2 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_sub
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %2.sub0:areg_64 = COPY %0.sub0
+    %2.sub1:areg_64 = COPY %0.sub0
+    INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, killed %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_sub
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %1.sub0:areg_96 = COPY %0.sub0
+    %1.sub1:areg_96 = COPY %0.sub0
+    %1.sub2:areg_96 = COPY %0.sub0
+    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_align2_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_sub
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %1.sub0:areg_96_align2 = COPY %0.sub0
+    %1.sub1:areg_96_align2 = COPY %0.sub0
+    INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_sub
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %1.sub0:areg_128 = COPY %0.sub0
+    %1.sub1:areg_128 = COPY %0.sub0
+    %1.sub2:areg_128 = COPY %0.sub0
+    %1.sub3:areg_128 = COPY %0.sub0
+    INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_align2_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_sub
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %1.sub0:areg_128_align2 = COPY %0.sub0
+    %1.sub1:areg_128_align2 = COPY %0.sub0
+    %1.sub2:areg_128_align2 = COPY %0.sub0
+    %1.sub3:areg_128_align2 = COPY %0.sub0
+    INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_both_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    %0.sub1:vreg_64 = COPY $vgpr1
+    undef %2.sub0:areg_64 = COPY %0.sub0
+    %2.sub1:areg_64 = COPY %0.sub1
+    INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, killed %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_both_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64_align2 = COPY $vgpr0
+    %0.sub1:vreg_64_align2 = COPY $vgpr1
+    undef %2.sub0:areg_64_align2 = COPY %0.sub0
+    %2.sub1:areg_64_align2 = COPY %0.sub1
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_both_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+    %0.sub1:vreg_96 = COPY $vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %3.sub0:areg_96 = COPY %0.sub0
+    %3.sub1:areg_96 = COPY %0.sub1
+    %3.sub2:areg_96 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96_align2 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96_align2 = COPY $vgpr0
+    %0.sub1:vreg_96_align2 = COPY $vgpr1
+    %0.sub2:vreg_96_align2 = COPY $vgpr2
+    undef %3.sub0:areg_96_align2 = COPY %0.sub0
+    %3.sub1:areg_96_align2 = COPY %0.sub1
+    %3.sub2:areg_96_align2 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_both_sub
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+    %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
+    %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
+    INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_both_sub
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
+    %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
+    %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
+    INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr8, $sgpr9
+
+    ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_both_sub
+    ; CHECK: liveins: $sgpr8, $sgpr9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sreg_64 = COPY $sgpr8
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:sreg_64 = COPY $sgpr8
+    %0.sub1:sreg_64 = COPY $sgpr9
+    undef %2.sub0:areg_64_align2 = COPY %0.sub0
+    %2.sub1:areg_64_align2 = COPY %0.sub1
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_both_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+    %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96 = COPY %0.sub0
+    %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+    SI_RETURN
+
+...
+
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_shuffle_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+    %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96 = COPY %0.sub2
+    %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1
+    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+    SI_RETURN
+
+...
+
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_both_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96_align2 = COPY $vgpr0
+    %0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96_align2 = COPY %0.sub0
+    %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_both_sub
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
+    %2.sub2:areg_96 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96_align2 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+    %2.sub2:areg_96_align2 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_both_sub
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_both_sub
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+    %2.sub2:areg_96_align2 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_whole_reg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_whole_reg
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %2:areg_64 = COPY %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, killed %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_whole_reg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_whole_reg
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+    %2:areg_64_align2 = COPY %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_whole_reg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_whole_reg
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+    %3:areg_96 = COPY %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_whole_reg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_whole_reg
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
+    %3:areg_96_align2 = COPY %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_whole_reg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_whole_reg
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %2:areg_128 = COPY %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_whole_reg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_whole_reg
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %2:areg_128_align2 = COPY %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_whole_reg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr8, $sgpr9
+
+    ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_whole_reg
+    ; CHECK: liveins: $sgpr8, $sgpr9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:sreg_64 = COPY $sgpr8_sgpr9
+    %2:areg_64_align2 = COPY %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_whole_reg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_whole_reg
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+    %2:areg_96_align2 = COPY %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    undef %2.sub0:areg_64 = COPY %0
+    %2.sub1:areg_64 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    undef %2.sub0:areg_64_align2 = COPY %0
+    %2.sub1:areg_64_align2 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+    undef %3.sub0:areg_96 = COPY %0
+    %3.sub1:areg_96 = COPY %1
+    %3.sub2:areg_96 = COPY %2
+    S_NOP 0, implicit %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+    undef %3.sub0:areg_96_align2 = COPY %0
+    %3.sub1:areg_96_align2 = COPY %1
+    %3.sub2:areg_96_align2 = COPY %2
+    S_NOP 0, implicit %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vreg_64 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128 = COPY %0
+    %2.sub2_sub3:areg_128 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vreg_64 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128_align2 = COPY %0
+    %2.sub2_sub3:areg_128_align2 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr8, $sgpr9
+
+    ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_snop
+    ; CHECK: liveins: $sgpr8, $sgpr9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:sgpr_32 = COPY $sgpr8
+    %1:sgpr_32 = COPY $sgpr9
+    undef %2.sub0:areg_64_align2 = COPY %0
+    %2.sub1:areg_64_align2 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vreg_64 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96 = COPY %0
+    %2.sub1_sub2:areg_96 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vreg_64 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96_align2 = COPY %0
+    %2.sub1_sub2:areg_96_align2 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96 = COPY %0
+    %2.sub2:areg_96 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96_align2 = COPY %0
+    %2.sub2:areg_96_align2 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    undef %2.sub0:areg_64 = COPY %0
+    %2.sub1:areg_64 = COPY %0
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    undef %2.sub0:areg_64_align2 = COPY %0
+    %2.sub1:areg_64_align2 = COPY %1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    undef %1.sub0:areg_96 = COPY %0
+    %1.sub1:areg_96 = COPY %0
+    S_NOP 0, implicit %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_align2_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    undef %1.sub0:areg_96_align2 = COPY %0
+    %1.sub1:areg_96_align2 = COPY %0
+    S_NOP 0, implicit %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    undef %1.sub0:areg_128 = COPY %0
+    %1.sub1:areg_128 = COPY %0
+    %1.sub2:areg_128 = COPY %0
+    %1.sub3:areg_128 = COPY %0
+    S_NOP 0, implicit %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_align2_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vgpr_32 = COPY $vgpr0
+    undef %1.sub0:areg_128_align2 = COPY %0
+    %1.sub1:areg_128_align2 = COPY %0
+    %1.sub2:areg_128_align2 = COPY %0
+    %1.sub3:areg_128_align2 = COPY %0
+    S_NOP 0, implicit %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    %0.sub1:vreg_64 = COPY $vgpr1
+    undef %2.sub0:areg_64 = COPY %0.sub0
+    %2.sub1:areg_64 = COPY %0.sub1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    %0.sub1:vreg_64 = COPY $vgpr1
+    undef %2.sub0:areg_64_align2 = COPY %0.sub0
+    %2.sub1:areg_64_align2 = COPY %0.sub1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 =COPY $vgpr0
+    %0.sub1:vreg_96 = COPY $vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %3.sub0:areg_96 = COPY %0.sub0
+    %3.sub1:areg_96 = COPY %0.sub1
+    %3.sub2:areg_96 = COPY %0.sub2
+    S_NOP 0, implicit %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 =COPY $vgpr0
+    %0.sub1:vreg_96 = COPY $vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %3.sub0:areg_96_align2 = COPY %0.sub0
+    %3.sub1:areg_96_align2 = COPY %0.sub1
+    %3.sub2:areg_96_align2 = COPY %0.sub2
+    S_NOP 0, implicit %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_sub_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
+    %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
+    %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
+    %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
+    %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr8, $sgpr9
+
+    ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_sub_snop
+    ; CHECK: liveins: $sgpr8, $sgpr9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sreg_64 = COPY $sgpr8
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:sreg_64 = COPY $sgpr8
+    %0.sub1:sreg_64 = COPY $sgpr9
+    undef %2.sub0:areg_64_align2 = COPY %0.sub0
+    %2.sub1:areg_64_align2 = COPY %0.sub1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 =COPY $vgpr0
+    %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96 = COPY %0.sub0
+    %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 =COPY $vgpr0
+    %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96_align2 = COPY %0.sub0
+    %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
+    %2.sub2:areg_96 = COPY %0.sub2
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+    %2.sub2:areg_96_align2 = COPY %0.sub2
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_sub_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %2.sub0:areg_64 = COPY %0.sub0
+    %2.sub1:areg_64 = COPY %0.sub0
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_sub_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %1.sub0:areg_96 = COPY %0.sub0
+    %1.sub1:areg_96 = COPY %0.sub0
+    %1.sub2:areg_96 = COPY %0.sub0
+    S_NOP 0, implicit %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_align2_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_sub_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %1.sub0:areg_96_align2 = COPY %0.sub0
+    %1.sub1:areg_96_align2 = COPY %0.sub0
+    S_NOP 0, implicit %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_sub_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %1.sub0:areg_128 = COPY %0.sub0
+    %1.sub1:areg_128 = COPY %0.sub0
+    %1.sub2:areg_128 = COPY %0.sub0
+    %1.sub3:areg_128 = COPY %0.sub0
+    S_NOP 0, implicit %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_align2_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_sub_snop
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    undef %1.sub0:areg_128_align2 = COPY %0.sub0
+    %1.sub1:areg_128_align2 = COPY %0.sub0
+    %1.sub2:areg_128_align2 = COPY %0.sub0
+    %1.sub3:areg_128_align2 = COPY %0.sub0
+    S_NOP 0, implicit %1
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_both_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64 = COPY $vgpr0
+    %0.sub1:vreg_64 = COPY $vgpr1
+    undef %2.sub0:areg_64 = COPY %0.sub0
+    %2.sub1:areg_64 = COPY %0.sub1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_both_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_64_align2 = COPY $vgpr0
+    %0.sub1:vreg_64_align2 = COPY $vgpr1
+    undef %2.sub0:areg_64_align2 = COPY %0.sub0
+    %2.sub1:areg_64_align2 = COPY %0.sub1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_both_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+    %0.sub1:vreg_96 = COPY $vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %3.sub0:areg_96 = COPY %0.sub0
+    %3.sub1:areg_96 = COPY %0.sub1
+    %3.sub2:areg_96 = COPY %0.sub2
+    S_NOP 0, implicit %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96_align2 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96_align2 = COPY $vgpr0
+    %0.sub1:vreg_96_align2 = COPY $vgpr1
+    %0.sub2:vreg_96_align2 = COPY $vgpr2
+    undef %3.sub0:areg_96_align2 = COPY %0.sub0
+    %3.sub1:areg_96_align2 = COPY %0.sub1
+    %3.sub2:areg_96_align2 = COPY %0.sub2
+    S_NOP 0, implicit %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_both_sub_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+    %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
+    %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_both_sub_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
+    %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
+    undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
+    %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr8, $sgpr9
+
+    ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_both_sub_snop
+    ; CHECK: liveins: $sgpr8, $sgpr9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sreg_64 = COPY $sgpr8
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:sreg_64 = COPY $sgpr8
+    %0.sub1:sreg_64 = COPY $sgpr9
+    undef %2.sub0:areg_64_align2 = COPY %0.sub0
+    %2.sub1:areg_64_align2 = COPY %0.sub1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_both_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+    %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96 = COPY %0.sub0
+    %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_shuffle_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:vreg_96  = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96  = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: S_NOP 0, implicit %1
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+    %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96 = COPY %0.sub2
+    %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_both_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96_align2 = COPY $vgpr0
+    %0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
+    undef %2.sub0:areg_96_align2 = COPY %0.sub0
+    %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_both_sub_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
+    %2.sub2:areg_96 = COPY %0.sub2
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96_align2 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+    %2.sub2:areg_96_align2 = COPY %0.sub2
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_both_sub_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_both_sub_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+    %0.sub2:vreg_96 = COPY $vgpr2
+    undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+    %2.sub2:areg_96_align2 = COPY %0.sub2
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_whole_reg_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_whole_reg_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %2:areg_64 = COPY %0
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_whole_reg_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_whole_reg_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+    %2:areg_64_align2 = COPY %0
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_whole_reg_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_whole_reg_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+    %3:areg_96 = COPY %0
+    S_NOP 0, implicit %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_whole_reg_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_whole_reg_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
+    %3:areg_96_align2 = COPY %0
+    S_NOP 0, implicit %3
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_whole_reg_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_whole_reg_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %2:areg_128 = COPY %0
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_whole_reg_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_whole_reg_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %2:areg_128_align2 = COPY %0
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_whole_reg_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr8, $sgpr9
+
+    ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_whole_reg_snop
+    ; CHECK: liveins: $sgpr8, $sgpr9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:sreg_64 = COPY $sgpr8_sgpr9
+    %2:areg_64_align2 = COPY %0
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_whole_reg_snop
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_whole_reg_snop
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: SI_RETURN
+    %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+    %2:areg_96_align2 = COPY %0
+    S_NOP 0, implicit %2
+    SI_RETURN
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 6e24d9a..f9db082 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -9,8 +9,9 @@ body:             |
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
-    ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[SUBREG_TO_REG]].lo16, 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16
+    ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
     ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
     %1:sreg_32 = IMPLICIT_DEF
@@ -28,8 +29,9 @@ body:             |
     ; GCN-LABEL: name: cvt_hi_f32_f16
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[SUBREG_TO_REG]]
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]]
     ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY]].hi16, 0, 0, 0, implicit $mode, implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
     %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
@@ -44,8 +46,9 @@ body:             |
     ; GCN-LABEL: name: s_or_b32
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
-    ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[SUBREG_TO_REG]], [[SUBREG_TO_REG]], implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+    ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[REG_SEQUENCE]], [[REG_SEQUENCE]], implicit $exec
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[V_OR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
     %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/frexp-constant-fold.ll b/llvm/test/CodeGen/AMDGPU/frexp-constant-fold.ll
index daa304e..2e75b90 100644
--- a/llvm/test/CodeGen/AMDGPU/frexp-constant-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/frexp-constant-fold.ll
@@ -10,9 +10,9 @@ define { float, i32 } @frexp_frexp(float %x) {
 ; CHECK-LABEL: frexp_frexp:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
-; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; CHECK-NEXT:    v_frexp_mant_f32_e32 v0, v1
+; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float %x)
   %frexp0.0 = extractvalue { float, i32 } %frexp0, 0
@@ -24,12 +24,12 @@ define { <2 x float>, <2 x i32> } @frexp_frexp_vector(<2 x float> %x) {
 ; CHECK-LABEL: frexp_frexp_vector:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_frexp_mant_f32_e32 v4, v0
-; CHECK-NEXT:    v_frexp_mant_f32_e32 v5, v1
-; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
-; CHECK-NEXT:    v_mov_b32_e32 v0, v4
-; CHECK-NEXT:    v_mov_b32_e32 v1, v5
+; CHECK-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; CHECK-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; CHECK-NEXT:    v_frexp_mant_f32_e32 v0, v2
+; CHECK-NEXT:    v_frexp_mant_f32_e32 v1, v3
+; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
+; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v3, v3
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %frexp0 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %x)
   %frexp0.0 = extractvalue { <2 x float>, <2 x i32> } %frexp0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index c7f9ec8..2d4c881 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -34,19 +34,11 @@ define i8 @test_vector_reduce_smax_v2i8(<2 x i8> %v) {
 ; GFX7-GISEL-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: test_vector_reduce_smax_v2i8:
-; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-GISEL-LABEL: test_vector_reduce_smax_v2i8:
-; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: test_vector_reduce_smax_v2i8:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_vector_reduce_smax_v2i8:
 ; GFX9:       ; %bb.0: ; %entry
@@ -173,11 +165,8 @@ define i8 @test_vector_reduce_smax_v3i8(<3 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smax_v3i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v2
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, v0, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_vector_reduce_smax_v3i8:
@@ -350,23 +339,20 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smax_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    s_sext_i32_i8 s4, s4
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, 0, v1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, s4, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
@@ -675,30 +661,23 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smax_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v5
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v7) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v2
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v1, v3
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, 0, v1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, s4, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
@@ -1135,46 +1114,31 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smax_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v8
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v9
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v10
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v11
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v12
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v4, sext(v4), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v13
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v5, sext(v5), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v6, 8, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v14
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v6, sext(v6), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v7, 8, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v15
-; GFX8-GISEL-NEXT:    v_max_i16_sdwa v7, sext(v7), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v1, sext(v1), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v2, sext(v2), sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v3, sext(v3), sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v4, sext(v4), sext(v12) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v5, sext(v5), sext(v13) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v6, sext(v6), sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_max_i16_sdwa v7, sext(v7), sext(v15) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v4
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v1, v5
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, v2, v6
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v2
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, v1, v3
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_max_i16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, 0, v1
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v1, s4, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_max_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
index f7ad431..d9d9a6b 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
@@ -34,19 +34,11 @@ define i8 @test_vector_reduce_smin_v2i8(<2 x i8> %v) {
 ; GFX7-GISEL-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: test_vector_reduce_smin_v2i8:
-; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-GISEL-LABEL: test_vector_reduce_smin_v2i8:
-; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: test_vector_reduce_smin_v2i8:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_vector_reduce_smin_v2i8:
 ; GFX9:       ; %bb.0: ; %entry
@@ -173,11 +165,8 @@ define i8 @test_vector_reduce_smin_v3i8(<3 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smin_v3i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v2
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, v0, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_vector_reduce_smin_v3i8:
@@ -350,23 +339,20 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smin_v4i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    s_sext_i32_i8 s4, s4
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, 0, v1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, s4, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
@@ -675,30 +661,23 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smin_v8i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v5
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v7) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v2
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v1, v3
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, 0, v1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, s4, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
@@ -1135,46 +1114,31 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX8-GISEL-LABEL: test_vector_reduce_smin_v16i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v8
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v9
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v10
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v11
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v12
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v4, sext(v4), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v13
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v5, sext(v5), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v6, 8, v6
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v14
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v6, sext(v6), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v7, 8, v7
-; GFX8-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 8, v15
-; GFX8-GISEL-NEXT:    v_min_i16_sdwa v7, sext(v7), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v0, sext(v0), sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v1, sext(v1), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v2, sext(v2), sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v3, sext(v3), sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v4, sext(v4), sext(v12) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v5, sext(v5), sext(v13) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v6, sext(v6), sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-GISEL-NEXT:    v_min_i16_sdwa v7, sext(v7), sext(v15) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v4
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v1, v5
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, v2, v6
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v2
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, v1, v3
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_min_i16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, 0, v1
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v1, s4, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, 0, v2
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v2, s4, v2
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, 0, v3
+; GFX8-GISEL-NEXT:    v_min_i16_e32 v3, s4, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
diff --git a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
index fde6cb7..ff9c748 100644
--- a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
+++ b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes
 ; RUN: opt -vector-library=SVML -replace-with-veclib -S < %s | FileCheck %s  --check-prefixes=COMMON,SVML
 ; RUN: opt -vector-library=AMDLIBM -replace-with-veclib -S < %s | FileCheck %s  --check-prefixes=COMMON,AMDLIBM
-; RUN: opt -vector-library=LIBMVEC-X86 -replace-with-veclib -S < %s | FileCheck %s  --check-prefixes=COMMON,LIBMVEC-X86
+; RUN: opt -vector-library=LIBMVEC -replace-with-veclib -S < %s | FileCheck %s  --check-prefixes=COMMON,LIBMVEC-X86
 ; RUN: opt -vector-library=MASSV -replace-with-veclib -S < %s | FileCheck %s  --check-prefixes=COMMON,MASSV
 ; RUN: opt -vector-library=Accelerate -replace-with-veclib -S < %s | FileCheck %s  --check-prefixes=COMMON,ACCELERATE
 
diff --git a/llvm/test/CodeGen/Hexagon/isel/pfalse-v4i1.ll b/llvm/test/CodeGen/Hexagon/isel/pfalse-v4i1.ll
index c0904b8..2c26bb1 100644
--- a/llvm/test/CodeGen/Hexagon/isel/pfalse-v4i1.ll
+++ b/llvm/test/CodeGen/Hexagon/isel/pfalse-v4i1.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=hexagon -debug-only=isel 2>&1 < %s - | FileCheck %s
+; REQUIRES: asserts
 
 ; CHECK: [[R0:%[0-9]+]]:intregs = A2_tfrsi 0
 ; CHECK-NEXT: predregs = C2_tfrrp killed [[R0]]:intregs
diff --git a/llvm/test/CodeGen/LoongArch/addrspacecast.ll b/llvm/test/CodeGen/LoongArch/addrspacecast.ll
index b177e8f..d41c90b 100644
--- a/llvm/test/CodeGen/LoongArch/addrspacecast.ll
+++ b/llvm/test/CodeGen/LoongArch/addrspacecast.ll
@@ -24,7 +24,7 @@ define void @cast1(ptr %ptr) {
 ; LA32-NEXT:    .cfi_def_cfa_offset 16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
-; LA32-NEXT:    bl %plt(foo)
+; LA32-NEXT:    bl foo
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/alloca.ll b/llvm/test/CodeGen/LoongArch/alloca.ll
index effd7da..8a3b2ae 100644
--- a/llvm/test/CodeGen/LoongArch/alloca.ll
+++ b/llvm/test/CodeGen/LoongArch/alloca.ll
@@ -20,7 +20,7 @@ define void @simple_alloca(i32 %n) nounwind {
 ; LA32-NEXT:    bstrins.w $a0, $zero, 3, 0
 ; LA32-NEXT:    sub.w $a0, $sp, $a0
 ; LA32-NEXT:    move $sp, $a0
-; LA32-NEXT:    bl %plt(notdead)
+; LA32-NEXT:    bl notdead
 ; LA32-NEXT:    addi.w $sp, $fp, -16
 ; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
@@ -67,7 +67,7 @@ define void @scoped_alloca(i32 %n) nounwind {
 ; LA32-NEXT:    bstrins.w $a0, $zero, 3, 0
 ; LA32-NEXT:    sub.w $a0, $sp, $a0
 ; LA32-NEXT:    move $sp, $a0
-; LA32-NEXT:    bl %plt(notdead)
+; LA32-NEXT:    bl notdead
 ; LA32-NEXT:    move $sp, $s0
 ; LA32-NEXT:    addi.w $sp, $fp, -16
 ; LA32-NEXT:    ld.w $s0, $sp, 4 # 4-byte Folded Reload
@@ -137,7 +137,7 @@ define void @alloca_callframe(i32 %n) nounwind {
 ; LA32-NEXT:    ori $a6, $zero, 7
 ; LA32-NEXT:    ori $a7, $zero, 8
 ; LA32-NEXT:    st.w $t0, $sp, 0
-; LA32-NEXT:    bl %plt(func)
+; LA32-NEXT:    bl func
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    addi.w $sp, $fp, -16
 ; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/LoongArch/bnez-beqz.ll b/llvm/test/CodeGen/LoongArch/bnez-beqz.ll
index 93bbcbb..3b1daba 100644
--- a/llvm/test/CodeGen/LoongArch/bnez-beqz.ll
+++ b/llvm/test/CodeGen/LoongArch/bnez-beqz.ll
@@ -11,7 +11,7 @@ define void @bnez_i32(i32 signext %0) nounwind {
 ; LA32-NEXT:  # %bb.1: # %f
 ; LA32-NEXT:    ret
 ; LA32-NEXT:  .LBB0_2: # %t
-; LA32-NEXT:    b %plt(bar)
+; LA32-NEXT:    b bar
 ;
 ; LA64-LABEL: bnez_i32:
 ; LA64:       # %bb.0: # %start
@@ -38,7 +38,7 @@ define void @beqz_i32(i32 signext %0) nounwind {
 ; LA32:       # %bb.0: # %start
 ; LA32-NEXT:    beqz $a0, .LBB1_2
 ; LA32-NEXT:  # %bb.1: # %t
-; LA32-NEXT:    b %plt(bar)
+; LA32-NEXT:    b bar
 ; LA32-NEXT:  .LBB1_2: # %f
 ; LA32-NEXT:    ret
 ;
@@ -70,7 +70,7 @@ define void @bnez_i64(i64 %0) nounwind {
 ; LA32-NEXT:  # %bb.1: # %f
 ; LA32-NEXT:    ret
 ; LA32-NEXT:  .LBB2_2: # %t
-; LA32-NEXT:    b %plt(bar)
+; LA32-NEXT:    b bar
 ;
 ; LA64-LABEL: bnez_i64:
 ; LA64:       # %bb.0: # %start
@@ -98,7 +98,7 @@ define void @beqz_i64(i64 %0) nounwind {
 ; LA32-NEXT:    or $a0, $a0, $a1
 ; LA32-NEXT:    beqz $a0, .LBB3_2
 ; LA32-NEXT:  # %bb.1: # %t
-; LA32-NEXT:    b %plt(bar)
+; LA32-NEXT:    b bar
 ; LA32-NEXT:  .LBB3_2: # %f
 ; LA32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll
index c012068..f445965 100644
--- a/llvm/test/CodeGen/LoongArch/code-models.ll
+++ b/llvm/test/CodeGen/LoongArch/code-models.ll
@@ -14,7 +14,7 @@ define i32 @call_globaladdress(i32 %a) nounwind {
 ; SMALL:       # %bb.0:
 ; SMALL-NEXT:    addi.d $sp, $sp, -16
 ; SMALL-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; SMALL-NEXT:    bl %plt(callee)
+; SMALL-NEXT:    bl callee
 ; SMALL-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; SMALL-NEXT:    addi.d $sp, $sp, 16
 ; SMALL-NEXT:    ret
@@ -55,7 +55,7 @@ define void @call_external_sym(ptr %dst) {
 ; SMALL-NEXT:    .cfi_offset 1, -8
 ; SMALL-NEXT:    ori $a2, $zero, 1000
 ; SMALL-NEXT:    move $a1, $zero
-; SMALL-NEXT:    bl %plt(memset)
+; SMALL-NEXT:    bl memset
 ; SMALL-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; SMALL-NEXT:    addi.d $sp, $sp, 16
 ; SMALL-NEXT:    ret
@@ -101,7 +101,7 @@ declare i32 @callee_tail(i32 %i)
 define i32 @caller_tail(i32 %i) nounwind {
 ; SMALL-LABEL: caller_tail:
 ; SMALL:       # %bb.0: # %entry
-; SMALL-NEXT:    b %plt(callee_tail)
+; SMALL-NEXT:    b callee_tail
 ;
 ; MEDIUM-LABEL: caller_tail:
 ; MEDIUM:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/LoongArch/double-br-fcmp.ll b/llvm/test/CodeGen/LoongArch/double-br-fcmp.ll
index 6a5b856..cb89bcd 100644
--- a/llvm/test/CodeGen/LoongArch/double-br-fcmp.ll
+++ b/llvm/test/CodeGen/LoongArch/double-br-fcmp.ll
@@ -14,7 +14,7 @@ define void @br_fcmp_oeq_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB0_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_oeq_bcnez:
 ; LA64:       # %bb.0:
@@ -46,7 +46,7 @@ define void @br_fcmp_oeq_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB1_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_oeq_bceqz:
 ; LA64:       # %bb.0:
@@ -78,7 +78,7 @@ define void @br_fcmp_ogt_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB2_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ogt_bcnez:
 ; LA64:       # %bb.0:
@@ -110,7 +110,7 @@ define void @br_fcmp_ogt_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB3_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ogt_bceqz:
 ; LA64:       # %bb.0:
@@ -142,7 +142,7 @@ define void @br_fcmp_oge_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB4_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_oge_bcnez:
 ; LA64:       # %bb.0:
@@ -174,7 +174,7 @@ define void @br_fcmp_oge_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB5_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_oge_bceqz:
 ; LA64:       # %bb.0:
@@ -206,7 +206,7 @@ define void @br_fcmp_olt_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB6_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_olt_bcnez:
 ; LA64:       # %bb.0:
@@ -238,7 +238,7 @@ define void @br_fcmp_olt_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB7_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_olt_bceqz:
 ; LA64:       # %bb.0:
@@ -270,7 +270,7 @@ define void @br_fcmp_ole_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB8_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ole_bcnez:
 ; LA64:       # %bb.0:
@@ -302,7 +302,7 @@ define void @br_fcmp_ole_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB9_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ole_bceqz:
 ; LA64:       # %bb.0:
@@ -334,7 +334,7 @@ define void @br_fcmp_one_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB10_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_one_bcnez:
 ; LA64:       # %bb.0:
@@ -366,7 +366,7 @@ define void @br_fcmp_one_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB11_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_one_bceqz:
 ; LA64:       # %bb.0:
@@ -398,7 +398,7 @@ define void @br_fcmp_ord_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB12_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ord_bcnez:
 ; LA64:       # %bb.0:
@@ -430,7 +430,7 @@ define void @br_fcmp_ord_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB13_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ord_bceqz:
 ; LA64:       # %bb.0:
@@ -462,7 +462,7 @@ define void @br_fcmp_ueq_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB14_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ueq_bcnez:
 ; LA64:       # %bb.0:
@@ -494,7 +494,7 @@ define void @br_fcmp_ueq_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB15_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ueq_bceqz:
 ; LA64:       # %bb.0:
@@ -526,7 +526,7 @@ define void @br_fcmp_ugt_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB16_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ugt_bcnez:
 ; LA64:       # %bb.0:
@@ -558,7 +558,7 @@ define void @br_fcmp_ugt_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB17_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ugt_bceqz:
 ; LA64:       # %bb.0:
@@ -590,7 +590,7 @@ define void @br_fcmp_uge_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB18_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_uge_bcnez:
 ; LA64:       # %bb.0:
@@ -622,7 +622,7 @@ define void @br_fcmp_uge_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB19_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_uge_bceqz:
 ; LA64:       # %bb.0:
@@ -654,7 +654,7 @@ define void @br_fcmp_ult_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB20_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ult_bcnez:
 ; LA64:       # %bb.0:
@@ -686,7 +686,7 @@ define void @br_fcmp_ult_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB21_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ult_bceqz:
 ; LA64:       # %bb.0:
@@ -718,7 +718,7 @@ define void @br_fcmp_ule_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB22_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ule_bcnez:
 ; LA64:       # %bb.0:
@@ -750,7 +750,7 @@ define void @br_fcmp_ule_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB23_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ule_bceqz:
 ; LA64:       # %bb.0:
@@ -782,7 +782,7 @@ define void @br_fcmp_une_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB24_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_une_bcnez:
 ; LA64:       # %bb.0:
@@ -814,7 +814,7 @@ define void @br_fcmp_une_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB25_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_une_bceqz:
 ; LA64:       # %bb.0:
@@ -846,7 +846,7 @@ define void @br_fcmp_uno_bcnez(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB26_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_uno_bcnez:
 ; LA64:       # %bb.0:
@@ -878,7 +878,7 @@ define void @br_fcmp_uno_bceqz(double %a, double %b) nounwind {
 ; LA32-NEXT:  .LBB27_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_uno_bceqz:
 ; LA64:       # %bb.0:
diff --git a/llvm/test/CodeGen/LoongArch/eh-dwarf-cfa.ll b/llvm/test/CodeGen/LoongArch/eh-dwarf-cfa.ll
index f23c536..224755e 100644
--- a/llvm/test/CodeGen/LoongArch/eh-dwarf-cfa.ll
+++ b/llvm/test/CodeGen/LoongArch/eh-dwarf-cfa.ll
@@ -10,7 +10,7 @@ define void @dwarf() {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    addi.w $a0, $sp, 16
-; LA32-NEXT:    bl %plt(foo)
+; LA32-NEXT:    bl foo
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/exception-pointer-register.ll b/llvm/test/CodeGen/LoongArch/exception-pointer-register.ll
index 91fa34a..11cd573 100644
--- a/llvm/test/CodeGen/LoongArch/exception-pointer-register.ll
+++ b/llvm/test/CodeGen/LoongArch/exception-pointer-register.ll
@@ -28,13 +28,13 @@ define void @caller(ptr %p) personality ptr @__gxx_personality_v0 {
 ; LA32-NEXT:  # %bb.1: # %bb2
 ; LA32-NEXT:  .Ltmp0:
 ; LA32-NEXT:    move $a0, $fp
-; LA32-NEXT:    bl %plt(bar)
+; LA32-NEXT:    bl bar
 ; LA32-NEXT:  .Ltmp1:
 ; LA32-NEXT:    b .LBB0_3
 ; LA32-NEXT:  .LBB0_2: # %bb1
 ; LA32-NEXT:  .Ltmp2:
 ; LA32-NEXT:    move $a0, $fp
-; LA32-NEXT:    bl %plt(foo)
+; LA32-NEXT:    bl foo
 ; LA32-NEXT:  .Ltmp3:
 ; LA32-NEXT:  .LBB0_3: # %end2
 ; LA32-NEXT:    ld.w $s0, $sp, 4 # 4-byte Folded Reload
@@ -48,7 +48,7 @@ define void @caller(ptr %p) personality ptr @__gxx_personality_v0 {
 ; LA32-NEXT:    move $a0, $fp
 ; LA32-NEXT:    bl callee
 ; LA32-NEXT:    move $a0, $s0
-; LA32-NEXT:    bl %plt(_Unwind_Resume)
+; LA32-NEXT:    bl _Unwind_Resume
 ;
 ; LA64-LABEL: caller:
 ; LA64:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll
index 50f2d21..63c26bd 100644
--- a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll
+++ b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll
@@ -9,15 +9,15 @@
 define float @fdiv_s(float %x, float %y) {
 ; LA32F-LABEL: fdiv_s:
 ; LA32F:       # %bb.0:
-; LA32F-NEXT:    fdiv.s	$fa0, $fa0, $fa1
+; LA32F-NEXT:    fdiv.s $fa0, $fa0, $fa1
 ; LA32F-NEXT:    ret
 ;
 ; LA32F-FRECIPE-LABEL: fdiv_s:
 ; LA32F-FRECIPE:       # %bb.0:
-; LA32F-FRECIPE-NEXT:    frecipe.s	$fa2, $fa1
-; LA32F-FRECIPE-NEXT:    fmul.s	$fa3, $fa0, $fa2
-; LA32F-FRECIPE-NEXT:    fnmsub.s	$fa0, $fa1, $fa3, $fa0
-; LA32F-FRECIPE-NEXT:    fmadd.s	$fa0, $fa2, $fa0, $fa3
+; LA32F-FRECIPE-NEXT:    frecipe.s $fa2, $fa1
+; LA32F-FRECIPE-NEXT:    fmul.s $fa3, $fa0, $fa2
+; LA32F-FRECIPE-NEXT:    fnmsub.s $fa0, $fa1, $fa3, $fa0
+; LA32F-FRECIPE-NEXT:    fmadd.s $fa0, $fa2, $fa0, $fa3
 ; LA32F-FRECIPE-NEXT:    ret
 ;
 ; LA64D-LABEL: fdiv_s:
@@ -27,10 +27,10 @@ define float @fdiv_s(float %x, float %y) {
 ;
 ; LA64D-FRECIPE-LABEL: fdiv_s:
 ; LA64D-FRECIPE:       # %bb.0:
-; LA64D-FRECIPE-NEXT:    frecipe.s	$fa2, $fa1
-; LA64D-FRECIPE-NEXT:    fmul.s	$fa3, $fa0, $fa2
-; LA64D-FRECIPE-NEXT:    fnmsub.s	$fa0, $fa1, $fa3, $fa0
-; LA64D-FRECIPE-NEXT:    fmadd.s	$fa0, $fa2, $fa0, $fa3
+; LA64D-FRECIPE-NEXT:    frecipe.s $fa2, $fa1
+; LA64D-FRECIPE-NEXT:    fmul.s $fa3, $fa0, $fa2
+; LA64D-FRECIPE-NEXT:    fnmsub.s $fa0, $fa1, $fa3, $fa0
+; LA64D-FRECIPE-NEXT:    fmadd.s $fa0, $fa2, $fa0, $fa3
 ; LA64D-FRECIPE-NEXT:    ret
   %div = fdiv fast float %x, %y
   ret float %div
@@ -39,24 +39,24 @@ define float @fdiv_s(float %x, float %y) {
 define double @fdiv_d(double %x, double %y) {
 ; LA32F-LABEL: fdiv_d:
 ; LA32F:       # %bb.0:
-; LA32F-NEXT:    addi.w	$sp, $sp, -16
+; LA32F-NEXT:    addi.w $sp, $sp, -16
 ; LA32F-NEXT:    .cfi_def_cfa_offset 16
-; LA32F-NEXT:    st.w	$ra, $sp, 12                    # 4-byte Folded Spill
+; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    .cfi_offset 1, -4
-; LA32F-NEXT:    bl	%plt(__divdf3)
-; LA32F-NEXT:    ld.w	$ra, $sp, 12                    # 4-byte Folded Reload
-; LA32F-NEXT:    addi.w	$sp, $sp, 16
+; LA32F-NEXT:    bl __divdf3
+; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
 ;
 ; LA32F-FRECIPE-LABEL: fdiv_d:
 ; LA32F-FRECIPE:       # %bb.0:
-; LA32F-FRECIPE-NEXT:    addi.w	$sp, $sp, -16
+; LA32F-FRECIPE-NEXT:    addi.w $sp, $sp, -16
 ; LA32F-FRECIPE-NEXT:    .cfi_def_cfa_offset 16
-; LA32F-FRECIPE-NEXT:    st.w	$ra, $sp, 12                    # 4-byte Folded Spill
+; LA32F-FRECIPE-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-FRECIPE-NEXT:    .cfi_offset 1, -4
-; LA32F-FRECIPE-NEXT:    bl	%plt(__divdf3)
-; LA32F-FRECIPE-NEXT:    ld.w	$ra, $sp, 12                    # 4-byte Folded Reload
-; LA32F-FRECIPE-NEXT:    addi.w	$sp, $sp, 16
+; LA32F-FRECIPE-NEXT:    bl __divdf3
+; LA32F-FRECIPE-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-FRECIPE-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-FRECIPE-NEXT:    ret
 ;
 ; LA64D-LABEL: fdiv_d:
diff --git a/llvm/test/CodeGen/LoongArch/float-br-fcmp.ll b/llvm/test/CodeGen/LoongArch/float-br-fcmp.ll
index 316cd7c..a761bff 100644
--- a/llvm/test/CodeGen/LoongArch/float-br-fcmp.ll
+++ b/llvm/test/CodeGen/LoongArch/float-br-fcmp.ll
@@ -14,7 +14,7 @@ define void @br_fcmp_oeq_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB0_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_oeq_bcnez_float:
 ; LA64:       # %bb.0:
@@ -46,7 +46,7 @@ define void @br_fcmp_oeq_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB1_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_oeq_bceqz_float:
 ; LA64:       # %bb.0:
@@ -78,7 +78,7 @@ define void @br_fcmp_ogt_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB2_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ogt_bcnez_float:
 ; LA64:       # %bb.0:
@@ -110,7 +110,7 @@ define void @br_fcmp_ogt_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB3_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ogt_bceqz_float:
 ; LA64:       # %bb.0:
@@ -142,7 +142,7 @@ define void @br_fcmp_oge_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB4_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_oge_bcnez_float:
 ; LA64:       # %bb.0:
@@ -174,7 +174,7 @@ define void @br_fcmp_oge_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB5_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_oge_bceqz_float:
 ; LA64:       # %bb.0:
@@ -206,7 +206,7 @@ define void @br_fcmp_olt_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB6_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_olt_bcnez_float:
 ; LA64:       # %bb.0:
@@ -238,7 +238,7 @@ define void @br_fcmp_olt_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB7_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_olt_bceqz_float:
 ; LA64:       # %bb.0:
@@ -270,7 +270,7 @@ define void @br_fcmp_ole_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB8_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ole_bcnez_float:
 ; LA64:       # %bb.0:
@@ -302,7 +302,7 @@ define void @br_fcmp_ole_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB9_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ole_bceqz_float:
 ; LA64:       # %bb.0:
@@ -334,7 +334,7 @@ define void @br_fcmp_one_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB10_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_one_bcnez_float:
 ; LA64:       # %bb.0:
@@ -366,7 +366,7 @@ define void @br_fcmp_one_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB11_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_one_bceqz_float:
 ; LA64:       # %bb.0:
@@ -398,7 +398,7 @@ define void @br_fcmp_ord_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB12_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ord_bcnez_float:
 ; LA64:       # %bb.0:
@@ -430,7 +430,7 @@ define void @br_fcmp_ord_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB13_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ord_bceqz_float:
 ; LA64:       # %bb.0:
@@ -462,7 +462,7 @@ define void @br_fcmp_ueq_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB14_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ueq_bcnez_float:
 ; LA64:       # %bb.0:
@@ -494,7 +494,7 @@ define void @br_fcmp_ueq_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB15_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ueq_bceqz_float:
 ; LA64:       # %bb.0:
@@ -526,7 +526,7 @@ define void @br_fcmp_ugt_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB16_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ugt_bcnez_float:
 ; LA64:       # %bb.0:
@@ -558,7 +558,7 @@ define void @br_fcmp_ugt_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB17_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ugt_bceqz_float:
 ; LA64:       # %bb.0:
@@ -590,7 +590,7 @@ define void @br_fcmp_uge_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB18_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_uge_bcnez_float:
 ; LA64:       # %bb.0:
@@ -622,7 +622,7 @@ define void @br_fcmp_uge_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB19_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_uge_bceqz_float:
 ; LA64:       # %bb.0:
@@ -654,7 +654,7 @@ define void @br_fcmp_ult_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB20_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ult_bcnez_float:
 ; LA64:       # %bb.0:
@@ -686,7 +686,7 @@ define void @br_fcmp_ult_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB21_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ult_bceqz_float:
 ; LA64:       # %bb.0:
@@ -718,7 +718,7 @@ define void @br_fcmp_ule_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB22_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ule_bcnez_float:
 ; LA64:       # %bb.0:
@@ -750,7 +750,7 @@ define void @br_fcmp_ule_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB23_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_ule_bceqz_float:
 ; LA64:       # %bb.0:
@@ -782,7 +782,7 @@ define void @br_fcmp_une_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB24_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_une_bcnez_float:
 ; LA64:       # %bb.0:
@@ -814,7 +814,7 @@ define void @br_fcmp_une_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB25_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_une_bceqz_float:
 ; LA64:       # %bb.0:
@@ -846,7 +846,7 @@ define void @br_fcmp_uno_bcnez_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB26_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_uno_bcnez_float:
 ; LA64:       # %bb.0:
@@ -878,7 +878,7 @@ define void @br_fcmp_uno_bceqz_float(float %a, float %b) nounwind {
 ; LA32-NEXT:  .LBB27_2: # %if.then
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(abort)
+; LA32-NEXT:    bl abort
 ;
 ; LA64-LABEL: br_fcmp_uno_bceqz_float:
 ; LA64:       # %bb.0:
diff --git a/llvm/test/CodeGen/LoongArch/fp-expand.ll b/llvm/test/CodeGen/LoongArch/fp-expand.ll
index 0939094..1eefdf2 100644
--- a/llvm/test/CodeGen/LoongArch/fp-expand.ll
+++ b/llvm/test/CodeGen/LoongArch/fp-expand.ll
@@ -14,7 +14,7 @@ declare double @llvm.pow.f64(double, double)
 define float @sin_f32(float %a) nounwind {
 ; LA32-LABEL: sin_f32:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    b %plt(sinf)
+; LA32-NEXT:    b sinf
 ;
 ; LA64-LABEL: sin_f32:
 ; LA64:       # %bb.0:
@@ -27,7 +27,7 @@ define float @sin_f32(float %a) nounwind {
 define float @cos_f32(float %a) nounwind {
 ; LA32-LABEL: cos_f32:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    b %plt(cosf)
+; LA32-NEXT:    b cosf
 ;
 ; LA64-LABEL: cos_f32:
 ; LA64:       # %bb.0:
@@ -45,10 +45,10 @@ define float @sincos_f32(float %a) nounwind {
 ; LA32-NEXT:    fst.d $fs0, $sp, 16 # 8-byte Folded Spill
 ; LA32-NEXT:    fst.d $fs1, $sp, 8 # 8-byte Folded Spill
 ; LA32-NEXT:    fmov.s $fs0, $fa0
-; LA32-NEXT:    bl %plt(sinf)
+; LA32-NEXT:    bl sinf
 ; LA32-NEXT:    fmov.s $fs1, $fa0
 ; LA32-NEXT:    fmov.s $fa0, $fs0
-; LA32-NEXT:    bl %plt(cosf)
+; LA32-NEXT:    bl cosf
 ; LA32-NEXT:    fadd.s $fa0, $fs1, $fa0
 ; LA32-NEXT:    fld.d $fs1, $sp, 8 # 8-byte Folded Reload
 ; LA32-NEXT:    fld.d $fs0, $sp, 16 # 8-byte Folded Reload
@@ -84,7 +84,7 @@ define float @sincos_f32(float %a) nounwind {
 define float @pow_f32(float %a, float %b) nounwind {
 ; LA32-LABEL: pow_f32:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    b %plt(powf)
+; LA32-NEXT:    b powf
 ;
 ; LA64-LABEL: pow_f32:
 ; LA64:       # %bb.0:
@@ -97,7 +97,7 @@ define float @pow_f32(float %a, float %b) nounwind {
 define float @frem_f32(float %a, float %b) nounwind {
 ; LA32-LABEL: frem_f32:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    b %plt(fmodf)
+; LA32-NEXT:    b fmodf
 ;
 ; LA64-LABEL: frem_f32:
 ; LA64:       # %bb.0:
@@ -110,7 +110,7 @@ define float @frem_f32(float %a, float %b) nounwind {
 define double @sin_f64(double %a) nounwind {
 ; LA32-LABEL: sin_f64:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    b %plt(sin)
+; LA32-NEXT:    b sin
 ;
 ; LA64-LABEL: sin_f64:
 ; LA64:       # %bb.0:
@@ -123,7 +123,7 @@ define double @sin_f64(double %a) nounwind {
 define double @cos_f64(double %a) nounwind {
 ; LA32-LABEL: cos_f64:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    b %plt(cos)
+; LA32-NEXT:    b cos
 ;
 ; LA64-LABEL: cos_f64:
 ; LA64:       # %bb.0:
@@ -141,10 +141,10 @@ define double @sincos_f64(double %a) nounwind {
 ; LA32-NEXT:    fst.d $fs0, $sp, 16 # 8-byte Folded Spill
 ; LA32-NEXT:    fst.d $fs1, $sp, 8 # 8-byte Folded Spill
 ; LA32-NEXT:    fmov.d $fs0, $fa0
-; LA32-NEXT:    bl %plt(sin)
+; LA32-NEXT:    bl sin
 ; LA32-NEXT:    fmov.d $fs1, $fa0
 ; LA32-NEXT:    fmov.d $fa0, $fs0
-; LA32-NEXT:    bl %plt(cos)
+; LA32-NEXT:    bl cos
 ; LA32-NEXT:    fadd.d $fa0, $fs1, $fa0
 ; LA32-NEXT:    fld.d $fs1, $sp, 8 # 8-byte Folded Reload
 ; LA32-NEXT:    fld.d $fs0, $sp, 16 # 8-byte Folded Reload
@@ -180,7 +180,7 @@ define double @sincos_f64(double %a) nounwind {
 define double @pow_f64(double %a, double %b) nounwind {
 ; LA32-LABEL: pow_f64:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    b %plt(pow)
+; LA32-NEXT:    b pow
 ;
 ; LA64-LABEL: pow_f64:
 ; LA64:       # %bb.0:
@@ -193,7 +193,7 @@ define double @pow_f64(double %a, double %b) nounwind {
 define double @frem_f64(double %a, double %b) nounwind {
 ; LA32-LABEL: frem_f64:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    b %plt(fmod)
+; LA32-NEXT:    b fmod
 ;
 ; LA64-LABEL: frem_f64:
 ; LA64:       # %bb.0:
diff --git a/llvm/test/CodeGen/LoongArch/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/fp-max-min.ll
index 1adf427..9bf3e6c 100644
--- a/llvm/test/CodeGen/LoongArch/fp-max-min.ll
+++ b/llvm/test/CodeGen/LoongArch/fp-max-min.ll
@@ -48,7 +48,7 @@ define double @maxnum_double(double %x, double %y) {
 ; LA32F-NEXT:    .cfi_def_cfa_offset 16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    .cfi_offset 1, -4
-; LA32F-NEXT:    bl %plt(fmax)
+; LA32F-NEXT:    bl fmax
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -121,7 +121,7 @@ define double @minnum_double(double %x, double %y) {
 ; LA32F-NEXT:    .cfi_def_cfa_offset 16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    .cfi_offset 1, -4
-; LA32F-NEXT:    bl %plt(fmin)
+; LA32F-NEXT:    bl fmin
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/LoongArch/fp-maximumnum-minimumnum.ll
index 607e50c..8718d61 100644
--- a/llvm/test/CodeGen/LoongArch/fp-maximumnum-minimumnum.ll
+++ b/llvm/test/CodeGen/LoongArch/fp-maximumnum-minimumnum.ll
@@ -109,7 +109,7 @@ define double @maximumnum_double(double %x, double %y) {
 ; LA32F-NEXT:    .cfi_def_cfa_offset 16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    .cfi_offset 1, -4
-; LA32F-NEXT:    bl %plt(fmaximum_num)
+; LA32F-NEXT:    bl fmaximum_num
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -151,7 +151,7 @@ define double @maximumnum_double_nsz(double %x, double %y) {
 ; LA32F-NEXT:    .cfi_def_cfa_offset 16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    .cfi_offset 1, -4
-; LA32F-NEXT:    bl %plt(fmaximum_num)
+; LA32F-NEXT:    bl fmaximum_num
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -193,7 +193,7 @@ define double @maximumnum_double_nnan(double %x, double %y) {
 ; LA32F-NEXT:    .cfi_def_cfa_offset 16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    .cfi_offset 1, -4
-; LA32F-NEXT:    bl %plt(fmaximum_num)
+; LA32F-NEXT:    bl fmaximum_num
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -322,7 +322,7 @@ define double @minimumnum_double(double %x, double %y) {
 ; LA32F-NEXT:    .cfi_def_cfa_offset 16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    .cfi_offset 1, -4
-; LA32F-NEXT:    bl %plt(fminimum_num)
+; LA32F-NEXT:    bl fminimum_num
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -364,7 +364,7 @@ define double @minimumnum_double_nsz(double %x, double %y) {
 ; LA32F-NEXT:    .cfi_def_cfa_offset 16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    .cfi_offset 1, -4
-; LA32F-NEXT:    bl %plt(fminimum_num)
+; LA32F-NEXT:    bl fminimum_num
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -406,7 +406,7 @@ define double @minimumnum_double_nnan(double %x, double %y) {
 ; LA32F-NEXT:    .cfi_def_cfa_offset 16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    .cfi_offset 1, -4
-; LA32F-NEXT:    bl %plt(fminimum_num)
+; LA32F-NEXT:    bl fminimum_num
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/fp-reciprocal.ll b/llvm/test/CodeGen/LoongArch/fp-reciprocal.ll
index 04caf25..11e246e 100644
--- a/llvm/test/CodeGen/LoongArch/fp-reciprocal.ll
+++ b/llvm/test/CodeGen/LoongArch/fp-reciprocal.ll
@@ -38,7 +38,7 @@ define double @f64_reciprocal(double %a) nounwind {
 ; LA32F-NEXT:    move $a2, $a0
 ; LA32F-NEXT:    lu12i.w $a1, 261888
 ; LA32F-NEXT:    move $a0, $zero
-; LA32F-NEXT:    bl %plt(__divdf3)
+; LA32F-NEXT:    bl __divdf3
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/fp-trunc-store.ll b/llvm/test/CodeGen/LoongArch/fp-trunc-store.ll
index 2db3bdb..644f7cc 100644
--- a/llvm/test/CodeGen/LoongArch/fp-trunc-store.ll
+++ b/llvm/test/CodeGen/LoongArch/fp-trunc-store.ll
@@ -13,7 +13,7 @@ define void @fp_trunc(ptr %a, double %b) nounwind {
 ; LA32F-NEXT:    move $fp, $a0
 ; LA32F-NEXT:    move $a0, $a1
 ; LA32F-NEXT:    move $a1, $a2
-; LA32F-NEXT:    bl %plt(__truncdfsf2)
+; LA32F-NEXT:    bl __truncdfsf2
 ; LA32F-NEXT:    fst.s $fa0, $fp, 0
 ; LA32F-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/LoongArch/fp16-promote.ll b/llvm/test/CodeGen/LoongArch/fp16-promote.ll
index 61a3716..6a1610c 100644
--- a/llvm/test/CodeGen/LoongArch/fp16-promote.ll
+++ b/llvm/test/CodeGen/LoongArch/fp16-promote.ll
@@ -23,7 +23,7 @@ define float @test_fpextend_float(ptr %p) nounwind {
 ; LA32-LABEL: test_fpextend_float:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    ld.hu $a0, $a0, 0
-; LA32-NEXT:    b %plt(__extendhfsf2)
+; LA32-NEXT:    b __extendhfsf2
 ;
 ; LA64-LABEL: test_fpextend_float:
 ; LA64:       # %bb.0:
@@ -41,7 +41,7 @@ define double @test_fpextend_double(ptr %p) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ld.hu $a0, $a0, 0
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fcvt.d.s $fa0, $fa0
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
@@ -70,7 +70,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    st.w $fp, $sp, 8 # 4-byte Folded Spill
 ; LA32-NEXT:    move $fp, $a0
-; LA32-NEXT:    bl %plt(__truncsfhf2)
+; LA32-NEXT:    bl __truncsfhf2
 ; LA32-NEXT:    st.h $a0, $fp, 0
 ; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
@@ -102,7 +102,7 @@ define void @test_fptrunc_double(double %d, ptr %p) nounwind {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    st.w $fp, $sp, 8 # 4-byte Folded Spill
 ; LA32-NEXT:    move $fp, $a0
-; LA32-NEXT:    bl %plt(__truncdfhf2)
+; LA32-NEXT:    bl __truncdfhf2
 ; LA32-NEXT:    st.h $a0, $fp, 0
 ; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
@@ -136,12 +136,12 @@ define half @test_fadd_reg(half %a, half %b) nounwind {
 ; LA32-NEXT:    fst.d $fs0, $sp, 0 # 8-byte Folded Spill
 ; LA32-NEXT:    move $fp, $a0
 ; LA32-NEXT:    move $a0, $a1
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fmov.s $fs0, $fa0
 ; LA32-NEXT:    move $a0, $fp
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fadd.s $fa0, $fa0, $fs0
-; LA32-NEXT:    bl %plt(__truncsfhf2)
+; LA32-NEXT:    bl __truncsfhf2
 ; LA32-NEXT:    fld.d $fs0, $sp, 0 # 8-byte Folded Reload
 ; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
@@ -185,12 +185,12 @@ define void @test_fadd_mem(ptr %p, ptr %q) nounwind {
 ; LA32-NEXT:    move $fp, $a0
 ; LA32-NEXT:    ld.hu $s0, $a0, 0
 ; LA32-NEXT:    ld.hu $a0, $a1, 0
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fmov.s $fs0, $fa0
 ; LA32-NEXT:    move $a0, $s0
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fadd.s $fa0, $fa0, $fs0
-; LA32-NEXT:    bl %plt(__truncsfhf2)
+; LA32-NEXT:    bl __truncsfhf2
 ; LA32-NEXT:    st.h $a0, $fp, 0
 ; LA32-NEXT:    fld.d $fs0, $sp, 8 # 8-byte Folded Reload
 ; LA32-NEXT:    ld.w $s0, $sp, 20 # 4-byte Folded Reload
@@ -241,12 +241,12 @@ define half @test_fmul_reg(half %a, half %b) nounwind {
 ; LA32-NEXT:    fst.d $fs0, $sp, 0 # 8-byte Folded Spill
 ; LA32-NEXT:    move $fp, $a0
 ; LA32-NEXT:    move $a0, $a1
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fmov.s $fs0, $fa0
 ; LA32-NEXT:    move $a0, $fp
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fmul.s $fa0, $fa0, $fs0
-; LA32-NEXT:    bl %plt(__truncsfhf2)
+; LA32-NEXT:    bl __truncsfhf2
 ; LA32-NEXT:    fld.d $fs0, $sp, 0 # 8-byte Folded Reload
 ; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
@@ -290,12 +290,12 @@ define void @test_fmul_mem(ptr %p, ptr %q) nounwind {
 ; LA32-NEXT:    move $fp, $a0
 ; LA32-NEXT:    ld.hu $s0, $a0, 0
 ; LA32-NEXT:    ld.hu $a0, $a1, 0
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fmov.s $fs0, $fa0
 ; LA32-NEXT:    move $a0, $s0
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fmul.s $fa0, $fa0, $fs0
-; LA32-NEXT:    bl %plt(__truncsfhf2)
+; LA32-NEXT:    bl __truncsfhf2
 ; LA32-NEXT:    st.h $a0, $fp, 0
 ; LA32-NEXT:    fld.d $fs0, $sp, 8 # 8-byte Folded Reload
 ; LA32-NEXT:    ld.w $s0, $sp, 20 # 4-byte Folded Reload
@@ -343,10 +343,10 @@ define half @freeze_half_undef() nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    movgr2fr.w $fa0, $zero
-; LA32-NEXT:    bl %plt(__truncsfhf2)
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __truncsfhf2
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fadd.s $fa0, $fa0, $fa0
-; LA32-NEXT:    bl %plt(__truncsfhf2)
+; LA32-NEXT:    bl __truncsfhf2
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -376,9 +376,9 @@ define half @freeze_half_poison(half %maybe.poison) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    fadd.s $fa0, $fa0, $fa0
-; LA32-NEXT:    bl %plt(__truncsfhf2)
+; LA32-NEXT:    bl __truncsfhf2
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -405,7 +405,7 @@ define signext i32 @test_half_to_s32(half %a) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    ftintrz.w.s $fa0, $fa0
 ; LA32-NEXT:    movfr2gr.s $a0, $fa0
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
@@ -433,7 +433,7 @@ define zeroext i32 @test_half_to_s32_u32(half %a) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__extendhfsf2)
+; LA32-NEXT:    bl __extendhfsf2
 ; LA32-NEXT:    ftintrz.w.s $fa0, $fa0
 ; LA32-NEXT:    movfr2gr.s $a0, $fa0
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
@@ -462,8 +462,8 @@ define i64 @test_half_to_i64(half %a) nounwind {
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__extendhfsf2)
-; LA32-NEXT:    bl %plt(__fixsfdi)
+; LA32-NEXT:    bl __extendhfsf2
+; LA32-NEXT:    bl __fixsfdi
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/frint.ll b/llvm/test/CodeGen/LoongArch/frint.ll
index 48b1ff8..42c0a57 100644
--- a/llvm/test/CodeGen/LoongArch/frint.ll
+++ b/llvm/test/CodeGen/LoongArch/frint.ll
@@ -7,11 +7,11 @@
 define float @rint_f32(float %f) nounwind {
 ; LA32F-LABEL: rint_f32:
 ; LA32F:       # %bb.0: # %entry
-; LA32F-NEXT:    b %plt(rintf)
+; LA32F-NEXT:    b rintf
 ;
 ; LA32D-LABEL: rint_f32:
 ; LA32D:       # %bb.0: # %entry
-; LA32D-NEXT:    b %plt(rintf)
+; LA32D-NEXT:    b rintf
 ;
 ; LA64F-LABEL: rint_f32:
 ; LA64F:       # %bb.0: # %entry
@@ -34,14 +34,14 @@ define double @rint_f64(double %d) nounwind {
 ; LA32F:       # %bb.0: # %entry
 ; LA32F-NEXT:    addi.w $sp, $sp, -16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32F-NEXT:    bl %plt(rint)
+; LA32F-NEXT:    bl rint
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
 ;
 ; LA32D-LABEL: rint_f64:
 ; LA32D:       # %bb.0: # %entry
-; LA32D-NEXT:    b %plt(rint)
+; LA32D-NEXT:    b rint
 ;
 ; LA64F-LABEL: rint_f64:
 ; LA64F:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll
index 5f14352..e5c848e 100644
--- a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll
+++ b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll
@@ -55,12 +55,12 @@ define double @frsqrt_f64(double %a) nounwind {
 ; LA32F:       # %bb.0:
 ; LA32F-NEXT:    addi.w	$sp, $sp, -16
 ; LA32F-NEXT:    st.w	$ra, $sp, 12 
-; LA32F-NEXT:    bl	%plt(sqrt)
+; LA32F-NEXT:    bl	sqrt
 ; LA32F-NEXT:    move	$a2, $a0
 ; LA32F-NEXT:    move	$a3, $a1
 ; LA32F-NEXT:    lu12i.w	$a1, 261888
 ; LA32F-NEXT:    move	$a0, $zero
-; LA32F-NEXT:    bl	%plt(__divdf3)
+; LA32F-NEXT:    bl	__divdf3
 ; LA32F-NEXT:    ld.w	$ra, $sp, 12  
 ; LA32F-NEXT:    addi.w	$sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -69,12 +69,12 @@ define double @frsqrt_f64(double %a) nounwind {
 ; LA32F-FRECIPE:       # %bb.0:
 ; LA32F-FRECIPE-NEXT:    addi.w	$sp, $sp, -16
 ; LA32F-FRECIPE-NEXT:    st.w	$ra, $sp, 12                    # 4-byte Folded Spill
-; LA32F-FRECIPE-NEXT:    bl	%plt(sqrt)
+; LA32F-FRECIPE-NEXT:    bl	sqrt
 ; LA32F-FRECIPE-NEXT:    move	$a2, $a0
 ; LA32F-FRECIPE-NEXT:    move	$a3, $a1
 ; LA32F-FRECIPE-NEXT:    lu12i.w	$a1, 261888
 ; LA32F-FRECIPE-NEXT:    move	$a0, $zero
-; LA32F-FRECIPE-NEXT:    bl	%plt(__divdf3)
+; LA32F-FRECIPE-NEXT:    bl	__divdf3
 ; LA32F-FRECIPE-NEXT:    ld.w	$ra, $sp, 12                    # 4-byte Folded Reload
 ; LA32F-FRECIPE-NEXT:    addi.w	$sp, $sp, 16
 ; LA32F-FRECIPE-NEXT:    ret
@@ -117,21 +117,21 @@ define double @sqrt_simplify_before_recip_3_uses_f64(double %x, ptr %p1, ptr %p2
 ; LA32F-NEXT:    st.w	$s4, $sp, 4                     # 4-byte Folded Spill
 ; LA32F-NEXT:    move	$fp, $a3
 ; LA32F-NEXT:    move	$s0, $a2
-; LA32F-NEXT:    bl	%plt(sqrt)
+; LA32F-NEXT:    bl	sqrt
 ; LA32F-NEXT:    move	$s1, $a0
 ; LA32F-NEXT:    move	$s2, $a1
 ; LA32F-NEXT:    lu12i.w	$a1, 261888
 ; LA32F-NEXT:    move	$a0, $zero
 ; LA32F-NEXT:    move	$a2, $s1
 ; LA32F-NEXT:    move	$a3, $s2
-; LA32F-NEXT:    bl	%plt(__divdf3)
+; LA32F-NEXT:    bl	__divdf3
 ; LA32F-NEXT:    move	$s3, $a0
 ; LA32F-NEXT:    move	$s4, $a1
 ; LA32F-NEXT:    lu12i.w	$a1, 263248
 ; LA32F-NEXT:    move	$a0, $zero
 ; LA32F-NEXT:    move	$a2, $s1
 ; LA32F-NEXT:    move	$a3, $s2
-; LA32F-NEXT:    bl	%plt(__divdf3)
+; LA32F-NEXT:    bl	__divdf3
 ; LA32F-NEXT:    st.w	$s3, $s0, 0
 ; LA32F-NEXT:    st.w	$s4, $s0, 4
 ; LA32F-NEXT:    st.w	$a0, $fp, 0
@@ -160,21 +160,21 @@ define double @sqrt_simplify_before_recip_3_uses_f64(double %x, ptr %p1, ptr %p2
 ; LA32F-FRECIPE-NEXT:    st.w	$s4, $sp, 4                     # 4-byte Folded Spill
 ; LA32F-FRECIPE-NEXT:    move	$fp, $a3
 ; LA32F-FRECIPE-NEXT:    move	$s0, $a2
-; LA32F-FRECIPE-NEXT:    bl	%plt(sqrt)
+; LA32F-FRECIPE-NEXT:    bl	sqrt
 ; LA32F-FRECIPE-NEXT:    move	$s1, $a0
 ; LA32F-FRECIPE-NEXT:    move	$s2, $a1
 ; LA32F-FRECIPE-NEXT:    lu12i.w	$a1, 261888
 ; LA32F-FRECIPE-NEXT:    move	$a0, $zero
 ; LA32F-FRECIPE-NEXT:    move	$a2, $s1
 ; LA32F-FRECIPE-NEXT:    move	$a3, $s2
-; LA32F-FRECIPE-NEXT:    bl	%plt(__divdf3)
+; LA32F-FRECIPE-NEXT:    bl	__divdf3
 ; LA32F-FRECIPE-NEXT:    move	$s3, $a0
 ; LA32F-FRECIPE-NEXT:    move	$s4, $a1
 ; LA32F-FRECIPE-NEXT:    lu12i.w	$a1, 263248
 ; LA32F-FRECIPE-NEXT:    move	$a0, $zero
 ; LA32F-FRECIPE-NEXT:    move	$a2, $s1
 ; LA32F-FRECIPE-NEXT:    move	$a3, $s2
-; LA32F-FRECIPE-NEXT:    bl	%plt(__divdf3)
+; LA32F-FRECIPE-NEXT:    bl	__divdf3
 ; LA32F-FRECIPE-NEXT:    st.w	$s3, $s0, 0
 ; LA32F-FRECIPE-NEXT:    st.w	$s4, $s0, 4
 ; LA32F-FRECIPE-NEXT:    st.w	$a0, $fp, 0
@@ -247,21 +247,21 @@ define double @sqrt_simplify_before_recip_3_uses_order_f64(double %x, ptr %p1, p
 ; LA32F-NEXT:    st.w	$s4, $sp, 4                     # 4-byte Folded Spill
 ; LA32F-NEXT:    move	$fp, $a3
 ; LA32F-NEXT:    move	$s0, $a2
-; LA32F-NEXT:    bl	%plt(sqrt)
+; LA32F-NEXT:    bl	sqrt
 ; LA32F-NEXT:    move	$s1, $a0
 ; LA32F-NEXT:    move	$s2, $a1
 ; LA32F-NEXT:    lu12i.w	$a1, 263248
 ; LA32F-NEXT:    move	$a0, $zero
 ; LA32F-NEXT:    move	$a2, $s1
 ; LA32F-NEXT:    move	$a3, $s2
-; LA32F-NEXT:    bl	%plt(__divdf3)
+; LA32F-NEXT:    bl	__divdf3
 ; LA32F-NEXT:    move	$s3, $a0
 ; LA32F-NEXT:    move	$s4, $a1
 ; LA32F-NEXT:    lu12i.w	$a1, 263256
 ; LA32F-NEXT:    move	$a0, $zero
 ; LA32F-NEXT:    move	$a2, $s1
 ; LA32F-NEXT:    move	$a3, $s2
-; LA32F-NEXT:    bl	%plt(__divdf3)
+; LA32F-NEXT:    bl	__divdf3
 ; LA32F-NEXT:    st.w	$s3, $s0, 0
 ; LA32F-NEXT:    st.w	$s4, $s0, 4
 ; LA32F-NEXT:    st.w	$a0, $fp, 0
@@ -290,21 +290,21 @@ define double @sqrt_simplify_before_recip_3_uses_order_f64(double %x, ptr %p1, p
 ; LA32F-FRECIPE-NEXT:    st.w	$s4, $sp, 4                     # 4-byte Folded Spill
 ; LA32F-FRECIPE-NEXT:    move	$fp, $a3
 ; LA32F-FRECIPE-NEXT:    move	$s0, $a2
-; LA32F-FRECIPE-NEXT:    bl	%plt(sqrt)
+; LA32F-FRECIPE-NEXT:    bl	sqrt
 ; LA32F-FRECIPE-NEXT:    move	$s1, $a0
 ; LA32F-FRECIPE-NEXT:    move	$s2, $a1
 ; LA32F-FRECIPE-NEXT:    lu12i.w	$a1, 263248
 ; LA32F-FRECIPE-NEXT:    move	$a0, $zero
 ; LA32F-FRECIPE-NEXT:    move	$a2, $s1
 ; LA32F-FRECIPE-NEXT:    move	$a3, $s2
-; LA32F-FRECIPE-NEXT:    bl	%plt(__divdf3)
+; LA32F-FRECIPE-NEXT:    bl	__divdf3
 ; LA32F-FRECIPE-NEXT:    move	$s3, $a0
 ; LA32F-FRECIPE-NEXT:    move	$s4, $a1
 ; LA32F-FRECIPE-NEXT:    lu12i.w	$a1, 263256
 ; LA32F-FRECIPE-NEXT:    move	$a0, $zero
 ; LA32F-FRECIPE-NEXT:    move	$a2, $s1
 ; LA32F-FRECIPE-NEXT:    move	$a3, $s2
-; LA32F-FRECIPE-NEXT:    bl	%plt(__divdf3)
+; LA32F-FRECIPE-NEXT:    bl	__divdf3
 ; LA32F-FRECIPE-NEXT:    st.w	$s3, $s0, 0
 ; LA32F-FRECIPE-NEXT:    st.w	$s4, $s0, 4
 ; LA32F-FRECIPE-NEXT:    st.w	$a0, $fp, 0
@@ -384,28 +384,28 @@ define double @sqrt_simplify_before_recip_4_uses_f64(double %x, ptr %p1, ptr %p2
 ; LA32F-NEXT:    move	$fp, $a4
 ; LA32F-NEXT:    move	$s0, $a3
 ; LA32F-NEXT:    move	$s1, $a2
-; LA32F-NEXT:    bl	%plt(sqrt)
+; LA32F-NEXT:    bl	sqrt
 ; LA32F-NEXT:    move	$s2, $a0
 ; LA32F-NEXT:    move	$s3, $a1
 ; LA32F-NEXT:    lu12i.w	$a1, 261888
 ; LA32F-NEXT:    move	$a0, $zero
 ; LA32F-NEXT:    move	$a2, $s2
 ; LA32F-NEXT:    move	$a3, $s3
-; LA32F-NEXT:    bl	%plt(__divdf3)
+; LA32F-NEXT:    bl	__divdf3
 ; LA32F-NEXT:    move	$s4, $a0
 ; LA32F-NEXT:    move	$s5, $a1
 ; LA32F-NEXT:    lu12i.w	$a1, 263248
 ; LA32F-NEXT:    move	$a0, $zero
 ; LA32F-NEXT:    move	$a2, $s2
 ; LA32F-NEXT:    move	$a3, $s3
-; LA32F-NEXT:    bl	%plt(__divdf3)
+; LA32F-NEXT:    bl	__divdf3
 ; LA32F-NEXT:    move	$s6, $a0
 ; LA32F-NEXT:    move	$s7, $a1
 ; LA32F-NEXT:    lu12i.w	$a1, 263256
 ; LA32F-NEXT:    move	$a0, $zero
 ; LA32F-NEXT:    move	$a2, $s2
 ; LA32F-NEXT:    move	$a3, $s3
-; LA32F-NEXT:    bl	%plt(__divdf3)
+; LA32F-NEXT:    bl	__divdf3
 ; LA32F-NEXT:    st.w	$s4, $s1, 0
 ; LA32F-NEXT:    st.w	$s5, $s1, 4
 ; LA32F-NEXT:    st.w	$s6, $s0, 0
@@ -443,28 +443,28 @@ define double @sqrt_simplify_before_recip_4_uses_f64(double %x, ptr %p1, ptr %p2
 ; LA32F-FRECIPE-NEXT:    move	$fp, $a4
 ; LA32F-FRECIPE-NEXT:    move	$s0, $a3
 ; LA32F-FRECIPE-NEXT:    move	$s1, $a2
-; LA32F-FRECIPE-NEXT:    bl	%plt(sqrt)
+; LA32F-FRECIPE-NEXT:    bl	sqrt
 ; LA32F-FRECIPE-NEXT:    move	$s2, $a0
 ; LA32F-FRECIPE-NEXT:    move	$s3, $a1
 ; LA32F-FRECIPE-NEXT:    lu12i.w	$a1, 261888
 ; LA32F-FRECIPE-NEXT:    move	$a0, $zero
 ; LA32F-FRECIPE-NEXT:    move	$a2, $s2
 ; LA32F-FRECIPE-NEXT:    move	$a3, $s3
-; LA32F-FRECIPE-NEXT:    bl	%plt(__divdf3)
+; LA32F-FRECIPE-NEXT:    bl	__divdf3
 ; LA32F-FRECIPE-NEXT:    move	$s4, $a0
 ; LA32F-FRECIPE-NEXT:    move	$s5, $a1
 ; LA32F-FRECIPE-NEXT:    lu12i.w	$a1, 263248
 ; LA32F-FRECIPE-NEXT:    move	$a0, $zero
 ; LA32F-FRECIPE-NEXT:    move	$a2, $s2
 ; LA32F-FRECIPE-NEXT:    move	$a3, $s3
-; LA32F-FRECIPE-NEXT:    bl	%plt(__divdf3)
+; LA32F-FRECIPE-NEXT:    bl	__divdf3
 ; LA32F-FRECIPE-NEXT:    move	$s6, $a0
 ; LA32F-FRECIPE-NEXT:    move	$s7, $a1
 ; LA32F-FRECIPE-NEXT:    lu12i.w	$a1, 263256
 ; LA32F-FRECIPE-NEXT:    move	$a0, $zero
 ; LA32F-FRECIPE-NEXT:    move	$a2, $s2
 ; LA32F-FRECIPE-NEXT:    move	$a3, $s3
-; LA32F-FRECIPE-NEXT:    bl	%plt(__divdf3)
+; LA32F-FRECIPE-NEXT:    bl	__divdf3
 ; LA32F-FRECIPE-NEXT:    st.w	$s4, $s1, 0
 ; LA32F-FRECIPE-NEXT:    st.w	$s5, $s1, 4
 ; LA32F-FRECIPE-NEXT:    st.w	$s6, $s0, 0
diff --git a/llvm/test/CodeGen/LoongArch/fsqrt.ll b/llvm/test/CodeGen/LoongArch/fsqrt.ll
index c5f02ba..e0cb4d3 100644
--- a/llvm/test/CodeGen/LoongArch/fsqrt.ll
+++ b/llvm/test/CodeGen/LoongArch/fsqrt.ll
@@ -36,7 +36,7 @@ define double @fsqrt_f64(double %a) nounwind {
 ; LA32F:       # %bb.0:
 ; LA32F-NEXT:    addi.w $sp, $sp, -16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32F-NEXT:    bl %plt(sqrt)
+; LA32F-NEXT:    bl sqrt
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -94,12 +94,12 @@ define double @frsqrt_f64(double %a) nounwind {
 ; LA32F:       # %bb.0:
 ; LA32F-NEXT:    addi.w $sp, $sp, -16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32F-NEXT:    bl %plt(sqrt)
+; LA32F-NEXT:    bl sqrt
 ; LA32F-NEXT:    move $a2, $a0
 ; LA32F-NEXT:    move $a3, $a1
 ; LA32F-NEXT:    lu12i.w $a1, 261888
 ; LA32F-NEXT:    move $a0, $zero
-; LA32F-NEXT:    bl %plt(__divdf3)
+; LA32F-NEXT:    bl __divdf3
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-csr-side-effects.ll b/llvm/test/CodeGen/LoongArch/intrinsic-csr-side-effects.ll
index c2f77b0..68d2c10 100644
--- a/llvm/test/CodeGen/LoongArch/intrinsic-csr-side-effects.ll
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-csr-side-effects.ll
@@ -19,7 +19,7 @@ define dso_local void @foo(i32 noundef signext %flag) nounwind {
 ; LA32-NEXT:    andi $a0, $a0, 1
 ; LA32-NEXT:    bnez $a0, .LBB0_4
 ; LA32-NEXT:  # %bb.3: # %if.then2
-; LA32-NEXT:    b %plt(bug)
+; LA32-NEXT:    b bug
 ; LA32-NEXT:  .LBB0_4: # %if.end3
 ; LA32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll
index b1af9c1..fc393e1 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll
@@ -243,7 +243,7 @@ define i64 @atomicrmw_xchg_i64_acquire(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 2
-; LA32-NEXT:    bl %plt(__atomic_exchange_8)
+; LA32-NEXT:    bl __atomic_exchange_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -374,7 +374,7 @@ define i64 @atomicrmw_add_i64_acquire(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 2
-; LA32-NEXT:    bl %plt(__atomic_fetch_add_8)
+; LA32-NEXT:    bl __atomic_fetch_add_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -506,7 +506,7 @@ define i64 @atomicrmw_sub_i64_acquire(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 2
-; LA32-NEXT:    bl %plt(__atomic_fetch_sub_8)
+; LA32-NEXT:    bl __atomic_fetch_sub_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -649,7 +649,7 @@ define i64 @atomicrmw_nand_i64_acquire(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 2
-; LA32-NEXT:    bl %plt(__atomic_fetch_nand_8)
+; LA32-NEXT:    bl __atomic_fetch_nand_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -768,7 +768,7 @@ define i64 @atomicrmw_and_i64_acquire(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 2
-; LA32-NEXT:    bl %plt(__atomic_fetch_and_8)
+; LA32-NEXT:    bl __atomic_fetch_and_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -867,7 +867,7 @@ define i64 @atomicrmw_or_i64_acquire(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 2
-; LA32-NEXT:    bl %plt(__atomic_fetch_or_8)
+; LA32-NEXT:    bl __atomic_fetch_or_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -966,7 +966,7 @@ define i64 @atomicrmw_xor_i64_acquire(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 2
-; LA32-NEXT:    bl %plt(__atomic_fetch_xor_8)
+; LA32-NEXT:    bl __atomic_fetch_xor_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -1221,7 +1221,7 @@ define i64 @atomicrmw_xchg_i64_release(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 3
-; LA32-NEXT:    bl %plt(__atomic_exchange_8)
+; LA32-NEXT:    bl __atomic_exchange_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -1352,7 +1352,7 @@ define i64 @atomicrmw_add_i64_release(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 3
-; LA32-NEXT:    bl %plt(__atomic_fetch_add_8)
+; LA32-NEXT:    bl __atomic_fetch_add_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -1484,7 +1484,7 @@ define i64 @atomicrmw_sub_i64_release(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 3
-; LA32-NEXT:    bl %plt(__atomic_fetch_sub_8)
+; LA32-NEXT:    bl __atomic_fetch_sub_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -1627,7 +1627,7 @@ define i64 @atomicrmw_nand_i64_release(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 3
-; LA32-NEXT:    bl %plt(__atomic_fetch_nand_8)
+; LA32-NEXT:    bl __atomic_fetch_nand_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -1746,7 +1746,7 @@ define i64 @atomicrmw_and_i64_release(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 3
-; LA32-NEXT:    bl %plt(__atomic_fetch_and_8)
+; LA32-NEXT:    bl __atomic_fetch_and_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -1845,7 +1845,7 @@ define i64 @atomicrmw_or_i64_release(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 3
-; LA32-NEXT:    bl %plt(__atomic_fetch_or_8)
+; LA32-NEXT:    bl __atomic_fetch_or_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -1944,7 +1944,7 @@ define i64 @atomicrmw_xor_i64_release(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 3
-; LA32-NEXT:    bl %plt(__atomic_fetch_xor_8)
+; LA32-NEXT:    bl __atomic_fetch_xor_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -2199,7 +2199,7 @@ define i64 @atomicrmw_xchg_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 4
-; LA32-NEXT:    bl %plt(__atomic_exchange_8)
+; LA32-NEXT:    bl __atomic_exchange_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -2330,7 +2330,7 @@ define i64 @atomicrmw_add_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 4
-; LA32-NEXT:    bl %plt(__atomic_fetch_add_8)
+; LA32-NEXT:    bl __atomic_fetch_add_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -2462,7 +2462,7 @@ define i64 @atomicrmw_sub_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 4
-; LA32-NEXT:    bl %plt(__atomic_fetch_sub_8)
+; LA32-NEXT:    bl __atomic_fetch_sub_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -2605,7 +2605,7 @@ define i64 @atomicrmw_nand_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 4
-; LA32-NEXT:    bl %plt(__atomic_fetch_nand_8)
+; LA32-NEXT:    bl __atomic_fetch_nand_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -2724,7 +2724,7 @@ define i64 @atomicrmw_and_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 4
-; LA32-NEXT:    bl %plt(__atomic_fetch_and_8)
+; LA32-NEXT:    bl __atomic_fetch_and_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -2823,7 +2823,7 @@ define i64 @atomicrmw_or_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 4
-; LA32-NEXT:    bl %plt(__atomic_fetch_or_8)
+; LA32-NEXT:    bl __atomic_fetch_or_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -2922,7 +2922,7 @@ define i64 @atomicrmw_xor_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 4
-; LA32-NEXT:    bl %plt(__atomic_fetch_xor_8)
+; LA32-NEXT:    bl __atomic_fetch_xor_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -3177,7 +3177,7 @@ define i64 @atomicrmw_xchg_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_exchange_8)
+; LA32-NEXT:    bl __atomic_exchange_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -3308,7 +3308,7 @@ define i64 @atomicrmw_add_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_fetch_add_8)
+; LA32-NEXT:    bl __atomic_fetch_add_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -3440,7 +3440,7 @@ define i64 @atomicrmw_sub_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_fetch_sub_8)
+; LA32-NEXT:    bl __atomic_fetch_sub_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -3583,7 +3583,7 @@ define i64 @atomicrmw_nand_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_fetch_nand_8)
+; LA32-NEXT:    bl __atomic_fetch_nand_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -3702,7 +3702,7 @@ define i64 @atomicrmw_and_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_fetch_and_8)
+; LA32-NEXT:    bl __atomic_fetch_and_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -3801,7 +3801,7 @@ define i64 @atomicrmw_or_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_fetch_or_8)
+; LA32-NEXT:    bl __atomic_fetch_or_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -3900,7 +3900,7 @@ define i64 @atomicrmw_xor_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    ori $a3, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_fetch_xor_8)
+; LA32-NEXT:    bl __atomic_fetch_xor_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -4155,7 +4155,7 @@ define i64 @atomicrmw_xchg_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_exchange_8)
+; LA32-NEXT:    bl __atomic_exchange_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -4286,7 +4286,7 @@ define i64 @atomicrmw_add_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_fetch_add_8)
+; LA32-NEXT:    bl __atomic_fetch_add_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -4418,7 +4418,7 @@ define i64 @atomicrmw_sub_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_fetch_sub_8)
+; LA32-NEXT:    bl __atomic_fetch_sub_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -4561,7 +4561,7 @@ define i64 @atomicrmw_nand_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_fetch_nand_8)
+; LA32-NEXT:    bl __atomic_fetch_nand_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -4680,7 +4680,7 @@ define i64 @atomicrmw_and_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_fetch_and_8)
+; LA32-NEXT:    bl __atomic_fetch_and_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -4779,7 +4779,7 @@ define i64 @atomicrmw_or_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_fetch_or_8)
+; LA32-NEXT:    bl __atomic_fetch_or_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -4878,7 +4878,7 @@ define i64 @atomicrmw_xor_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_fetch_xor_8)
+; LA32-NEXT:    bl __atomic_fetch_xor_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/call.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/call.ll
index 653af9f..45e5576 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/call.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/call.ll
@@ -9,7 +9,7 @@ define i32 @test_call_external(i32 %a) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(external_function)
+; LA32-NEXT:    bl external_function
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -46,7 +46,7 @@ define i32 @test_call_defined(i32 %a) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(defined_function)
+; LA32-NEXT:    bl defined_function
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll
index 8d08942..14682d6 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll
@@ -83,7 +83,7 @@ define double @convert_i64_to_double(i64 %a) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__floatdidf)
+; LA32-NEXT:    bl __floatdidf
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -146,7 +146,7 @@ define i64 @convert_double_to_i64(double %a) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__fixdfdi)
+; LA32-NEXT:    bl __fixdfdi
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -165,7 +165,7 @@ define i64 @convert_double_to_u64(double %a) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__fixunsdfdi)
+; LA32-NEXT:    bl __fixunsdfdi
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -252,7 +252,7 @@ define double @convert_u64_to_double(i64 %a) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__floatundidf)
+; LA32-NEXT:    bl __floatundidf
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll
index e4cfd7a..ac1e1df 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll
@@ -93,7 +93,7 @@ define i64 @convert_float_to_i64(float %a) nounwind {
 ; LA32F:       # %bb.0:
 ; LA32F-NEXT:    addi.w $sp, $sp, -16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32F-NEXT:    bl %plt(__fixsfdi)
+; LA32F-NEXT:    bl __fixsfdi
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -102,7 +102,7 @@ define i64 @convert_float_to_i64(float %a) nounwind {
 ; LA32D:       # %bb.0:
 ; LA32D-NEXT:    addi.w $sp, $sp, -16
 ; LA32D-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32D-NEXT:    bl %plt(__fixsfdi)
+; LA32D-NEXT:    bl __fixsfdi
 ; LA32D-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32D-NEXT:    addi.w $sp, $sp, 16
 ; LA32D-NEXT:    ret
@@ -247,7 +247,7 @@ define i64 @convert_float_to_u64(float %a) nounwind {
 ; LA32F:       # %bb.0:
 ; LA32F-NEXT:    addi.w $sp, $sp, -16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32F-NEXT:    bl %plt(__fixunssfdi)
+; LA32F-NEXT:    bl __fixunssfdi
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -256,7 +256,7 @@ define i64 @convert_float_to_u64(float %a) nounwind {
 ; LA32D:       # %bb.0:
 ; LA32D-NEXT:    addi.w $sp, $sp, -16
 ; LA32D-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32D-NEXT:    bl %plt(__fixunssfdi)
+; LA32D-NEXT:    bl __fixunssfdi
 ; LA32D-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32D-NEXT:    addi.w $sp, $sp, 16
 ; LA32D-NEXT:    ret
@@ -389,7 +389,7 @@ define float @convert_i64_to_float(i64 %a) nounwind {
 ; LA32F:       # %bb.0:
 ; LA32F-NEXT:    addi.w $sp, $sp, -16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32F-NEXT:    bl %plt(__floatdisf)
+; LA32F-NEXT:    bl __floatdisf
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -398,7 +398,7 @@ define float @convert_i64_to_float(i64 %a) nounwind {
 ; LA32D:       # %bb.0:
 ; LA32D-NEXT:    addi.w $sp, $sp, -16
 ; LA32D-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32D-NEXT:    bl %plt(__floatdisf)
+; LA32D-NEXT:    bl __floatdisf
 ; LA32D-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32D-NEXT:    addi.w $sp, $sp, 16
 ; LA32D-NEXT:    ret
@@ -534,7 +534,7 @@ define float @convert_u64_to_float(i64 %a) nounwind {
 ; LA32F:       # %bb.0:
 ; LA32F-NEXT:    addi.w $sp, $sp, -16
 ; LA32F-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32F-NEXT:    bl %plt(__floatundisf)
+; LA32F-NEXT:    bl __floatundisf
 ; LA32F-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32F-NEXT:    addi.w $sp, $sp, 16
 ; LA32F-NEXT:    ret
@@ -543,7 +543,7 @@ define float @convert_u64_to_float(i64 %a) nounwind {
 ; LA32D:       # %bb.0:
 ; LA32D-NEXT:    addi.w $sp, $sp, -16
 ; LA32D-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32D-NEXT:    bl %plt(__floatundisf)
+; LA32D-NEXT:    bl __floatundisf
 ; LA32D-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32D-NEXT:    addi.w $sp, $sp, 16
 ; LA32D-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll
index 9ef74e4..78cabd3 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll
@@ -58,7 +58,7 @@ define i64 @load_acquire_i64(ptr %ptr) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    ori $a1, $zero, 2
-; LA32-NEXT:    bl %plt(__atomic_load_8)
+; LA32-NEXT:    bl __atomic_load_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -114,7 +114,7 @@ define double @load_acquire_double(ptr %ptr) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    ori $a1, $zero, 2
-; LA32-NEXT:    bl %plt(__atomic_load_8)
+; LA32-NEXT:    bl __atomic_load_8
 ; LA32-NEXT:    st.w $a1, $sp, 4
 ; LA32-NEXT:    st.w $a0, $sp, 0
 ; LA32-NEXT:    fld.d $fa0, $sp, 0
@@ -182,7 +182,7 @@ define i64 @load_unordered_i64(ptr %ptr) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    move $a1, $zero
-; LA32-NEXT:    bl %plt(__atomic_load_8)
+; LA32-NEXT:    bl __atomic_load_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -233,7 +233,7 @@ define double @load_unordered_double(ptr %ptr) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    move $a1, $zero
-; LA32-NEXT:    bl %plt(__atomic_load_8)
+; LA32-NEXT:    bl __atomic_load_8
 ; LA32-NEXT:    st.w $a1, $sp, 4
 ; LA32-NEXT:    st.w $a0, $sp, 0
 ; LA32-NEXT:    fld.d $fa0, $sp, 0
@@ -300,7 +300,7 @@ define i64 @load_monotonic_i64(ptr %ptr) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    move $a1, $zero
-; LA32-NEXT:    bl %plt(__atomic_load_8)
+; LA32-NEXT:    bl __atomic_load_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -351,7 +351,7 @@ define double @load_monotonic_double(ptr %ptr) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    move $a1, $zero
-; LA32-NEXT:    bl %plt(__atomic_load_8)
+; LA32-NEXT:    bl __atomic_load_8
 ; LA32-NEXT:    st.w $a1, $sp, 4
 ; LA32-NEXT:    st.w $a0, $sp, 0
 ; LA32-NEXT:    fld.d $fa0, $sp, 0
@@ -424,7 +424,7 @@ define i64 @load_seq_cst_i64(ptr %ptr) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    ori $a1, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_load_8)
+; LA32-NEXT:    bl __atomic_load_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -480,7 +480,7 @@ define double @load_seq_cst_double(ptr %ptr) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    ori $a1, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_load_8)
+; LA32-NEXT:    bl __atomic_load_8
 ; LA32-NEXT:    st.w $a1, $sp, 4
 ; LA32-NEXT:    st.w $a0, $sp, 0
 ; LA32-NEXT:    fld.d $fa0, $sp, 0
@@ -553,7 +553,7 @@ define void @store_release_i64(ptr %ptr, i64 %v) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    ori $a3, $zero, 3
-; LA32-NEXT:    bl %plt(__atomic_store_8)
+; LA32-NEXT:    bl __atomic_store_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -609,7 +609,7 @@ define void @store_release_double(ptr %ptr, double %v) {
 ; LA32-NEXT:    ld.w $a1, $sp, 0
 ; LA32-NEXT:    ld.w $a2, $sp, 4
 ; LA32-NEXT:    ori $a3, $zero, 3
-; LA32-NEXT:    bl %plt(__atomic_store_8)
+; LA32-NEXT:    bl __atomic_store_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -673,7 +673,7 @@ define void @store_unordered_i64(ptr %ptr, i64 %v) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_store_8)
+; LA32-NEXT:    bl __atomic_store_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -727,7 +727,7 @@ define void @store_unordered_double(ptr %ptr, double %v) {
 ; LA32-NEXT:    ld.w $a1, $sp, 0
 ; LA32-NEXT:    ld.w $a2, $sp, 4
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_store_8)
+; LA32-NEXT:    bl __atomic_store_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -791,7 +791,7 @@ define void @store_monotonic_i64(ptr %ptr, i64 %v) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_store_8)
+; LA32-NEXT:    bl __atomic_store_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -845,7 +845,7 @@ define void @store_monotonic_double(ptr %ptr, double %v) {
 ; LA32-NEXT:    ld.w $a1, $sp, 0
 ; LA32-NEXT:    ld.w $a2, $sp, 4
 ; LA32-NEXT:    move $a3, $zero
-; LA32-NEXT:    bl %plt(__atomic_store_8)
+; LA32-NEXT:    bl __atomic_store_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -919,7 +919,7 @@ define void @store_seq_cst_i64(ptr %ptr, i64 %v) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    ori $a3, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_store_8)
+; LA32-NEXT:    bl __atomic_store_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -977,7 +977,7 @@ define void @store_seq_cst_double(ptr %ptr, double %v) {
 ; LA32-NEXT:    ld.w $a1, $sp, 0
 ; LA32-NEXT:    ld.w $a2, $sp, 4
 ; LA32-NEXT:    ori $a3, $zero, 5
-; LA32-NEXT:    bl %plt(__atomic_store_8)
+; LA32-NEXT:    bl __atomic_store_8
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll
index 99824f6..d88f295 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll
@@ -258,7 +258,7 @@ define i64 @sdiv_i64(i64 %a, i64 %b) {
 ; LA32-NEXT:    .cfi_def_cfa_offset 16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
-; LA32-NEXT:    bl %plt(__divdi3)
+; LA32-NEXT:    bl __divdi3
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -274,7 +274,7 @@ define i64 @sdiv_i64(i64 %a, i64 %b) {
 ; LA32-TRAP-NEXT:    .cfi_def_cfa_offset 16
 ; LA32-TRAP-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-TRAP-NEXT:    .cfi_offset 1, -4
-; LA32-TRAP-NEXT:    bl %plt(__divdi3)
+; LA32-TRAP-NEXT:    bl __divdi3
 ; LA32-TRAP-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-TRAP-NEXT:    addi.w $sp, $sp, 16
 ; LA32-TRAP-NEXT:    ret
@@ -542,7 +542,7 @@ define i64 @udiv_i64(i64 %a, i64 %b) {
 ; LA32-NEXT:    .cfi_def_cfa_offset 16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
-; LA32-NEXT:    bl %plt(__udivdi3)
+; LA32-NEXT:    bl __udivdi3
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -558,7 +558,7 @@ define i64 @udiv_i64(i64 %a, i64 %b) {
 ; LA32-TRAP-NEXT:    .cfi_def_cfa_offset 16
 ; LA32-TRAP-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-TRAP-NEXT:    .cfi_offset 1, -4
-; LA32-TRAP-NEXT:    bl %plt(__udivdi3)
+; LA32-TRAP-NEXT:    bl __udivdi3
 ; LA32-TRAP-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-TRAP-NEXT:    addi.w $sp, $sp, 16
 ; LA32-TRAP-NEXT:    ret
@@ -830,7 +830,7 @@ define i64 @srem_i64(i64 %a, i64 %b) {
 ; LA32-NEXT:    .cfi_def_cfa_offset 16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
-; LA32-NEXT:    bl %plt(__moddi3)
+; LA32-NEXT:    bl __moddi3
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -846,7 +846,7 @@ define i64 @srem_i64(i64 %a, i64 %b) {
 ; LA32-TRAP-NEXT:    .cfi_def_cfa_offset 16
 ; LA32-TRAP-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-TRAP-NEXT:    .cfi_offset 1, -4
-; LA32-TRAP-NEXT:    bl %plt(__moddi3)
+; LA32-TRAP-NEXT:    bl __moddi3
 ; LA32-TRAP-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-TRAP-NEXT:    addi.w $sp, $sp, 16
 ; LA32-TRAP-NEXT:    ret
@@ -1118,7 +1118,7 @@ define i64 @urem_i64(i64 %a, i64 %b) {
 ; LA32-NEXT:    .cfi_def_cfa_offset 16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
-; LA32-NEXT:    bl %plt(__umoddi3)
+; LA32-NEXT:    bl __umoddi3
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -1134,7 +1134,7 @@ define i64 @urem_i64(i64 %a, i64 %b) {
 ; LA32-TRAP-NEXT:    .cfi_def_cfa_offset 16
 ; LA32-TRAP-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-TRAP-NEXT:    .cfi_offset 1, -4
-; LA32-TRAP-NEXT:    bl %plt(__umoddi3)
+; LA32-TRAP-NEXT:    bl __umoddi3
 ; LA32-TRAP-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-TRAP-NEXT:    addi.w $sp, $sp, 16
 ; LA32-TRAP-NEXT:    ret
@@ -1164,7 +1164,7 @@ define signext i32 @pr107414(i32 signext %x) {
 ; LA32-NEXT:    lu12i.w $a0, -266831
 ; LA32-NEXT:    ori $a0, $a0, 3337
 ; LA32-NEXT:    move $a1, $zero
-; LA32-NEXT:    bl %plt(__divdi3)
+; LA32-NEXT:    bl __divdi3
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -1189,7 +1189,7 @@ define signext i32 @pr107414(i32 signext %x) {
 ; LA32-TRAP-NEXT:    lu12i.w $a0, -266831
 ; LA32-TRAP-NEXT:    ori $a0, $a0, 3337
 ; LA32-TRAP-NEXT:    move $a1, $zero
-; LA32-TRAP-NEXT:    bl %plt(__divdi3)
+; LA32-TRAP-NEXT:    bl __divdi3
 ; LA32-TRAP-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-TRAP-NEXT:    addi.w $sp, $sp, 16
 ; LA32-TRAP-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll
index 0d9f57b..ed333c3 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll
@@ -31,9 +31,9 @@ define void @fptosi_v4f64_v4i32(ptr %res, ptr %in){
 ; CHECK-LABEL: fptosi_v4f64_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvftintrz.l.d $xr0, $xr0
 ; CHECK-NEXT:    xvpermi.d $xr1, $xr0, 238
-; CHECK-NEXT:    xvfcvt.s.d $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvftintrz.w.s $xr0, $xr0
+; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <4 x double>, ptr %in
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll
index 27d70f3..9c499ba 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll
@@ -31,9 +31,9 @@ define void @fptoui_v4f64_v4i32(ptr %res, ptr %in){
 ; CHECK-LABEL: fptoui_v4f64_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvftintrz.lu.d $xr0, $xr0
 ; CHECK-NEXT:    xvpermi.d $xr1, $xr0, 238
-; CHECK-NEXT:    xvfcvt.s.d $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvftintrz.w.s $xr0, $xr0
+; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <4 x double>, ptr %in
diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
index 0f9275f..9142e71 100644
--- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
@@ -212,7 +212,7 @@ define void @test_la_tls_ld(i32 signext %n) {
 ; LA32-NEXT:  .LBB3_1: # %loop
 ; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA32-NEXT:    move $a0, $s0
-; LA32-NEXT:    bl %plt(__tls_get_addr)
+; LA32-NEXT:    bl __tls_get_addr
 ; LA32-NEXT:    ld.w $zero, $a0, 0
 ; LA32-NEXT:    addi.w $s1, $s1, 1
 ; LA32-NEXT:    blt $s1, $fp, .LBB3_1
@@ -388,7 +388,7 @@ define void @test_la_tls_gd(i32 signext %n) nounwind {
 ; LA32-NEXT:  .LBB5_1: # %loop
 ; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA32-NEXT:    move $a0, $s0
-; LA32-NEXT:    bl %plt(__tls_get_addr)
+; LA32-NEXT:    bl __tls_get_addr
 ; LA32-NEXT:    ld.w $zero, $a0, 0
 ; LA32-NEXT:    addi.w $s1, $s1, 1
 ; LA32-NEXT:    blt $s1, $fp, .LBB5_1
diff --git a/llvm/test/CodeGen/LoongArch/numeric-reg-names.ll b/llvm/test/CodeGen/LoongArch/numeric-reg-names.ll
index 5a11583..73f4dbb 100644
--- a/llvm/test/CodeGen/LoongArch/numeric-reg-names.ll
+++ b/llvm/test/CodeGen/LoongArch/numeric-reg-names.ll
@@ -17,7 +17,7 @@ define i32 @main() {
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    pcalau12i $r4, %pc_hi20(.str_1)
 ; LA32-NEXT:    addi.w $r4, $r4, %pc_lo12(.str_1)
-; LA32-NEXT:    bl %plt(printf)
+; LA32-NEXT:    bl printf
 ; LA32-NEXT:    move $r4, $r0
 ; LA32-NEXT:    ld.w $r1, $r3, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $r3, $r3, 16
diff --git a/llvm/test/CodeGen/LoongArch/soft-fp-to-int.ll b/llvm/test/CodeGen/LoongArch/soft-fp-to-int.ll
index c429d31..e7f75cd 100644
--- a/llvm/test/CodeGen/LoongArch/soft-fp-to-int.ll
+++ b/llvm/test/CodeGen/LoongArch/soft-fp-to-int.ll
@@ -16,7 +16,7 @@ define i32 @fptosi_i32_fp128(fp128 %X) nounwind {
 ; LA32-NEXT:    st.w $a2, $sp, 12
 ; LA32-NEXT:    addi.w $a0, $sp, 8
 ; LA32-NEXT:    st.w $a1, $sp, 8
-; LA32-NEXT:    bl %plt(__fixtfsi)
+; LA32-NEXT:    bl __fixtfsi
 ; LA32-NEXT:    ld.w $ra, $sp, 28 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 32
 ; LA32-NEXT:    ret
@@ -39,7 +39,7 @@ define i32 @fptosi_i32_double(double %X) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__fixdfsi)
+; LA32-NEXT:    bl __fixdfsi
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -62,7 +62,7 @@ define i32 @fptosi_i32_float(float %X) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__fixsfsi)
+; LA32-NEXT:    bl __fixsfsi
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -94,7 +94,7 @@ define i64 @fptosi_i64_fp128(fp128 %X) nounwind {
 ; LA32-NEXT:    st.w $a2, $sp, 4
 ; LA32-NEXT:    addi.w $a0, $sp, 0
 ; LA32-NEXT:    st.w $a1, $sp, 0
-; LA32-NEXT:    bl %plt(__fixtfdi)
+; LA32-NEXT:    bl __fixtfdi
 ; LA32-NEXT:    ld.w $ra, $sp, 28 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 32
 ; LA32-NEXT:    ret
@@ -117,7 +117,7 @@ define i64 @fptosi_i64_double(double %X) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__fixdfdi)
+; LA32-NEXT:    bl __fixdfdi
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -140,7 +140,7 @@ define i64 @fptosi_i64_float(float %X) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    bl %plt(__fixsfdi)
+; LA32-NEXT:    bl __fixsfdi
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/spill-reload-cfr.ll b/llvm/test/CodeGen/LoongArch/spill-reload-cfr.ll
index 9cb364e..66640ca 100644
--- a/llvm/test/CodeGen/LoongArch/spill-reload-cfr.ll
+++ b/llvm/test/CodeGen/LoongArch/spill-reload-cfr.ll
@@ -18,7 +18,7 @@ define i1 @load_store_fcc_reg(float %a, i1 %c) {
 ; LA32-NEXT:    .cfi_offset 56, -16
 ; LA32-NEXT:    move $fp, $a0
 ; LA32-NEXT:    fmov.s $fs0, $fa0
-; LA32-NEXT:    bl %plt(foo)
+; LA32-NEXT:    bl foo
 ; LA32-NEXT:    movgr2fr.w $fa0, $zero
 ; LA32-NEXT:    fcmp.cult.s $fcc0, $fa0, $fs0
 ; LA32-NEXT:    bcnez $fcc0, .LBB0_2
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll
index f6c6c5a..fe7a8f8 100644
--- a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll
@@ -26,7 +26,7 @@ define void @caller(i32 %n) {
 ; LA32-NEXT:    sub.w $a0, $sp, $a0
 ; LA32-NEXT:    move $sp, $a0
 ; LA32-NEXT:    addi.w $a1, $s8, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    addi.w $sp, $fp, -64
 ; LA32-NEXT:    ld.w $s8, $sp, 52 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $fp, $sp, 56 # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
index 6d4210e..0645339 100644
--- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
@@ -19,7 +19,7 @@ define void @caller32() {
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
 ; LA32-NEXT:    bstrins.w $sp, $zero, 4, 0
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    addi.w $sp, $fp, -32
 ; LA32-NEXT:    ld.w $fp, $sp, 24 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 28 # 4-byte Folded Reload
@@ -58,7 +58,7 @@ define void @caller_no_realign32() "no-realign-stack" {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -93,7 +93,7 @@ define void @caller64() {
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
 ; LA32-NEXT:    bstrins.w $sp, $zero, 5, 0
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    addi.w $sp, $fp, -64
 ; LA32-NEXT:    ld.w $fp, $sp, 56 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 60 # 4-byte Folded Reload
@@ -132,7 +132,7 @@ define void @caller_no_realign64() "no-realign-stack" {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -167,7 +167,7 @@ define void @caller128() {
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
 ; LA32-NEXT:    bstrins.w $sp, $zero, 6, 0
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    addi.w $sp, $fp, -128
 ; LA32-NEXT:    ld.w $fp, $sp, 120 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 124 # 4-byte Folded Reload
@@ -206,7 +206,7 @@ define void @caller_no_realign128() "no-realign-stack" {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -241,7 +241,7 @@ define void @caller256() {
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
 ; LA32-NEXT:    bstrins.w $sp, $zero, 7, 0
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    addi.w $sp, $fp, -256
 ; LA32-NEXT:    ld.w $fp, $sp, 248 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 252 # 4-byte Folded Reload
@@ -280,7 +280,7 @@ define void @caller_no_realign256() "no-realign-stack" {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -315,7 +315,7 @@ define void @caller512() {
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
 ; LA32-NEXT:    bstrins.w $sp, $zero, 8, 0
 ; LA32-NEXT:    addi.w $a0, $sp, 512
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    addi.w $sp, $fp, -1024
 ; LA32-NEXT:    ld.w $fp, $sp, 1016 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 1020 # 4-byte Folded Reload
@@ -354,7 +354,7 @@ define void @caller_no_realign512() "no-realign-stack" {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -390,7 +390,7 @@ define void @caller1024() {
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    bstrins.w $sp, $zero, 9, 0
 ; LA32-NEXT:    addi.w $a0, $sp, 1024
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    addi.w $sp, $fp, -2048
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ld.w $fp, $sp, 2024 # 4-byte Folded Reload
@@ -432,7 +432,7 @@ define void @caller_no_realign1024() "no-realign-stack" {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -470,7 +470,7 @@ define void @caller2048() {
 ; LA32-NEXT:    bstrins.w $sp, $zero, 10, 0
 ; LA32-NEXT:    ori $a0, $zero, 2048
 ; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    lu12i.w $a0, 1
 ; LA32-NEXT:    sub.w $sp, $fp, $a0
 ; LA32-NEXT:    addi.w $sp, $sp, 2032
@@ -518,7 +518,7 @@ define void @caller_no_realign2048() "no-realign-stack" {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
@@ -557,7 +557,7 @@ define void @caller4096() {
 ; LA32-NEXT:    bstrins.w $sp, $zero, 11, 0
 ; LA32-NEXT:    lu12i.w $a0, 1
 ; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    lu12i.w $a0, 2
 ; LA32-NEXT:    sub.w $sp, $fp, $a0
 ; LA32-NEXT:    lu12i.w $a0, 1
@@ -608,7 +608,7 @@ define void @caller_no_realign4096() "no-realign-stack" {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    bl callee
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/statepoint-call-lowering.ll b/llvm/test/CodeGen/LoongArch/statepoint-call-lowering.ll
index 1926dbd..a037282 100644
--- a/llvm/test/CodeGen/LoongArch/statepoint-call-lowering.ll
+++ b/llvm/test/CodeGen/LoongArch/statepoint-call-lowering.ll
@@ -19,7 +19,7 @@ define i1 @test_i1_return() nounwind gc "statepoint-example" {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT:    bl %plt(return_i1)
+; CHECK-NEXT:    bl return_i1
 ; CHECK-NEXT:  .Ltmp0:
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -35,7 +35,7 @@ define i32 @test_i32_return() nounwind gc "statepoint-example" {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT:    bl %plt(return_i32)
+; CHECK-NEXT:    bl return_i32
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -51,7 +51,7 @@ define ptr @test_i32ptr_return() nounwind gc "statepoint-example" {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT:    bl %plt(return_i32ptr)
+; CHECK-NEXT:    bl return_i32ptr
 ; CHECK-NEXT:  .Ltmp2:
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -67,7 +67,7 @@ define float @test_float_return() nounwind gc "statepoint-example" {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT:    bl %plt(return_float)
+; CHECK-NEXT:    bl return_float
 ; CHECK-NEXT:  .Ltmp3:
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -83,7 +83,7 @@ define %struct @test_struct_return() nounwind gc "statepoint-example" {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT:    bl %plt(return_struct)
+; CHECK-NEXT:    bl return_struct
 ; CHECK-NEXT:  .Ltmp4:
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -100,7 +100,7 @@ define i1 @test_relocate(ptr addrspace(1) %a) nounwind gc "statepoint-example" {
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; CHECK-NEXT:    st.d $a0, $sp, 0
-; CHECK-NEXT:    bl %plt(return_i1)
+; CHECK-NEXT:    bl return_i1
 ; CHECK-NEXT:  .Ltmp5:
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -119,7 +119,7 @@ define void @test_void_vararg() nounwind gc "statepoint-example" {
 ; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; CHECK-NEXT:    ori $a0, $zero, 42
 ; CHECK-NEXT:    ori $a1, $zero, 43
-; CHECK-NEXT:    bl %plt(varargf)
+; CHECK-NEXT:    bl varargf
 ; CHECK-NEXT:  .Ltmp6:
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -158,7 +158,7 @@ define i1 @test_cross_bb(ptr addrspace(1) %a, i1 %external_cond) nounwind gc "st
 ; CHECK-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
 ; CHECK-NEXT:    andi $fp, $a1, 1
 ; CHECK-NEXT:    st.d $a0, $sp, 8
-; CHECK-NEXT:    bl %plt(return_i1)
+; CHECK-NEXT:    bl return_i1
 ; CHECK-NEXT:  .Ltmp8:
 ; CHECK-NEXT:    beqz $fp, .LBB8_2
 ; CHECK-NEXT:  # %bb.1: # %left
@@ -207,7 +207,7 @@ define void @test_attributes(ptr byval(%struct2) %s) nounwind gc "statepoint-exa
 ; CHECK-NEXT:    ori $a2, $zero, 17
 ; CHECK-NEXT:    addi.d $a3, $sp, 0
 ; CHECK-NEXT:    move $a1, $zero
-; CHECK-NEXT:    bl %plt(consume_attributes)
+; CHECK-NEXT:    bl consume_attributes
 ; CHECK-NEXT:  .Ltmp9:
 ; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 32
diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll
index ffd480a..50d994f 100644
--- a/llvm/test/CodeGen/LoongArch/tls-models.ll
+++ b/llvm/test/CodeGen/LoongArch/tls-models.ll
@@ -30,7 +30,7 @@ define ptr @f1() nounwind {
 ; LA32PIC-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32PIC-NEXT:    pcalau12i $a0, %gd_pc_hi20(unspecified)
 ; LA32PIC-NEXT:    addi.w $a0, $a0, %got_pc_lo12(unspecified)
-; LA32PIC-NEXT:    bl %plt(__tls_get_addr)
+; LA32PIC-NEXT:    bl __tls_get_addr
 ; LA32PIC-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32PIC-NEXT:    addi.w $sp, $sp, 16
 ; LA32PIC-NEXT:    ret
@@ -144,7 +144,7 @@ define ptr @f2() nounwind {
 ; LA32PIC-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32PIC-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
 ; LA32PIC-NEXT:    addi.w $a0, $a0, %got_pc_lo12(ld)
-; LA32PIC-NEXT:    bl %plt(__tls_get_addr)
+; LA32PIC-NEXT:    bl __tls_get_addr
 ; LA32PIC-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32PIC-NEXT:    addi.w $sp, $sp, 16
 ; LA32PIC-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
index 5ab20b8..925fdf3 100644
--- a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
+++ b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
@@ -15,7 +15,7 @@ define void @t0(ptr %out, ptr %in) {
 ; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    ori $a2, $zero, 16
-; LA32-NEXT:    bl %plt(memcpy)
+; LA32-NEXT:    bl memcpy
 ; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll b/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll
index d043eef..80c17a7 100644
--- a/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll
+++ b/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll
@@ -379,14 +379,14 @@ define void @test_d2(ptr %P, ptr %S) nounwind {
 ; LA32F-NEXT:    move $a0, $a2
 ; LA32F-NEXT:    move $a1, $a4
 ; LA32F-NEXT:    move $a2, $zero
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    move $s2, $a0
 ; LA32F-NEXT:    move $s3, $a1
 ; LA32F-NEXT:    lu12i.w $a3, 262144
 ; LA32F-NEXT:    move $a0, $fp
 ; LA32F-NEXT:    move $a1, $s0
 ; LA32F-NEXT:    move $a2, $zero
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    st.w $a0, $s1, 8
 ; LA32F-NEXT:    st.w $a1, $s1, 12
 ; LA32F-NEXT:    st.w $s2, $s1, 0
@@ -484,28 +484,28 @@ define void @test_d4(ptr %P, ptr %S) nounwind {
 ; LA32F-NEXT:    move $a0, $a2
 ; LA32F-NEXT:    move $a1, $a4
 ; LA32F-NEXT:    move $a2, $zero
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    move $s6, $a0
 ; LA32F-NEXT:    move $s7, $a1
 ; LA32F-NEXT:    lu12i.w $a3, 262144
 ; LA32F-NEXT:    move $a0, $s3
 ; LA32F-NEXT:    move $a1, $s4
 ; LA32F-NEXT:    move $a2, $zero
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    move $s3, $a0
 ; LA32F-NEXT:    move $s4, $a1
 ; LA32F-NEXT:    lu12i.w $a3, 262272
 ; LA32F-NEXT:    move $a0, $s1
 ; LA32F-NEXT:    move $a1, $s2
 ; LA32F-NEXT:    move $a2, $zero
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    move $s1, $a0
 ; LA32F-NEXT:    move $s2, $a1
 ; LA32F-NEXT:    lu12i.w $a3, 262400
 ; LA32F-NEXT:    move $a0, $fp
 ; LA32F-NEXT:    move $a1, $s0
 ; LA32F-NEXT:    move $a2, $zero
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    st.w $a0, $s5, 24
 ; LA32F-NEXT:    st.w $a1, $s5, 28
 ; LA32F-NEXT:    st.w $s1, $s5, 16
@@ -660,7 +660,7 @@ define void @test_d8(ptr %P, ptr %S) nounwind {
 ; LA32F-NEXT:    move $a0, $a2
 ; LA32F-NEXT:    move $a1, $a4
 ; LA32F-NEXT:    move $a2, $zero
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    st.w $a0, $sp, 40 # 4-byte Folded Spill
 ; LA32F-NEXT:    st.w $a1, $sp, 36 # 4-byte Folded Spill
 ; LA32F-NEXT:    lu12i.w $a3, 262144
@@ -668,7 +668,7 @@ define void @test_d8(ptr %P, ptr %S) nounwind {
 ; LA32F-NEXT:    move $a1, $s0
 ; LA32F-NEXT:    move $a2, $zero
 ; LA32F-NEXT:    move $s0, $a3
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    st.w $a0, $sp, 24 # 4-byte Folded Spill
 ; LA32F-NEXT:    st.w $a1, $sp, 20 # 4-byte Folded Spill
 ; LA32F-NEXT:    lu12i.w $s7, 262272
@@ -676,42 +676,42 @@ define void @test_d8(ptr %P, ptr %S) nounwind {
 ; LA32F-NEXT:    move $a1, $s2
 ; LA32F-NEXT:    move $a2, $zero
 ; LA32F-NEXT:    move $a3, $s7
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    st.w $a0, $sp, 12 # 4-byte Folded Spill
 ; LA32F-NEXT:    move $s2, $a1
 ; LA32F-NEXT:    lu12i.w $a3, 262400
 ; LA32F-NEXT:    move $a0, $s5
 ; LA32F-NEXT:    move $a1, $s6
 ; LA32F-NEXT:    move $a2, $zero
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    move $s5, $a0
 ; LA32F-NEXT:    move $s6, $a1
 ; LA32F-NEXT:    move $a0, $s3
 ; LA32F-NEXT:    move $a1, $s4
 ; LA32F-NEXT:    move $a2, $zero
 ; LA32F-NEXT:    lu12i.w $a3, 261888
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    move $s3, $a0
 ; LA32F-NEXT:    move $s4, $a1
 ; LA32F-NEXT:    move $a0, $s8
 ; LA32F-NEXT:    ld.w $a1, $sp, 16 # 4-byte Folded Reload
 ; LA32F-NEXT:    move $a2, $zero
 ; LA32F-NEXT:    move $a3, $s0
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    move $s8, $a0
 ; LA32F-NEXT:    move $s0, $a1
 ; LA32F-NEXT:    ld.w $a0, $sp, 32 # 4-byte Folded Reload
 ; LA32F-NEXT:    ld.w $a1, $sp, 28 # 4-byte Folded Reload
 ; LA32F-NEXT:    move $a2, $zero
 ; LA32F-NEXT:    move $a3, $s7
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    move $s7, $a0
 ; LA32F-NEXT:    move $s1, $a1
 ; LA32F-NEXT:    ld.w $a0, $sp, 48 # 4-byte Folded Reload
 ; LA32F-NEXT:    ld.w $a1, $sp, 44 # 4-byte Folded Reload
 ; LA32F-NEXT:    move $a2, $zero
 ; LA32F-NEXT:    lu12i.w $a3, 262400
-; LA32F-NEXT:    bl %plt(__adddf3)
+; LA32F-NEXT:    bl __adddf3
 ; LA32F-NEXT:    st.w $a0, $fp, 56
 ; LA32F-NEXT:    st.w $a1, $fp, 60
 ; LA32F-NEXT:    st.w $s7, $fp, 48
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/not.ll b/llvm/test/CodeGen/Mips/llvm-ir/not.ll
index 58ed0e6..4fa5074 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/not.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/not.ll
@@ -1,228 +1,346 @@
-; RUN: llc < %s -mtriple=mips -mcpu=mips2 | FileCheck %s -check-prefixes=ALL,GP32
-; RUN: llc < %s -mtriple=mips -mcpu=mips32 | FileCheck %s -check-prefixes=ALL,GP32
-; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP32
-; RUN: llc < %s -mtriple=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP32
-; RUN: llc < %s -mtriple=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP32
-; RUN: llc < %s -mtriple=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP32
-; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP64
-; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP64
-; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP64
-; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP64
-; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP64
-; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP64
-; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN:    -check-prefixes=ALL,GP64
-; RUN: llc < %s -mtriple=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
-; RUN:    -check-prefixes=ALL,MM,MM32
-; RUN: llc < %s -mtriple=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
-; RUN:    -check-prefixes=ALL,MM,MM32
+; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips2 | FileCheck %s -check-prefixes=GP32
+; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32 | FileCheck %s -check-prefixes=GP32
+; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefixes=GP32
+; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r3 | FileCheck %s \
+; RUN:    -check-prefixes=GP32
+; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r5 | FileCheck %s \
+; RUN:    -check-prefixes=GP32
+; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefixes=GP32
+; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips3 | FileCheck %s \
+; RUN:    -check-prefixes=GP64
+; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefixes=GP64
+; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefixes=GP64
+; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefixes=GP64
+; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips64r3 | FileCheck %s \
+; RUN:    -check-prefixes=GP64
+; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips64r5 | FileCheck %s \
+; RUN:    -check-prefixes=GP64
+; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips64r6 | FileCheck %s \
+; RUN:    -check-prefixes=GP64
+; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN:    -check-prefixes=MM,MM32r3
+; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN:    -check-prefixes=MM,MM32r6
 
 define signext i1 @not_i1(i1 signext %a) {
+; GP32-LABEL: not_i1:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    not $2, $4
+;
+; GP64-LABEL: not_i1:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    not $2, $4
+;
+; MM-LABEL: not_i1:
+; MM:       # %bb.0: # %entry
+; MM-NEXT:    not16 $2, $4
+; MM-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: not_i1:
-
-  ; GP32:         not     $2, $4
-
-  ; GP64:         not     $2, $4
-
-  ; MM:           not16   $2, $4
-
   %r = xor i1 %a, -1
   ret i1 %r
 }
 
 define signext i8 @not_i8(i8 signext %a) {
+; GP32-LABEL: not_i8:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    not $2, $4
+;
+; GP64-LABEL: not_i8:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    not $2, $4
+;
+; MM-LABEL: not_i8:
+; MM:       # %bb.0: # %entry
+; MM-NEXT:    not16 $2, $4
+; MM-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: not_i8:
-
-  ; GP32:         not     $2, $4
-
-  ; GP64:         not     $2, $4
-
-  ; MM:           not16   $2, $4
-
   %r = xor i8 %a, -1
   ret i8 %r
 }
 
 define signext i16 @not_i16(i16 signext %a) {
+; GP32-LABEL: not_i16:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    not $2, $4
+;
+; GP64-LABEL: not_i16:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    not $2, $4
+;
+; MM-LABEL: not_i16:
+; MM:       # %bb.0: # %entry
+; MM-NEXT:    not16 $2, $4
+; MM-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: not_i16:
-
-  ; GP32:         not     $2, $4
-
-  ; GP64:         not     $2, $4
-
-  ; MM:           not16   $2, $4
-
   %r = xor i16 %a, -1
   ret i16 %r
 }
 
 define signext i32 @not_i32(i32 signext %a) {
+; GP32-LABEL: not_i32:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    not $2, $4
+;
+; GP64-LABEL: not_i32:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    daddiu $1, $zero, -1
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    xor $2, $4, $1
+;
+; MM-LABEL: not_i32:
+; MM:       # %bb.0: # %entry
+; MM-NEXT:    not16 $2, $4
+; MM-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: not_i32:
-
-  ; GP32:         not     $2, $4
-
-  ; GP64:         not     $1, $4
-  ; GP64:         sll     $2, $1, 0
-
-  ; MM:           not16   $2, $4
-
   %r = xor i32 %a, -1
   ret i32 %r
 }
 
 define signext i64 @not_i64(i64 signext %a) {
+; GP32-LABEL: not_i64:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    not $2, $4
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    not $3, $5
+;
+; GP64-LABEL: not_i64:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    daddiu $1, $zero, -1
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    xor $2, $4, $1
+;
+; MM-LABEL: not_i64:
+; MM:       # %bb.0: # %entry
+; MM-NEXT:    not16 $2, $4
+; MM-NEXT:    not16 $3, $5
+; MM-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: not_i64:
-
-  ; GP32:         not     $2, $4
-  ; GP32:         not     $3, $5
-
-  ; GP64:         daddiu  $[[T0:[0-9]+]], $zero, -1
-  ; GP64:         xor     $2, $4, $[[T0]]
-
-  ; MM32:         not16   $2, $4
-  ; MM32:         not16   $3, $5
-
   %r = xor i64 %a, -1
   ret i64 %r
 }
 
 define signext i128 @not_i128(i128 signext %a) {
+; GP32-LABEL: not_i128:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    not $2, $4
+; GP32-NEXT:    not $3, $5
+; GP32-NEXT:    not $4, $6
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    not $5, $7
+;
+; GP64-LABEL: not_i128:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    daddiu $1, $zero, -1
+; GP64-NEXT:    xor $2, $4, $1
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    xor $3, $5, $1
+;
+; MM-LABEL: not_i128:
+; MM:       # %bb.0: # %entry
+; MM-NEXT:    not16 $2, $4
+; MM-NEXT:    not16 $3, $5
+; MM-NEXT:    not16 $4, $6
+; MM-NEXT:    not16 $5, $7
+; MM-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: not_i128:
-
-  ; GP32:         not     $2, $4
-  ; GP32:         not     $3, $5
-  ; GP32:         not     $4, $6
-  ; GP32:         not     $5, $7
-
-  ; GP64:         daddiu  $[[T0:[0-9]+]], $zero, -1
-  ; GP64:         xor     $2, $4, $[[T0]]
-  ; GP64:         xor     $3, $5, $[[T0]]
-
-  ; MM32:         not16   $2, $4
-  ; MM32:         not16   $3, $5
-  ; MM32:         not16   $4, $6
-  ; MM32:         not16   $5, $7
-
   %r = xor i128 %a, -1
   ret i128 %r
 }
 
 define signext i1 @nor_i1(i1 signext %a, i1 signext %b) {
+; GP32-LABEL: nor_i1:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    nor $2, $5, $4
+;
+; GP64-LABEL: nor_i1:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    or $1, $5, $4
+; GP64-NEXT:    sll $1, $1, 0
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    not $2, $1
+;
+; MM32r3-LABEL: nor_i1:
+; MM32r3:       # %bb.0: # %entry
+; MM32r3-NEXT:    jr $ra
+; MM32r3-NEXT:    nor $2, $5, $4
+;
+; MM32r6-LABEL: nor_i1:
+; MM32r6:       # %bb.0: # %entry
+; MM32r6-NEXT:    nor $2, $5, $4
+; MM32r6-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: nor_i1:
-
-  ; GP32:         nor     $2, $5, $4
-  ; GP64:         or      $1, $5, $4
-  ; MM32:         nor     $2, $5, $4
-
   %or = or i1 %b, %a
   %r = xor i1 %or, -1
   ret i1 %r
 }
 
 define signext i8 @nor_i8(i8 signext %a, i8 signext %b) {
+; GP32-LABEL: nor_i8:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    nor $2, $5, $4
+;
+; GP64-LABEL: nor_i8:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    or $1, $5, $4
+; GP64-NEXT:    sll $1, $1, 0
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    not $2, $1
+;
+; MM32r3-LABEL: nor_i8:
+; MM32r3:       # %bb.0: # %entry
+; MM32r3-NEXT:    jr $ra
+; MM32r3-NEXT:    nor $2, $5, $4
+;
+; MM32r6-LABEL: nor_i8:
+; MM32r6:       # %bb.0: # %entry
+; MM32r6-NEXT:    nor $2, $5, $4
+; MM32r6-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: nor_i8:
-
-  ; GP32:         nor     $2, $5, $4
-  ; GP64:         or      $1, $5, $4
-  ; MM32:         nor     $2, $5, $4
-
   %or = or i8 %b, %a
   %r = xor i8 %or, -1
   ret i8 %r
 }
 
 define signext i16 @nor_i16(i16 signext %a, i16 signext %b) {
+; GP32-LABEL: nor_i16:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    nor $2, $5, $4
+;
+; GP64-LABEL: nor_i16:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    or $1, $5, $4
+; GP64-NEXT:    sll $1, $1, 0
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    not $2, $1
+;
+; MM32r3-LABEL: nor_i16:
+; MM32r3:       # %bb.0: # %entry
+; MM32r3-NEXT:    jr $ra
+; MM32r3-NEXT:    nor $2, $5, $4
+;
+; MM32r6-LABEL: nor_i16:
+; MM32r6:       # %bb.0: # %entry
+; MM32r6-NEXT:    nor $2, $5, $4
+; MM32r6-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: nor_i16:
-
-  ; GP32:         nor     $2, $5, $4
-  ; GP64:         or      $1, $5, $4
-  ; MM32:         nor     $2, $5, $4
-
   %or = or i16 %b, %a
   %r = xor i16 %or, -1
   ret i16 %r
 }
 
 define signext i32 @nor_i32(i32 signext %a, i32 signext %b) {
+; GP32-LABEL: nor_i32:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    nor $2, $5, $4
+;
+; GP64-LABEL: nor_i32:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    nor $2, $5, $4
+;
+; MM32r3-LABEL: nor_i32:
+; MM32r3:       # %bb.0: # %entry
+; MM32r3-NEXT:    jr $ra
+; MM32r3-NEXT:    nor $2, $5, $4
+;
+; MM32r6-LABEL: nor_i32:
+; MM32r6:       # %bb.0: # %entry
+; MM32r6-NEXT:    nor $2, $5, $4
+; MM32r6-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: nor_i32:
-
-  ; GP32:         nor     $2, $5, $4
-
-  ; GP64:         or      $[[T0:[0-9]+]], $5, $4
-  ; GP64:         sll     $[[T1:[0-9]+]], $[[T0]], 0
-  ; GP64:         not     $[[T2:[0-9]+]], $[[T1]]
-  ; GP64:         sll     $2, $[[T2]], 0
-
-  ; MM32:         nor     $2, $5, $4
-
   %or = or i32 %b, %a
   %r = xor i32 %or, -1
   ret i32 %r
 }
 
-
 define signext i64 @nor_i64(i64 signext %a, i64 signext %b) {
+; GP32-LABEL: nor_i64:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    nor $2, $6, $4
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    nor $3, $7, $5
+;
+; GP64-LABEL: nor_i64:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    nor $2, $5, $4
+;
+; MM32r3-LABEL: nor_i64:
+; MM32r3:       # %bb.0: # %entry
+; MM32r3-NEXT:    nor $2, $6, $4
+; MM32r3-NEXT:    jr $ra
+; MM32r3-NEXT:    nor $3, $7, $5
+;
+; MM32r6-LABEL: nor_i64:
+; MM32r6:       # %bb.0: # %entry
+; MM32r6-NEXT:    nor $2, $6, $4
+; MM32r6-NEXT:    nor $3, $7, $5
+; MM32r6-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: nor_i64:
-
-  ; GP32:         nor     $2, $6, $4
-  ; GP32:         nor     $3, $7, $5
-
-  ; GP64:         nor     $2, $5, $4
-
-  ; MM32:         nor     $2, $6, $4
-  ; MM32:         nor     $3, $7, $5
-
   %or = or i64 %b, %a
   %r = xor i64 %or, -1
   ret i64 %r
 }
 
 define signext i128 @nor_i128(i128 signext %a, i128 signext %b) {
+; GP32-LABEL: nor_i128:
+; GP32:       # %bb.0: # %entry
+; GP32-NEXT:    lw $1, 20($sp)
+; GP32-NEXT:    lw $2, 16($sp)
+; GP32-NEXT:    nor $2, $2, $4
+; GP32-NEXT:    nor $3, $1, $5
+; GP32-NEXT:    lw $1, 24($sp)
+; GP32-NEXT:    nor $4, $1, $6
+; GP32-NEXT:    lw $1, 28($sp)
+; GP32-NEXT:    jr $ra
+; GP32-NEXT:    nor $5, $1, $7
+;
+; GP64-LABEL: nor_i128:
+; GP64:       # %bb.0: # %entry
+; GP64-NEXT:    nor $2, $6, $4
+; GP64-NEXT:    jr $ra
+; GP64-NEXT:    nor $3, $7, $5
+;
+; MM32r3-LABEL: nor_i128:
+; MM32r3:       # %bb.0: # %entry
+; MM32r3-NEXT:    lw $1, 20($sp)
+; MM32r3-NEXT:    lw $2, 16($sp)
+; MM32r3-NEXT:    nor $2, $2, $4
+; MM32r3-NEXT:    nor $3, $1, $5
+; MM32r3-NEXT:    lw $1, 24($sp)
+; MM32r3-NEXT:    nor $4, $1, $6
+; MM32r3-NEXT:    lw $1, 28($sp)
+; MM32r3-NEXT:    jr $ra
+; MM32r3-NEXT:    nor $5, $1, $7
+;
+; MM32r6-LABEL: nor_i128:
+; MM32r6:       # %bb.0: # %entry
+; MM32r6-NEXT:    lw $1, 20($sp)
+; MM32r6-NEXT:    lw $2, 16($sp)
+; MM32r6-NEXT:    nor $2, $2, $4
+; MM32r6-NEXT:    nor $3, $1, $5
+; MM32r6-NEXT:    lw $1, 24($sp)
+; MM32r6-NEXT:    nor $4, $1, $6
+; MM32r6-NEXT:    lw $1, 28($sp)
+; MM32r6-NEXT:    nor $5, $1, $7
+; MM32r6-NEXT:    jrc $ra
 entry:
-; ALL-LABEL: nor_i128:
-
-  ; GP32:         lw      $[[T1:[0-9]+]], 20($sp)
-  ; GP32:         lw      $[[T2:[0-9]+]], 16($sp)
-  ; GP32:         nor     $2, $[[T2]], $4
-  ; GP32:         nor     $3, $[[T1]], $5
-  ; GP32:         lw      $[[T0:[0-9]+]], 24($sp)
-  ; GP32:         nor     $4, $[[T0]], $6
-  ; GP32:         lw      $[[T3:[0-9]+]], 28($sp)
-  ; GP32:         nor     $5, $[[T3]], $7
-
-  ; GP64:         nor     $2, $6, $4
-  ; GP64:         nor     $3, $7, $5
-
-  ; MM32:         lw      $[[T1:[0-9]+]], 20($sp)
-  ; MM32:         lw      $[[T2:[0-9]+]], 16($sp)
-  ; MM32:         nor     $2, $[[T2]], $4
-  ; MM32:         nor     $3, $[[T1]], $5
-  ; MM32:         lw      $[[T0:[0-9]+]], 24($sp)
-  ; MM32:         nor     $4, $[[T0]], $6
-  ; MM32:         lw      $[[T3:[0-9]+]], 28($sp)
-  ; MM32:         nor     $5, $[[T3]], $7
-
   %or = or i128 %b, %a
   %r = xor i128 %or, -1
   ret i128 %r
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/xor.ll b/llvm/test/CodeGen/Mips/llvm-ir/xor.ll
index 972e3b6..ec9a204 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/xor.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/xor.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips2 | FileCheck %s -check-prefix=MIPS
 ; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32 | FileCheck %s -check-prefix=MIPS
 ; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r2 | FileCheck %s \
@@ -508,21 +507,18 @@ define signext i32 @xor_i32_4(i32 signext %b) {
 ;
 ; MIPS64-LABEL: xor_i32_4:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    xori $1, $4, 4
 ; MIPS64-NEXT:    jr $ra
-; MIPS64-NEXT:    sll $2, $1, 0
+; MIPS64-NEXT:    xori $2, $4, 4
 ;
 ; MIPS64R2-LABEL: xor_i32_4:
 ; MIPS64R2:       # %bb.0: # %entry
-; MIPS64R2-NEXT:    xori $1, $4, 4
 ; MIPS64R2-NEXT:    jr $ra
-; MIPS64R2-NEXT:    sll $2, $1, 0
+; MIPS64R2-NEXT:    xori $2, $4, 4
 ;
 ; MIPS64R6-LABEL: xor_i32_4:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    xori $1, $4, 4
 ; MIPS64R6-NEXT:    jr $ra
-; MIPS64R6-NEXT:    sll $2, $1, 0
+; MIPS64R6-NEXT:    xori $2, $4, 4
 ;
 ; MM32R3-LABEL: xor_i32_4:
 ; MM32R3:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/Mips/msa/arithmetic.ll b/llvm/test/CodeGen/Mips/msa/arithmetic.ll
index a262ce1..ad0493b 100644
--- a/llvm/test/CodeGen/Mips/msa/arithmetic.ll
+++ b/llvm/test/CodeGen/Mips/msa/arithmetic.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=mips -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s --check-prefixes=ALL,MIPS
 ; RUN: llc -mtriple=mipsel -mattr=+msa,+fp64,+mips32r2 < %s | FileCheck %s --check-prefixes=ALL,MIPSEL
+; RUN: llc -mtriple=mips64 -mcpu=i6500 < %s | FileCheck %s --check-prefixes=ALL
+; RUN: llc -mtriple=mips64 -mcpu=i6400 < %s | FileCheck %s --check-prefixes=ALL
 
 define void @add_v16i8(ptr %c, ptr %a, ptr %b) nounwind {
 ; ALL-LABEL: add_v16i8:
diff --git a/llvm/test/CodeGen/Mips/xor-and.ll b/llvm/test/CodeGen/Mips/xor-and.ll
index 3a173ba..efe0af1f 100644
--- a/llvm/test/CodeGen/Mips/xor-and.ll
+++ b/llvm/test/CodeGen/Mips/xor-and.ll
@@ -1,17 +1,45 @@
-; RUN: llc -O3 -mcpu=mips64r6 -mtriple=mips64el-unknown-linux-gnuabi64 < %s -o - | FileCheck %s
+; RUN: llc -O3 -mcpu=mips64 -mtriple=mips64el-unknown-linux-gnuabi64 < %s -o - | FileCheck %s
 
-; This test shows the unoptimized result with unnecessary SLLs.
 define noundef signext i32 @xor_and(i32 noundef signext %a, i32 noundef signext %b) local_unnamed_addr {
 ; CHECK-LABEL: xor_and:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    and $1, $5, $4
-; CHECK-NEXT:    sll $1, $1, 0
-; CHECK-NEXT:    not $1, $1
+; CHECK-NEXT:    daddiu $2, $zero, -1
 ; CHECK-NEXT:    jr $ra
-; CHECK-NEXT:    sll $2, $1, 0
+; CHECK-NEXT:    xor $2, $1, $2
 
 entry:
   %0 = and i32 %b, %a
   %or1 = xor i32 %0, -1
   ret i32 %or1
 }
+
+define noundef signext i32 @input_i16(i16 noundef signext %a, i16 noundef signext %b) local_unnamed_addr {
+; CHECK-LABEL: input_i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    and $1, $5, $4
+; CHECK-NEXT:    daddiu $2, $zero, -1
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    xor $2, $1, $2
+
+entry:
+  %0 = and i16 %b, %a
+  %1 = xor i16 %0, -1
+  %or4 = sext i16 %1 to i32
+  ret i32 %or4
+}
+
+define i64 @return_i64(i32 noundef signext %a, i32 noundef signext %b) local_unnamed_addr {
+; CHECK-LABEL: return_i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    and $1, $5, $4
+; CHECK-NEXT:    daddiu $2, $zero, -1
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    xor $2, $1, $2
+
+entry:
+  %0 = and i32 %b, %a
+  %or1 = xor i32 %0, -1
+  %conv = sext i32 %or1 to i64
+  ret i64 %conv
+}
diff --git a/llvm/test/CodeGen/NVPTX/NVPTXAA_before_BasicAA.ll b/llvm/test/CodeGen/NVPTX/NVPTXAA_before_BasicAA.ll
new file mode 100644
index 0000000..0d16b34
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/NVPTXAA_before_BasicAA.ll
@@ -0,0 +1,17 @@
+; REQUIRES: asserts
+; RUN: opt -aa-pipeline=default -passes='require<aa>' -debug-pass-manager -disable-output -S < %s 2>&1 | FileCheck %s
+; RUN: llc --debug-only='aa' -o /dev/null %s 2>&1 | FileCheck %s -check-prefix=LEGACY
+
+; In default AA pipeline, NVPTXAA should run before BasicAA to reduce compile time for NVPTX backend
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK: Running analysis: NVPTXAA on foo
+; CHECK-NEXT: Running analysis: BasicAA on foo
+
+; LEGACY: AAResults register Early ExternalAA: NVPTX Address space based Alias Analysis Wrapper
+; LEGACY-NEXT: AAResults register BasicAA
+define void @foo(){
+entry:
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/RISCV/instruction-count-remark.mir b/llvm/test/CodeGen/RISCV/instruction-count-remark.mir
index 4f429ab..f39a6ea 100644
--- a/llvm/test/CodeGen/RISCV/instruction-count-remark.mir
+++ b/llvm/test/CodeGen/RISCV/instruction-count-remark.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=riscv32 -verify-machineinstrs -start-before=riscv-expand-pseudo -simplify-mir -o /dev/null -pass-remarks-analysis=asm-printer %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=riscv32 -verify-machineinstrs -start-before=riscv-asm-printer -simplify-mir -o /dev/null -pass-remarks-analysis=asm-printer %s 2>&1 | FileCheck %s
 ---
 name: instrs
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index f6bdd45..8ac4c744 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -176,6 +176,241 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i
   ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res7
 }
 
+define {<4 x i32>, <4 x i32>} @vpload_factor2(ptr %ptr) {
+; CHECK-LABEL: vpload_factor2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg2e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8)
+  %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  ret {<4 x i32>, <4 x i32>} %res1
+}
+
+
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3(ptr %ptr) {
+; CHECK-LABEL: vpload_factor3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg3e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12)
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+; We only extract some of the fields.
+define {<4 x i32>, <4 x i32>} @vpload_factor3_partial(ptr %ptr) {
+; CHECK-LABEL: vpload_factor3_partial:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg3e32.v v7, (a0)
+; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12)
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>} poison, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v2, 1
+  ret {<4 x i32>, <4 x i32>} %res1
+}
+
+; Load a larger vector but only deinterleave a subset of the elements.
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) {
+; CHECK-LABEL: vpload_factor3_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg3e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <16 x i32> @llvm.vp.load.v16i32.p0(ptr %ptr, <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, i32 12)
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+; Make sure the mask is propagated.
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask(ptr %ptr) {
+; CHECK-LABEL: vpload_factor3_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 10
+; CHECK-NEXT:    vlseg3e32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, i32 12)
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+; Poison/undef in the shuffle mask shouldn't affect anything.
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr %ptr) {
+; CHECK-LABEL: vpload_factor3_poison_shufflemask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 10
+; CHECK-NEXT:    vlseg3e32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, i32 12)
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 poison, i32 10>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) {
+; CHECK-LABEL: vpload_factor4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <16 x i32> @llvm.vp.load.v16i32.p0(ptr %ptr, <16 x i1> splat (i1 true), i32 16)
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3
+  ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3
+}
+
+; TODO: Add more tests for vp.load/store + (de)interleave intrinsics with fixed vectors.
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vpload_factor4_intrinsics(ptr %ptr) {
+; CHECK-LABEL: vpload_factor4_intrinsics:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8)
+  %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load)
+  %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0
+  %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1
+  %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0)
+  %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0
+  %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1
+  %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1)
+  %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0
+  %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1
+
+  %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0
+  %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1
+  %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2
+  %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3
+  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3
+}
+
+define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor5(ptr %ptr) {
+; CHECK-LABEL: vpload_factor5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1> splat (i1 true), i32 20)
+  %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
+  %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
+  %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
+  %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
+  %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3
+  %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4
+  ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4
+}
+
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor6(ptr %ptr) {
+; CHECK-LABEL: vpload_factor6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vlseg6e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <12 x i16> @llvm.vp.load.v12i16.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12)
+  %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
+  %v1 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 1, i32 7>
+  %v2 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 2, i32 8>
+  %v3 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 3, i32 9>
+  %v4 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 4, i32 10>
+  %v5 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 5, i32 11>
+  %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0
+  %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1
+  %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2
+  %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3
+  %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4
+  %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5
+  ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5
+}
+
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor7(ptr %ptr) {
+; CHECK-LABEL: vpload_factor7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vlseg7e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <14 x i16> @llvm.vp.load.v14i16.p0(ptr %ptr, <14 x i1> splat (i1 true), i32 14)
+  %v0 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 0, i32 7>
+  %v1 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 1, i32 8>
+  %v2 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 2, i32 9>
+  %v3 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 3, i32 10>
+  %v4 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 4, i32 11>
+  %v5 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 5, i32 12>
+  %v6 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 6, i32 13>
+  %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0
+  %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1
+  %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2
+  %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3
+  %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4
+  %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5
+  %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6
+  ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6
+}
+
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor8(ptr %ptr) {
+; CHECK-LABEL: vpload_factor8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vlseg8e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <16 x i16> @llvm.vp.load.v16i16.p0(ptr %ptr, <16 x i1> splat (i1 true), i32 16)
+  %v0 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 0, i32 8>
+  %v1 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 1, i32 9>
+  %v2 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 2, i32 10>
+  %v3 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 3, i32 11>
+  %v4 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 4, i32 12>
+  %v5 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 5, i32 13>
+  %v6 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 6, i32 14>
+  %v7 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 7, i32 15>
+  %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0
+  %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1
+  %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2
+  %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3
+  %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4
+  %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5
+  %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6
+  %res7 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6, <2 x i16> %v7, 7
+  ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res7
+}
+
 ; LMUL * NF is > 8 here and so shouldn't be lowered to a vlseg
 define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_factor6_too_big(ptr %ptr) {
 ; RV32-LABEL: load_factor6_too_big:
@@ -192,8 +427,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    lui a3, 12
 ; RV32-NEXT:    lui a6, 12291
-; RV32-NEXT:    lui a7, %hi(.LCPI8_0)
-; RV32-NEXT:    addi a7, a7, %lo(.LCPI8_0)
+; RV32-NEXT:    lui a7, %hi(.LCPI20_0)
+; RV32-NEXT:    addi a7, a7, %lo(.LCPI20_0)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v24, (a5)
 ; RV32-NEXT:    vmv.s.x v0, a3
@@ -278,12 +513,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
 ; RV32-NEXT:    lui a7, 49164
-; RV32-NEXT:    lui a1, %hi(.LCPI8_1)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_1)
+; RV32-NEXT:    lui a1, %hi(.LCPI20_1)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI20_1)
 ; RV32-NEXT:    lui t2, 3
 ; RV32-NEXT:    lui t1, 196656
-; RV32-NEXT:    lui a4, %hi(.LCPI8_3)
-; RV32-NEXT:    addi a4, a4, %lo(.LCPI8_3)
+; RV32-NEXT:    lui a4, %hi(.LCPI20_3)
+; RV32-NEXT:    addi a4, a4, %lo(.LCPI20_3)
 ; RV32-NEXT:    lui t0, 786624
 ; RV32-NEXT:    li a5, 48
 ; RV32-NEXT:    lui a6, 768
@@ -462,8 +697,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v24, v8, v2
-; RV32-NEXT:    lui a1, %hi(.LCPI8_2)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_2)
+; RV32-NEXT:    lui a1, %hi(.LCPI20_2)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI20_2)
 ; RV32-NEXT:    lui a3, 3073
 ; RV32-NEXT:    addi a3, a3, -1024
 ; RV32-NEXT:    vmv.s.x v0, a3
@@ -527,16 +762,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vrgatherei16.vv v28, v8, v3
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v28, v24
-; RV32-NEXT:    lui a1, %hi(.LCPI8_4)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_4)
-; RV32-NEXT:    lui a2, %hi(.LCPI8_5)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI8_5)
+; RV32-NEXT:    lui a1, %hi(.LCPI20_4)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI20_4)
+; RV32-NEXT:    lui a2, %hi(.LCPI20_5)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI20_5)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v24, (a2)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI8_7)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_7)
+; RV32-NEXT:    lui a1, %hi(.LCPI20_7)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI20_7)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle16.v v10, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
@@ -564,14 +799,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v16, v0, v10
-; RV32-NEXT:    lui a1, %hi(.LCPI8_6)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_6)
-; RV32-NEXT:    lui a2, %hi(.LCPI8_8)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI8_8)
+; RV32-NEXT:    lui a1, %hi(.LCPI20_6)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI20_6)
+; RV32-NEXT:    lui a2, %hi(.LCPI20_8)
+; RV32-NEXT:    addi a2, a2, %lo(.LCPI20_8)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v4, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI8_9)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_9)
+; RV32-NEXT:    lui a1, %hi(.LCPI20_9)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI20_9)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v6, (a1)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
@@ -658,8 +893,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    li a4, 128
 ; RV64-NEXT:    lui a1, 1
 ; RV64-NEXT:    vle64.v v8, (a3)
-; RV64-NEXT:    lui a3, %hi(.LCPI8_0)
-; RV64-NEXT:    addi a3, a3, %lo(.LCPI8_0)
+; RV64-NEXT:    lui a3, %hi(.LCPI20_0)
+; RV64-NEXT:    addi a3, a3, %lo(.LCPI20_0)
 ; RV64-NEXT:    vmv.s.x v0, a4
 ; RV64-NEXT:    csrr a4, vlenb
 ; RV64-NEXT:    li a5, 61
@@ -847,8 +1082,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vslideup.vi v12, v16, 1, v0.t
-; RV64-NEXT:    lui a2, %hi(.LCPI8_1)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI8_1)
+; RV64-NEXT:    lui a2, %hi(.LCPI20_1)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI20_1)
 ; RV64-NEXT:    li a3, 192
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v6, (a2)
@@ -882,8 +1117,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vrgatherei16.vv v24, v16, v6
 ; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    lui a2, %hi(.LCPI8_2)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI8_2)
+; RV64-NEXT:    lui a2, %hi(.LCPI20_2)
+; RV64-NEXT:    addi a2, a2, %lo(.LCPI20_2)
 ; RV64-NEXT:    li a3, 1040
 ; RV64-NEXT:    vmv.s.x v0, a3
 ; RV64-NEXT:    addi a1, a1, -2016
@@ -967,12 +1202,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT:    lui a1, %hi(.LCPI8_3)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_3)
+; RV64-NEXT:    lui a1, %hi(.LCPI20_3)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI20_3)
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v20, (a1)
-; RV64-NEXT:    lui a1, %hi(.LCPI8_4)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_4)
+; RV64-NEXT:    lui a1, %hi(.LCPI20_4)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI20_4)
 ; RV64-NEXT:    vle16.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 77
@@ -1023,8 +1258,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vrgatherei16.vv v0, v16, v8
-; RV64-NEXT:    lui a1, %hi(.LCPI8_5)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_5)
+; RV64-NEXT:    lui a1, %hi(.LCPI20_5)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI20_5)
 ; RV64-NEXT:    vle16.v v20, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 61
@@ -1196,6 +1431,154 @@ define void @store_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2
   ret void
 }
 
+define void @store_factor7(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6) {
+; CHECK-LABEL: store_factor7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg7e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
+  store <14 x i16> %interleaved.vec, ptr %ptr
+  ret void
+}
+
+define void @store_factor8(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i16> %v7) {
+; CHECK-LABEL: store_factor8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> %v7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <16 x i16> %interleaved.vec, ptr %ptr
+  ret void
+}
+
+define void @vpstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: vpstore_factor2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg2e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  tail call void @llvm.vp.store.v8i32.p0(<8 x i32> %interleaved.vec, ptr %ptr, <8 x i1> splat (i1 true), i32 8)
+  ret void
+}
+
+define void @vpstore_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: vpstore_factor3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg3e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> splat (i1 true), i32 12)
+  ret void
+}
+
+define void @vpstore_factor3_mask(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: vpstore_factor3_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 5
+; CHECK-NEXT:    vsseg3e32.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, i32 12)
+  ret void
+}
+
+define void @vpstore_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; CHECK-LABEL: vpstore_factor4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  tail call void @llvm.vp.store.v16i32.p0(<16 x i32> %interleaved.vec, ptr %ptr, <16 x i1> splat (i1 true), i32 16)
+  ret void
+}
+
+define void @vpstore_factor5(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
+; CHECK-LABEL: vpstore_factor5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg5e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s2 = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s3 = shufflevector <4 x i32> %v4, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <16 x i32> %s2, <16 x i32> %s3, <20 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 1, i32 5, i32 9, i32 13, i32 17, i32 2, i32 6, i32 10, i32 14, i32 18, i32 3, i32 7, i32 11, i32 15, i32 19>
+  tail call void @llvm.vp.store.v20i32.p0(<20 x i32> %interleaved.vec, ptr %ptr, <20 x i1> splat (i1 true), i32 20)
+  ret void
+}
+
+define void @vpstore_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5) {
+; CHECK-LABEL: vpstore_factor6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg6e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s3 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i16> %s2, <8 x i16> %s3, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+  tail call void @llvm.vp.store.v12i16.p0(<12 x i16> %interleaved.vec, ptr %ptr, <12 x i1> splat (i1 true), i32 12)
+  ret void
+}
+
+define void @vpstore_factor7(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6) {
+; CHECK-LABEL: vpstore_factor7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg7e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
+  tail call void @llvm.vp.store.v14i16.p0(<14 x i16> %interleaved.vec, ptr %ptr, <14 x i1> splat (i1 true), i32 14)
+  ret void
+}
+
+define void @vpstore_factor8(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i16> %v7) {
+; CHECK-LABEL: vpstore_factor8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> %v7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  tail call void @llvm.vp.store.v16i16.p0(<16 x i16> %interleaved.vec, ptr %ptr, <16 x i1> splat (i1 true), i32 16)
+  ret void
+}
 
 define <4 x i32> @load_factor2_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor2_one_active:
@@ -1368,3 +1751,157 @@ define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) {
   store <16 x i32> %v0, ptr %ptr
   ret void
 }
+
+; Negative tests
+
+define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
+; RV32-LABEL: invalid_vp_mask:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 73
+; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v11, a1
+; RV32-NEXT:    lui a1, 1
+; RV32-NEXT:    vmv.v.i v10, 8
+; RV32-NEXT:    addi a1, a1, -43
+; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    li a1, 146
+; RV32-NEXT:    vsetivli zero, 12, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v12, (a0), v0.t
+; RV32-NEXT:    li a0, 36
+; RV32-NEXT:    vmv.s.x v20, a1
+; RV32-NEXT:    lui a1, %hi(.LCPI49_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI49_0)
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vle16.v v21, (a1)
+; RV32-NEXT:    vcompress.vm v8, v12, v11
+; RV32-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v16, v12, 8
+; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT:    vrgather.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vcompress.vm v14, v12, v20
+; RV32-NEXT:    vrgather.vi v14, v16, 2, v0.t
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmerge.vvm v12, v16, v12, v0
+; RV32-NEXT:    vrgatherei16.vv v10, v12, v21
+; RV32-NEXT:    vmv1r.v v9, v14
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: invalid_vp_mask:
+; RV64:       # %bb.0:
+; RV64-NEXT:    li a1, 73
+; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v11, a1
+; RV64-NEXT:    li a1, 146
+; RV64-NEXT:    vmv.s.x v20, a1
+; RV64-NEXT:    lui a1, 1
+; RV64-NEXT:    vmv.v.i v10, 8
+; RV64-NEXT:    addi a1, a1, -43
+; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    li a1, 36
+; RV64-NEXT:    vsetivli zero, 12, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v12, (a0), v0.t
+; RV64-NEXT:    li a0, 3
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    addi a0, a0, 5
+; RV64-NEXT:    slli a0, a0, 16
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vcompress.vm v8, v12, v11
+; RV64-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
+; RV64-NEXT:    vslidedown.vi v16, v12, 8
+; RV64-NEXT:    vmv1r.v v0, v10
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT:    vrgather.vi v8, v16, 1, v0.t
+; RV64-NEXT:    vcompress.vm v14, v12, v20
+; RV64-NEXT:    vrgather.vi v14, v16, 2, v0.t
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    addi a0, a0, 2
+; RV64-NEXT:    vmerge.vvm v12, v16, v12, v0
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a0
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vrgatherei16.vv v10, v12, v9
+; RV64-NEXT:    vmv1r.v v9, v14
+; RV64-NEXT:    ret
+  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i32 12)
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
+; RV32-LABEL: invalid_vp_evl:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 10, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v12, (a0)
+; RV32-NEXT:    li a0, 73
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vmv.v.i v0, 8
+; RV32-NEXT:    vmv.s.x v10, a0
+; RV32-NEXT:    li a0, 146
+; RV32-NEXT:    vmv.s.x v11, a0
+; RV32-NEXT:    lui a0, %hi(.LCPI50_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI50_0)
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vle16.v v20, (a0)
+; RV32-NEXT:    li a0, 36
+; RV32-NEXT:    vcompress.vm v8, v12, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v16, v12, 8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT:    vrgather.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vcompress.vm v14, v12, v11
+; RV32-NEXT:    vrgather.vi v14, v16, 2, v0.t
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vmerge.vvm v12, v16, v12, v0
+; RV32-NEXT:    vrgatherei16.vv v10, v12, v20
+; RV32-NEXT:    vmv1r.v v9, v14
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: invalid_vp_evl:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 10, e32, m4, ta, ma
+; RV64-NEXT:    vle32.v v12, (a0)
+; RV64-NEXT:    li a0, 73
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 8
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    li a0, 146
+; RV64-NEXT:    vmv.s.x v11, a0
+; RV64-NEXT:    li a0, 36
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vcompress.vm v8, v12, v10
+; RV64-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
+; RV64-NEXT:    vslidedown.vi v16, v12, 8
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT:    vrgather.vi v8, v16, 1, v0.t
+; RV64-NEXT:    vcompress.vm v14, v12, v11
+; RV64-NEXT:    vrgather.vi v14, v16, 2, v0.t
+; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    li a0, 3
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    addi a0, a0, 5
+; RV64-NEXT:    slli a0, a0, 16
+; RV64-NEXT:    addi a0, a0, 2
+; RV64-NEXT:    vmerge.vvm v12, v16, v12, v0
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v9, a0
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vrgatherei16.vv v10, v12, v9
+; RV64-NEXT:    vmv1r.v v9, v14
+; RV64-NEXT:    ret
+  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 10)
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+  %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+  %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+  ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll
index 25192ea..edc9886 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll
@@ -1,21 +1,31 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NODOT
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NODOT
+; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT64
 
 define i32 @vqdot_vv(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: vqdot_vv:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vsext.vf2 v14, v9
-; CHECK-NEXT:    vwmul.vv v8, v12, v14
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.s.x v12, zero
-; CHECK-NEXT:    vredsum.vs v8, v8, v12
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
+; NODOT-LABEL: vqdot_vv:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; NODOT-NEXT:    vsext.vf2 v12, v8
+; NODOT-NEXT:    vsext.vf2 v14, v9
+; NODOT-NEXT:    vwmul.vv v8, v12, v14
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v12, zero
+; NODOT-NEXT:    vredsum.vs v8, v8, v12
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdot_vv:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdot.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, zero
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
 entry:
   %a.sext = sext <16 x i8> %a to <16 x i32>
   %b.sext = sext <16 x i8> %b to <16 x i32>
@@ -63,17 +73,27 @@ entry:
 }
 
 define i32 @vqdotu_vv(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: vqdotu_vv:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vwmulu.vv v10, v8, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vwredsumu.vs v8, v10, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
+; NODOT-LABEL: vqdotu_vv:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; NODOT-NEXT:    vwmulu.vv v10, v8, v9
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v8, zero
+; NODOT-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; NODOT-NEXT:    vwredsumu.vs v8, v10, v8
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdotu_vv:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdotu.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, zero
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
 entry:
   %a.zext = zext <16 x i8> %a to <16 x i32>
   %b.zext = zext <16 x i8> %b to <16 x i32>
@@ -102,17 +122,27 @@ entry:
 }
 
 define i32 @vqdotsu_vv(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: vqdotsu_vv:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vzext.vf2 v14, v9
-; CHECK-NEXT:    vwmulsu.vv v8, v12, v14
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.s.x v12, zero
-; CHECK-NEXT:    vredsum.vs v8, v8, v12
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
+; NODOT-LABEL: vqdotsu_vv:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; NODOT-NEXT:    vsext.vf2 v12, v8
+; NODOT-NEXT:    vzext.vf2 v14, v9
+; NODOT-NEXT:    vwmulsu.vv v8, v12, v14
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v12, zero
+; NODOT-NEXT:    vredsum.vs v8, v8, v12
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdotsu_vv:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdotsu.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, zero
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
 entry:
   %a.sext = sext <16 x i8> %a to <16 x i32>
   %b.zext = zext <16 x i8> %b to <16 x i32>
@@ -122,17 +152,27 @@ entry:
 }
 
 define i32 @vqdotsu_vv_swapped(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: vqdotsu_vv_swapped:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v12, v8
-; CHECK-NEXT:    vzext.vf2 v14, v9
-; CHECK-NEXT:    vwmulsu.vv v8, v12, v14
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.s.x v12, zero
-; CHECK-NEXT:    vredsum.vs v8, v8, v12
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
+; NODOT-LABEL: vqdotsu_vv_swapped:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; NODOT-NEXT:    vsext.vf2 v12, v8
+; NODOT-NEXT:    vzext.vf2 v14, v9
+; NODOT-NEXT:    vwmulsu.vv v8, v12, v14
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v12, zero
+; NODOT-NEXT:    vredsum.vs v8, v8, v12
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdotsu_vv_swapped:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdotsu.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, zero
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
 entry:
   %a.sext = sext <16 x i8> %a to <16 x i32>
   %b.zext = zext <16 x i8> %b to <16 x i32>
@@ -181,31 +221,262 @@ entry:
 }
 
 define i32 @reduce_of_sext(<16 x i8> %a) {
-; CHECK-LABEL: reduce_of_sext:
+; NODOT-LABEL: reduce_of_sext:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT:    vsext.vf4 v12, v8
+; NODOT-NEXT:    vmv.s.x v8, zero
+; NODOT-NEXT:    vredsum.vs v8, v12, v8
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT32-LABEL: reduce_of_sext:
+; DOT32:       # %bb.0: # %entry
+; DOT32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT32-NEXT:    vmv.v.i v9, 0
+; DOT32-NEXT:    lui a0, 4112
+; DOT32-NEXT:    addi a0, a0, 257
+; DOT32-NEXT:    vqdot.vx v9, v8, a0
+; DOT32-NEXT:    vmv.s.x v8, zero
+; DOT32-NEXT:    vredsum.vs v8, v9, v8
+; DOT32-NEXT:    vmv.x.s a0, v8
+; DOT32-NEXT:    ret
+;
+; DOT64-LABEL: reduce_of_sext:
+; DOT64:       # %bb.0: # %entry
+; DOT64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT64-NEXT:    vmv.v.i v9, 0
+; DOT64-NEXT:    lui a0, 4112
+; DOT64-NEXT:    addiw a0, a0, 257
+; DOT64-NEXT:    vqdot.vx v9, v8, a0
+; DOT64-NEXT:    vmv.s.x v8, zero
+; DOT64-NEXT:    vredsum.vs v8, v9, v8
+; DOT64-NEXT:    vmv.x.s a0, v8
+; DOT64-NEXT:    ret
+entry:
+  %a.ext = sext <16 x i8> %a to <16 x i32>
+  %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext)
+  ret i32 %res
+}
+
+define i32 @reduce_of_zext(<16 x i8> %a) {
+; NODOT-LABEL: reduce_of_zext:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT:    vzext.vf4 v12, v8
+; NODOT-NEXT:    vmv.s.x v8, zero
+; NODOT-NEXT:    vredsum.vs v8, v12, v8
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT32-LABEL: reduce_of_zext:
+; DOT32:       # %bb.0: # %entry
+; DOT32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT32-NEXT:    vmv.v.i v9, 0
+; DOT32-NEXT:    lui a0, 4112
+; DOT32-NEXT:    addi a0, a0, 257
+; DOT32-NEXT:    vqdotu.vx v9, v8, a0
+; DOT32-NEXT:    vmv.s.x v8, zero
+; DOT32-NEXT:    vredsum.vs v8, v9, v8
+; DOT32-NEXT:    vmv.x.s a0, v8
+; DOT32-NEXT:    ret
+;
+; DOT64-LABEL: reduce_of_zext:
+; DOT64:       # %bb.0: # %entry
+; DOT64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT64-NEXT:    vmv.v.i v9, 0
+; DOT64-NEXT:    lui a0, 4112
+; DOT64-NEXT:    addiw a0, a0, 257
+; DOT64-NEXT:    vqdotu.vx v9, v8, a0
+; DOT64-NEXT:    vmv.s.x v8, zero
+; DOT64-NEXT:    vredsum.vs v8, v9, v8
+; DOT64-NEXT:    vmv.x.s a0, v8
+; DOT64-NEXT:    ret
+entry:
+  %a.ext = zext <16 x i8> %a to <16 x i32>
+  %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext)
+  ret i32 %res
+}
+
+define i32 @vqdot_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
+; CHECK-LABEL: vqdot_vv_accum:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vsext.vf4 v12, v8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vsext.vf2 v16, v9
+; CHECK-NEXT:    vwmacc.vv v12, v10, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, zero
 ; CHECK-NEXT:    vredsum.vs v8, v12, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 entry:
-  %a.ext = sext <16 x i8> %a to <16 x i32>
-  %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext)
-  ret i32 %res
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %add = add <16 x i32> %mul, %x
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
 }
 
-define i32 @reduce_of_zext(<16 x i8> %a) {
-; CHECK-LABEL: reduce_of_zext:
+define i32 @vqdotu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
+; CHECK-LABEL: vqdotu_vv_accum:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vzext.vf4 v12, v8
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vwmulu.vv v10, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vwaddu.wv v12, v12, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, zero
 ; CHECK-NEXT:    vredsum.vs v8, v12, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 entry:
-  %a.ext = zext <16 x i8> %a to <16 x i32>
-  %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext)
-  ret i32 %res
+  %a.zext = zext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.zext, %b.zext
+  %add = add <16 x i32> %mul, %x
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}
+
+define i32 @vqdotsu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
+; CHECK-LABEL: vqdotsu_vv_accum:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vzext.vf2 v16, v9
+; CHECK-NEXT:    vwmaccsu.vv v12, v10, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vredsum.vs v8, v12, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.zext
+  %add = add <16 x i32> %mul, %x
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}
+
+define i32 @vqdot_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
+; NODOT-LABEL: vqdot_vv_scalar_add:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; NODOT-NEXT:    vsext.vf2 v12, v8
+; NODOT-NEXT:    vsext.vf2 v14, v9
+; NODOT-NEXT:    vwmul.vv v8, v12, v14
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v12, a0
+; NODOT-NEXT:    vredsum.vs v8, v8, v12
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdot_vv_scalar_add:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdot.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, a0
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
+  %add = add i32 %sum, %x
+  ret i32 %add
+}
+
+define i32 @vqdotu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
+; NODOT-LABEL: vqdotu_vv_scalar_add:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; NODOT-NEXT:    vwmulu.vv v10, v8, v9
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v8, a0
+; NODOT-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; NODOT-NEXT:    vwredsumu.vs v8, v10, v8
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdotu_vv_scalar_add:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdotu.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, a0
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
+entry:
+  %a.zext = zext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.zext, %b.zext
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
+  %add = add i32 %sum, %x
+  ret i32 %add
+}
+
+define i32 @vqdotsu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
+; NODOT-LABEL: vqdotsu_vv_scalar_add:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; NODOT-NEXT:    vsext.vf2 v12, v8
+; NODOT-NEXT:    vzext.vf2 v14, v9
+; NODOT-NEXT:    vwmulsu.vv v8, v12, v14
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v12, a0
+; NODOT-NEXT:    vredsum.vs v8, v8, v12
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdotsu_vv_scalar_add:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdotsu.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, a0
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.zext
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
+  %add = add i32 %sum, %x
+  ret i32 %add
+}
+
+define i32 @vqdot_vv_split(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: vqdot_vv_split:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vsext.vf2 v14, v9
+; CHECK-NEXT:    vsext.vf2 v16, v10
+; CHECK-NEXT:    vsext.vf2 v18, v11
+; CHECK-NEXT:    vwmul.vv v8, v12, v14
+; CHECK-NEXT:    vwmacc.vv v8, v16, v18
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %c.sext = sext <16 x i8> %c to <16 x i32>
+  %d.sext = sext <16 x i8> %d to <16 x i32>
+  %mul2 = mul nuw nsw <16 x i32> %c.sext, %d.sext
+  %add = add <16 x i32> %mul, %mul2
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index d6e1af5..d0f35aa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -616,59 +616,6 @@ define void @not_balanced_store_tree(<vscale x 1 x i32> %v0, <vscale x 2 x i32>
   ret void
 }
 
-; We only support scalable vectors for now.
-define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %evl) {
-; RV32-LABEL: not_scalable_vectors:
-; RV32:       # %bb.0:
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vnsrl.wx v12, v8, a0
-; RV32-NEXT:    vnsrl.wi v11, v8, 0
-; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vnsrl.wx v10, v11, a0
-; RV32-NEXT:    vnsrl.wi v8, v11, 0
-; RV32-NEXT:    vnsrl.wx v11, v12, a0
-; RV32-NEXT:    vnsrl.wi v9, v12, 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: not_scalable_vectors:
-; RV64:       # %bb.0:
-; RV64-NEXT:    slli a1, a1, 34
-; RV64-NEXT:    srli a1, a1, 32
-; RV64-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vnsrl.wx v12, v8, a0
-; RV64-NEXT:    vnsrl.wi v11, v8, 0
-; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT:    vnsrl.wx v10, v11, a0
-; RV64-NEXT:    vnsrl.wi v8, v11, 0
-; RV64-NEXT:    vnsrl.wx v11, v12, a0
-; RV64-NEXT:    vnsrl.wi v9, v12, 0
-; RV64-NEXT:    ret
-  %rvl = mul i32 %evl, 4
-  %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl)
-  %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load)
-  %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0
-  %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1
-  %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0)
-  %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0
-  %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1
-  %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1)
-  %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0
-  %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1
-
-  %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0
-  %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1
-  %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2
-  %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3
-  ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3
-}
-
 define {<vscale x 2 x i32>, <vscale x 2 x i32>} @not_same_mask(<vscale x 2 x i1> %mask0, <vscale x 2 x i1> %mask1, ptr %ptr, i32 %evl) {
 ; RV32-LABEL: not_same_mask:
 ; RV32:       # %bb.0:
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll
index fc8faa73..f539fdefa 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll
@@ -11,17 +11,18 @@ declare target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handle
 
 ; CHECK: OpDecorate [[BufferVar:%.+]] DescriptorSet 0
 ; CHECK: OpDecorate [[BufferVar]] Binding 0
-; CHECK: OpDecorate [[BufferType:%.+]] Block
-; CHECK: OpMemberDecorate [[BufferType]] 0 Offset 0
+; CHECK: OpMemberDecorate [[BufferType:%.+]] 0 Offset 0
+; CHECK: OpDecorate [[BufferType]] Block
 ; CHECK: OpMemberDecorate [[BufferType]] 0 NonWritable
 ; CHECK: OpDecorate [[RWBufferVar:%.+]] DescriptorSet 0
 ; CHECK: OpDecorate [[RWBufferVar]] Binding 1
-; CHECK: OpDecorate [[RWBufferType:%.+]] Block
-; CHECK: OpMemberDecorate [[RWBufferType]] 0 Offset 0
+; CHECK: OpDecorate [[ArrayType:%.+]] ArrayStride 4
+; CHECK: OpMemberDecorate [[RWBufferType:%.+]] 0 Offset 0
+; CHECK: OpDecorate [[RWBufferType]] Block
 
 
 ; CHECK: [[int:%[0-9]+]] = OpTypeInt 32 0
-; CHECK: [[ArrayType:%.+]] = OpTypeRuntimeArray
+; CHECK: [[ArrayType]] = OpTypeRuntimeArray
 ; CHECK: [[RWBufferType]] = OpTypeStruct [[ArrayType]]
 ; CHECK: [[RWBufferPtrType:%.+]] = OpTypePointer StorageBuffer [[RWBufferType]]
 ; CHECK: [[BufferType]] = OpTypeStruct [[ArrayType]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/pointer-addrspacecast.ll b/llvm/test/CodeGen/SPIRV/pointers/pointer-addrspacecast.ll
new file mode 100644
index 0000000..4d5549d
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/pointer-addrspacecast.ll
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG:                     %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG:                   %[[#uint_0:]] = OpConstant %[[#uint]] 0
+; CHECK-DAG:                 %[[#ptr_uint:]] = OpTypePointer Private %[[#uint]]
+; CHECK-DAG:                      %[[#var:]] = OpVariable %[[#ptr_uint]] Private %[[#uint_0]]
+
+; CHECK-DAG:  OpName %[[#func_simple:]] "simple"
+; CHECK-DAG:  OpName %[[#func_chain:]] "chain"
+
+@global = internal addrspace(10) global i32 zeroinitializer
+
+define void @simple() {
+; CHECK: %[[#func_simple]] = OpFunction
+entry:
+  %ptr = getelementptr i32, ptr addrspace(10) @global, i32 0
+  %casted = addrspacecast ptr addrspace(10) %ptr to ptr
+  %val = load i32, ptr %casted
+; CHECK: %{{.*}} = OpLoad %[[#uint]] %[[#var]] Aligned 4
+  ret void
+}
+
+define void @chain() {
+; CHECK: %[[#func_chain]] = OpFunction
+entry:
+  %a = getelementptr i32, ptr addrspace(10) @global, i32 0
+  %b = addrspacecast ptr addrspace(10) %a to ptr
+  %c = getelementptr i32, ptr %b, i32 0
+  %d = addrspacecast ptr %c to ptr addrspace(10)
+  %e = addrspacecast ptr addrspace(10) %d to ptr
+
+  %val = load i32, ptr %e
+; CHECK: %{{.*}} = OpLoad %[[#uint]] %[[#var]] Aligned 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll
new file mode 100644
index 0000000..93208c1
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll
@@ -0,0 +1,54 @@
+; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s --match-full-lines
+; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
+
+; FIXME(134119): enable-this once Offset decoration are added.
+; XFAIL: spirv-tools
+
+%S2 = type { { [10 x { i32, i32 } ] }, i32 }
+
+; CHECK-DAG:                     %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG:                   %[[#uint_0:]] = OpConstant %[[#uint]] 0
+; CHECK-DAG:                   %[[#uint_1:]] = OpConstant %[[#uint]] 1
+; CHECK-DAG:                   %[[#uint_3:]] = OpConstant %[[#uint]] 3
+; CHECK-DAG:                  %[[#uint_10:]] = OpConstant %[[#uint]] 10
+; CHECK-DAG:                  %[[#uint_11:]] = OpConstant %[[#uint]] 11
+; CHECK-DAG:   %[[#ptr_StorageBuffer_uint:]] = OpTypePointer StorageBuffer %[[#uint]]
+
+; CHECK-DAG:       %[[#t_s2_s_a_s:]] = OpTypeStruct %[[#uint]] %[[#uint]]
+; CHECK-DAG:       %[[#t_s2_s_a:]] = OpTypeArray %[[#t_s2_s_a_s]] %[[#uint_10]]
+; CHECK-DAG:       %[[#t_s2_s:]] = OpTypeStruct %[[#t_s2_s_a]]
+; CHECK-DAG:       %[[#t_s2:]] = OpTypeStruct %[[#t_s2_s]] %[[#uint]]
+
+; CHECK-DAG: %[[#ptr_StorageBuffer_struct:]] = OpTypePointer StorageBuffer %[[#t_s2]]
+; CHECK-DAG:                     %[[#rarr:]] = OpTypeRuntimeArray %[[#t_s2]]
+; CHECK-DAG:              %[[#rarr_struct:]] = OpTypeStruct %[[#rarr]]
+; CHECK-DAG:       %[[#spirv_VulkanBuffer:]] = OpTypePointer StorageBuffer %[[#rarr_struct]]
+
+declare target("spirv.VulkanBuffer", [0 x %S2], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_Ss_12_1t(i32, i32, i32, i32, i1)
+
+define void @main() "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" {
+entry:
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %S2], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false)
+; CHECK:      %[[#resource:]] = OpVariable %[[#spirv_VulkanBuffer]] StorageBuffer
+
+  %ptr = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0s_Ss_12_1t(target("spirv.VulkanBuffer", [0 x %S2], 12, 1) %handle, i32 0)
+; CHECK: %[[#a:]] = OpCopyObject %[[#spirv_VulkanBuffer]] %[[#resource]]
+; CHECK: %[[#b:]] = OpAccessChain %[[#ptr_StorageBuffer_struct]] %[[#a:]] %[[#uint_0]] %[[#uint_0]]
+  %casted = addrspacecast ptr addrspace(11) %ptr to ptr
+
+; CHECK: %[[#ptr2:]] = OpInBoundsAccessChain %[[#ptr_StorageBuffer_uint]] %[[#b:]] %[[#uint_0]] %[[#uint_0]] %[[#uint_3]] %[[#uint_1]]
+  %ptr2 = getelementptr inbounds %S2, ptr %casted, i64 0, i32 0, i32 0, i32 3, i32 1
+
+; CHECK: OpStore %[[#ptr2]] %[[#uint_10]] Aligned 4
+  store i32 10, ptr %ptr2, align 4
+
+; Another store, but this time using LLVM's ability to load the first element
+; without an explicit GEP. The backend has to determine the ptr type and
+; generate the appropriate access chain.
+; CHECK: %[[#ptr3:]] = OpInBoundsAccessChain %[[#ptr_StorageBuffer_uint]] %[[#b:]] %[[#uint_0]] %[[#uint_0]] %[[#uint_0]] %[[#uint_0]]
+; CHECK: OpStore %[[#ptr3]] %[[#uint_11]] Aligned 4
+  store i32 11, ptr %casted, align 4
+  ret void
+}
+
+declare ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0s_S2s_12_1t(target("spirv.VulkanBuffer", [0 x %S2], 12, 1), i32)
diff --git a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll
new file mode 100644
index 0000000..24a50c7
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll
@@ -0,0 +1,37 @@
+; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
+
+; FIXME(134119): enable-this once Offset decoration are added.
+; XFAIL: spirv-tools
+
+%struct.S = type { i32 }
+
+; CHECK-DAG:                     %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG:                   %[[#uint_0:]] = OpConstant %[[#uint]] 0
+; CHECK-DAG:                  %[[#uint_10:]] = OpConstant %[[#uint]] 10
+; CHECK-DAG:   %[[#ptr_StorageBuffer_uint:]] = OpTypePointer StorageBuffer %[[#uint]]
+; CHECK-DAG:                   %[[#struct:]] = OpTypeStruct %[[#uint]]
+; CHECK-DAG: %[[#ptr_StorageBuffer_struct:]] = OpTypePointer StorageBuffer %[[#struct]]
+; CHECK-DAG:                     %[[#rarr:]] = OpTypeRuntimeArray %[[#struct]]
+; CHECK-DAG:              %[[#rarr_struct:]] = OpTypeStruct %[[#rarr]]
+; CHECK-DAG:       %[[#spirv_VulkanBuffer:]] = OpTypePointer StorageBuffer %[[#rarr_struct]]
+
+declare target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(i32, i32, i32, i32, i1)
+
+define void @main() "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" {
+entry:
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false)
+; CHECK:      %[[#resource:]] = OpVariable %[[#spirv_VulkanBuffer]] StorageBuffer
+
+  %ptr = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) %handle, i32 0)
+; CHECK: %[[#a:]] = OpCopyObject %[[#spirv_VulkanBuffer]] %[[#resource]]
+; CHECK: %[[#b:]] = OpAccessChain %[[#ptr_StorageBuffer_struct]] %[[#a:]] %[[#uint_0]] %[[#uint_0]]
+; CHECK: %[[#c:]] = OpInBoundsAccessChain %[[#ptr_StorageBuffer_uint]] %[[#b:]] %[[#uint_0]]
+  %casted = addrspacecast ptr addrspace(11) %ptr to ptr
+
+; CHECK: OpStore %[[#c]] %[[#uint_10]] Aligned 4
+  store i32 10, ptr %casted, align 4
+  ret void
+}
+
+declare ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1), i32)
diff --git a/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll b/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll
new file mode 100644
index 0000000..7303471
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll
@@ -0,0 +1,149 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-library %s -o - -filetype=obj | spirv-val %}
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
+
+; CHECK-DAG: OpName [[ScalarBlock_var:%[0-9]+]] "__resource_p_12_{_u32[0]}_0_0"
+; CHECK-DAG: OpName [[buffer_var:%[0-9]+]] "__resource_p_12_{_{_{_u32_f32[3]}[10]}[0]}_0_0"
+; CHECK-DAG: OpName [[array_buffer_var:%[0-9]+]] "__resource_p_12_{_{_{_u32_f32[3]}[10]}[0]}[10]_0_0"
+
+; CHECK-DAG: OpMemberDecorate [[ScalarBlock:%[0-9]+]] 0 Offset 0
+; CHECK-DAG: OpDecorate [[ScalarBlock]] Block
+; CHECK-DAG: OpMemberDecorate [[ScalarBlock]] 0 NonWritable
+; CHECK-DAG: OpMemberDecorate [[T_explicit:%[0-9]+]] 0 Offset 0
+; CHECK-DAG: OpMemberDecorate [[T_explicit]] 1 Offset 16
+; CHECK-DAG: OpDecorate [[T_array_explicit:%[0-9]+]] ArrayStride 32
+; CHECK-DAG: OpMemberDecorate [[S_explicit:%[0-9]+]] 0 Offset 0
+; CHECK-DAG: OpDecorate [[S_array_explicit:%[0-9]+]] ArrayStride 320
+; CHECK-DAG: OpMemberDecorate [[block:%[0-9]+]] 0 Offset 0
+; CHECK-DAG: OpDecorate [[block]] Block
+; CHECK-DAG: OpMemberDecorate [[block]] 0 NonWritable
+
+; CHECK-DAG: [[float:%[0-9]+]] = OpTypeFloat 32
+; CHECK-DAG: [[v3f:%[0-9]+]] = OpTypeVector [[float]] 3
+; CHECK-DAG: [[uint:%[0-9]+]] = OpTypeInt 32 0
+; CHECK-DAG: [[T:%[0-9]+]] = OpTypeStruct [[uint]] [[v3f]]
+; CHECK-DAG: [[T_explicit]] = OpTypeStruct [[uint]] [[v3f]]
+%struct.T = type { i32, <3 x float> }
+
+; CHECK-DAG: [[zero:%[0-9]+]] = OpConstant [[uint]] 0{{$}}
+; CHECK-DAG: [[one:%[0-9]+]] = OpConstant [[uint]] 1{{$}}
+; CHECK-DAG: [[ten:%[0-9]+]] = OpConstant [[uint]] 10
+; CHECK-DAG: [[T_array:%[0-9]+]] = OpTypeArray [[T]] [[ten]]
+; CHECK-DAG: [[S:%[0-9]+]] = OpTypeStruct [[T_array]]
+; CHECK-DAG: [[T_array_explicit]] = OpTypeArray [[T_explicit]] [[ten]]
+; CHECK-DAG: [[S_explicit]] = OpTypeStruct [[T_array_explicit]]
+%struct.S = type { [10 x %struct.T] }
+
+; CHECK-DAG: [[private_S_ptr:%[0-9]+]] = OpTypePointer Private [[S]]
+; CHECK-DAG: [[private_var:%[0-9]+]] = OpVariable [[private_S_ptr]] Private
+@private = internal addrspace(10) global %struct.S poison
+
+; CHECK-DAG: [[storagebuffer_S_ptr:%[0-9]+]] = OpTypePointer StorageBuffer [[S_explicit]]
+; CHECK-DAG: [[storage_buffer:%[0-9]+]] = OpVariable [[storagebuffer_S_ptr]] StorageBuffer
+@storage_buffer = internal addrspace(11) global %struct.S poison
+
+; CHECK-DAG: [[storagebuffer_int_ptr:%[0-9]+]] = OpTypePointer StorageBuffer [[uint]]
+; CHECK-DAG: [[ScalarBlock_array:%[0-9]+]] = OpTypeRuntimeArray [[uint]]
+; CHECK-DAG: [[ScalarBlock]] = OpTypeStruct [[ScalarBlock_array]]
+; CHECK-DAG: [[ScalarBlock_ptr:%[0-9]+]] = OpTypePointer StorageBuffer [[ScalarBlock]]
+; CHECK-DAG: [[ScalarBlock_var]] = OpVariable [[ScalarBlock_ptr]] StorageBuffer
+
+
+; CHECK-DAG: [[S_array_explicit]] = OpTypeRuntimeArray [[S_explicit]]
+; CHECK-DAG: [[block]] = OpTypeStruct [[S_array_explicit]]
+; CHECK-DAG: [[buffer_ptr:%[0-9]+]] = OpTypePointer StorageBuffer [[block]]
+; CHECK-DAG: [[buffer_var]] = OpVariable [[buffer_ptr]] StorageBuffer
+
+; CHECK-DAG: [[array_buffer:%[0-9]+]] = OpTypeArray [[block]] [[ten]]
+; CHECK-DAG: [[array_buffer_ptr:%[0-9]+]] = OpTypePointer StorageBuffer [[array_buffer]]
+; CHECK-DAG: [[array_buffer_var]] = OpVariable [[array_buffer_ptr]] StorageBuffer
+
+; CHECK: OpFunction [[uint]] None
+define external i32 @scalar_vulkan_buffer_load() {
+; CHECK-NEXT: OpLabel
+entry:
+; CHECK-NEXT: [[handle:%[0-9]+]] = OpCopyObject [[ScalarBlock_ptr]] [[ScalarBlock_var]]
+  %handle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+
+; CHECK-NEXT: [[ptr:%[0-9]+]] = OpAccessChain [[storagebuffer_int_ptr]] [[handle]] [[zero]] [[one]]
+  %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x i32], 12, 0) %handle, i32 1)
+
+; CHECK-NEXT: [[ld:%[0-9]+]] = OpLoad [[uint]] [[ptr]] Aligned 4
+  %1 = load i32, ptr addrspace(11) %0, align 4
+
+; CHECK-NEXT: OpReturnValue [[ld]]
+  ret i32 %1
+
+; CHECK-NEXT: OpFunctionEnd
+}
+
+; CHECK: OpFunction [[S]] None
+define external %struct.S @private_load() {
+; CHECK-NEXT: OpLabel
+entry:
+
+; CHECK-NEXT: [[ld:%[0-9]+]] = OpLoad [[S]] [[private_var]] Aligned 4
+  %1 = load %struct.S, ptr addrspace(10) @private, align 4
+
+; CHECK-NEXT: OpReturnValue [[ld]]
+  ret %struct.S %1
+
+; CHECK-NEXT: OpFunctionEnd
+}
+
+; CHECK: OpFunction [[S]] None
+define external %struct.S @storage_buffer_load() {
+; CHECK-NEXT: OpLabel
+entry:
+
+; CHECK-NEXT: [[ld:%[0-9]+]] = OpLoad [[S_explicit]] [[storage_buffer]] Aligned 4
+; CHECK-NEXT: [[copy:%[0-9]+]] = OpCopyLogical [[S]] [[ld]]
+  %1 = load %struct.S, ptr addrspace(11) @storage_buffer, align 4
+
+; CHECK-NEXT: OpReturnValue [[copy]]
+  ret %struct.S %1
+
+; CHECK-NEXT: OpFunctionEnd
+}
+
+; CHECK: OpFunction [[S]] None
+define external %struct.S @vulkan_buffer_load() {
+; CHECK-NEXT: OpLabel
+entry:
+; CHECK-NEXT: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[buffer_var]]
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+
+; CHECK-NEXT: [[ptr:%[0-9]+]] = OpAccessChain [[storagebuffer_S_ptr]] [[handle]] [[zero]] [[one]]
+  %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) %handle, i32 1)
+
+; CHECK-NEXT: [[ld:%[0-9]+]] = OpLoad [[S_explicit]] [[ptr]] Aligned 4
+; CHECK-NEXT: [[copy:%[0-9]+]] = OpCopyLogical [[S]] [[ld]]
+  %1 = load %struct.S, ptr addrspace(11) %0, align 4
+
+; CHECK-NEXT: OpReturnValue [[copy]]
+  ret %struct.S %1
+
+; CHECK-NEXT: OpFunctionEnd
+}
+
+; CHECK: OpFunction [[S]] None
+define external %struct.S @array_of_vulkan_buffers_load() {
+; CHECK-NEXT: OpLabel
+entry:
+; CHECK-NEXT: [[h:%[0-9]+]] = OpAccessChain [[buffer_ptr]] [[array_buffer_var]] [[one]]
+; CHECK-NEXT: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[h]]
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 1, i1 false)
+
+; CHECK-NEXT: [[ptr:%[0-9]+]] = OpAccessChain [[storagebuffer_S_ptr]] [[handle]] [[zero]] [[one]]
+  %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) %handle, i32 1)
+
+; CHECK-NEXT: [[ld:%[0-9]+]] = OpLoad [[S_explicit]] [[ptr]] Aligned 4
+; CHECK-NEXT: [[copy:%[0-9]+]] = OpCopyLogical [[S]] [[ld]]
+  %1 = load %struct.S, ptr addrspace(11) %0, align 4
+
+; CHECK-NEXT: OpReturnValue [[copy]]
+  ret %struct.S %1
+
+; CHECK-NEXT: OpFunctionEnd
+}
diff --git a/llvm/test/CodeGen/X86/align-basic-block-sections.mir b/llvm/test/CodeGen/X86/align-basic-block-sections.mir
index 17a675f..02ccbcf 100644
--- a/llvm/test/CodeGen/X86/align-basic-block-sections.mir
+++ b/llvm/test/CodeGen/X86/align-basic-block-sections.mir
@@ -1,5 +1,5 @@
 # Check if the alignment directive is put on the correct place when the basic block section option is used.
-# RUN: llc -mtriple x86_64-unknown-linux-gnu -start-after=bbsections-prepare  %s -o - | FileCheck %s -check-prefix=CHECK
+# RUN: llc -mtriple x86_64-unknown-linux-gnu -start-before=x86-asm-printer %s -o - | FileCheck %s -check-prefix=CHECK
 
 # How to generate the input:
 # foo.c
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-mir-parse.mir b/llvm/test/CodeGen/X86/basic-block-address-map-mir-parse.mir
index 8ac93c79..a49a4e2 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-mir-parse.mir
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-mir-parse.mir
@@ -1,5 +1,5 @@
 # Start after bbsections0-prepare and check that the BB address map is generated.
-# RUN: llc -mtriple x86_64-unknown-linux-gnu -start-after=bbsections-prepare -basic-block-address-map %s -o - | FileCheck %s -check-prefix=CHECK
+# RUN: llc -mtriple x86_64-unknown-linux-gnu -start-before=x86-asm-printer -basic-block-address-map %s -o - | FileCheck %s -check-prefix=CHECK
 
 # How to generate the input:
 # foo.cc
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir b/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir
index 967622a..e49ff14 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir
+++ b/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir
@@ -1,5 +1,5 @@
 # Start after bbsections0-prepare and check if the right code is generated.
-# RUN: llc -mtriple x86_64-unknown-linux-gnu -start-after=bbsections-prepare  %s -o - | FileCheck %s -check-prefix=CHECK
+# RUN: llc -mtriple x86_64-unknown-linux-gnu -start-before=x86-asm-printer  %s -o - | FileCheck %s -check-prefix=CHECK
 
 
 # How to generate the input:
diff --git a/llvm/test/CodeGen/X86/mingw-comdats-xdata.ll b/llvm/test/CodeGen/X86/mingw-comdats-xdata.ll
index 1a60c15..0e914a8 100644
--- a/llvm/test/CodeGen/X86/mingw-comdats-xdata.ll
+++ b/llvm/test/CodeGen/X86/mingw-comdats-xdata.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=x86_64-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU
+; RUN: llc -mtriple=x86_64-pc-cygwin < %s | FileCheck %s --check-prefix=GNU
 ; RUN: llc -mtriple=x86_64-w64-windows-gnu < %s -filetype=obj | llvm-objdump - --headers | FileCheck %s --check-prefix=GNUOBJ
+; RUN: llc -mtriple=x86_64-pc-cygwin < %s -filetype=obj | llvm-objdump - --headers | FileCheck %s --check-prefix=GNUOBJ
 
 ; When doing GCC style comdats for MinGW, the .xdata sections don't have a normal comdat
 ; symbol attached, which requires a bit of adjustments for the assembler output.
diff --git a/llvm/test/CodeGen/X86/mingw-comdats.ll b/llvm/test/CodeGen/X86/mingw-comdats.ll
index bca8e12..71e9503 100644
--- a/llvm/test/CodeGen/X86/mingw-comdats.ll
+++ b/llvm/test/CodeGen/X86/mingw-comdats.ll
@@ -1,8 +1,11 @@
 ; RUN: llc -function-sections -mtriple=x86_64-windows-itanium < %s | FileCheck %s
 ; RUN: llc -function-sections -mtriple=x86_64-windows-msvc < %s | FileCheck %s
 ; RUN: llc -function-sections -mtriple=x86_64-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU
+; RUN: llc -function-sections -mtriple=x86_64-pc-cygwin < %s | FileCheck %s --check-prefix=GNU
 ; RUN: llc -function-sections -mtriple=i686-w64-windows-gnu < %s | FileCheck %s --check-prefix=GNU32
+; RUN: llc -function-sections -mtriple=i686-pc-cygwin < %s | FileCheck %s --check-prefix=GNU32
 ; RUN: llc -function-sections -mtriple=x86_64-w64-windows-gnu < %s -filetype=obj | llvm-objdump - --headers | FileCheck %s --check-prefix=GNUOBJ
+; RUN: llc -function-sections -mtriple=x86_64-pc-cygwin < %s -filetype=obj | llvm-objdump - --headers | FileCheck %s --check-prefix=GNUOBJ
 
 ; GCC and MSVC handle comdats completely differently. Make sure we do the right
 ; thing for each.
diff --git a/llvm/test/DebugInfo/COFF/asm.ll b/llvm/test/DebugInfo/COFF/asm.ll
index d873df2..cf440bd 100644
--- a/llvm/test/DebugInfo/COFF/asm.ll
+++ b/llvm/test/DebugInfo/COFF/asm.ll
@@ -2,6 +2,8 @@
 ; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -S --sr --codeview - | FileCheck --check-prefix=OBJ32 %s
 ; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -O0 < %s | FileCheck --check-prefix=X64 %s
 ; RUN: llc -mcpu=core2 -mtriple=x86_64-pc-win32 -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -S --sr --codeview - | FileCheck --check-prefix=OBJ64 %s
+; RUN: llc -mcpu=core2 -mtriple=x86_64-uefi -O0 < %s | FileCheck --check-prefix=X64 %s
+; RUN: llc -mcpu=core2 -mtriple=x86_64-uefi -o - -O0 < %s | llvm-mc -triple=x86_64-pc-win32 -filetype=obj | llvm-readobj -S --sr --codeview - | FileCheck --check-prefix=OBJ64 %s
 
 ; This LL file was generated by running clang on the following code:
 ; D:\asm.c:
diff --git a/llvm/test/DebugInfo/KeyInstructions/Generic/inline-nodbg.ll b/llvm/test/DebugInfo/KeyInstructions/Generic/inline-nodbg.ll
new file mode 100644
index 0000000..33f7f67
--- /dev/null
+++ b/llvm/test/DebugInfo/KeyInstructions/Generic/inline-nodbg.ll
@@ -0,0 +1,43 @@
+; RUN: opt %s -passes=inline -S -o - | FileCheck %s
+
+;; $ cat test.cpp
+;; int g;
+;; [[clang::always_inline, gnu::nodebug]]  void a() { g = 1; }
+;; void b() { a(); }
+;;
+;; Check the inlined instructions don't inherit the call's atom info.
+;; FIXME: Perhaps we want to do actually do that, to preserve existing
+;; behaviour? Unclear what's best.
+
+; CHECK: _Z1bv()
+; CHECK: store i32 1, ptr @g, align 4, !dbg [[DBG:!.*]]
+; CHECK: [[DBG]] = !DILocation(line: 3, scope: ![[#]])
+
+@g = hidden global i32 0, align 4
+
+define hidden void @_Z1av() {
+entry:
+  store i32 1, ptr @g, align 4
+  ret void
+}
+
+define hidden void @_Z1bv() !dbg !15 {
+entry:
+  call void @_Z1av(), !dbg !18
+  ret void, !dbg !19
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_17, file: !1, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.cpp", directory: "/")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{!"clang version 19.0.0"}
+!15 = distinct !DISubprogram(name: "b", scope: !1, file: !1, line: 3, type: !16, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = !DISubroutineType(types: !17)
+!17 = !{}
+!18 = !DILocation(line: 3, scope: !15, atomGroup: 1, atomRank: 1)
+!19 = !DILocation(line: 3, scope: !15, atomGroup: 2, atomRank: 1)
diff --git a/llvm/test/DebugInfo/KeyInstructions/Generic/jump-threading-2-bbs.ll b/llvm/test/DebugInfo/KeyInstructions/Generic/jump-threading-2-bbs.ll
new file mode 100644
index 0000000..3ca81f5
--- /dev/null
+++ b/llvm/test/DebugInfo/KeyInstructions/Generic/jump-threading-2-bbs.ll
@@ -0,0 +1,68 @@
+; RUN: opt -S -passes=jump-threading,verify %s | FileCheck %s
+
+;; Modified from llvm/test/Transforms/JumpThreading/thread-two-bbs.ll
+;;
+;; JumpThreading duplicates bb.cond2 to thread through bb.file to bb.f2 or exit.
+;;
+;; Check the duplicated instructions get remapped atom groups.
+
+; CHECK: bb.cond2:
+; CHECK-NEXT: call void @f1()
+; CHECK-NEXT: %tobool1 = icmp eq i32 %cond2, 0, !dbg [[G1R2:!.*]]
+; CHECK-NEXT: br i1 %tobool1, label %exit, label %exit, !dbg [[G1R1:!.*]]
+
+; CHECK: bb.cond2.thread:
+; CHECK-NEXT: %tobool12 = icmp eq i32 %cond2, 0, !dbg [[G2R2:!.*]]
+; CHECK-NEXT: br i1 %tobool12, label %bb.f2, label %exit, !dbg [[G2R1:!.*]]
+
+; CHECK: [[G1R2]] = !DILocation(line: 1, column: 1, scope: ![[#]], atomGroup: 1, atomRank: 2)
+; CHECK: [[G1R1]] = !DILocation(line: 1, column: 1, scope: ![[#]], atomGroup: 1, atomRank: 1)
+; CHECK: [[G2R2]] = !DILocation(line: 1, column: 1, scope: ![[#]], atomGroup: 2, atomRank: 2)
+; CHECK: [[G2R1]] = !DILocation(line: 1, column: 1, scope: ![[#]], atomGroup: 2, atomRank: 1)
+
+@a = global i32 0, align 4
+
+define void @foo(i32 %cond1, i32 %cond2) !dbg !5 {
+entry:
+  %tobool = icmp eq i32 %cond1, 0
+  br i1 %tobool, label %bb.cond2, label %bb.f1
+
+bb.f1:                                            ; preds = %entry
+  call void @f1()
+  br label %bb.cond2
+
+bb.cond2:                                         ; preds = %bb.f1, %entry
+  %ptr = phi ptr [ null, %bb.f1 ], [ @a, %entry ]
+  %tobool1 = icmp eq i32 %cond2, 0, !dbg !9
+  br i1 %tobool1, label %bb.file, label %exit, !dbg !10
+
+bb.file:                                          ; preds = %bb.cond2
+  %cmp = icmp eq ptr %ptr, null
+  br i1 %cmp, label %exit, label %bb.f2
+
+bb.f2:                                            ; preds = %bb.file
+  call void @f2()
+  br label %exit
+
+exit:                                             ; preds = %bb.f2, %bb.file, %bb.cond2
+  ret void
+}
+
+declare void @f1()
+
+declare void @f2()
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "<stdin>", directory: "/")
+!2 = !{i32 16}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!9 = !DILocation(line: 1, column: 1, scope: !5, atomGroup: 1, atomRank: 2)
+!10 = !DILocation(line: 1, column: 1, scope: !5, atomGroup: 1, atomRank: 1)
diff --git a/llvm/test/DebugInfo/KeyInstructions/Generic/jump-threading-dup-cond-br-on-phi-into-pred.ll b/llvm/test/DebugInfo/KeyInstructions/Generic/jump-threading-dup-cond-br-on-phi-into-pred.ll
new file mode 100644
index 0000000..1ab1c1a
--- /dev/null
+++ b/llvm/test/DebugInfo/KeyInstructions/Generic/jump-threading-dup-cond-br-on-phi-into-pred.ll
@@ -0,0 +1,87 @@
+; RUN: opt %s --passes=jump-threading -S -o - -S | FileCheck %s
+
+;;        +-> T1 -+
+;;        |       v      +-> T2
+;; Entry -+       Merge -+
+;;        |       ^      +-> F2
+;;        +-> F1 -+
+;;
+;; Duplicate Merge into T1 then fold Merge into its only pred F1 (taking its name).
+;;
+;;        +-> T1 -----> T2
+;;        |     \       ^
+;;        |      \     /
+;;        |       \   /
+;; Entry -+        +----+
+;;        |         /   v 
+;;        +--> Merge -> F2
+;;
+;; Check the duplicated (into T1) instructions' atoms are remapped.
+
+; CHECK: T1:
+; CHECK-NEXT: %v1 = call i32 @f1()
+; CHECK-NEXT: %cond3 = icmp eq i32 %v1, 412
+; CHECK-NEXT: %C1 = add i32 %v1, 1, !dbg [[G3R2:!.*]]
+; CHECK-NEXT: store i32 %C1, ptr %p, align 4, !dbg [[G3R1:!.*]]
+
+; CHECK: Merge:
+; CHECK-NEXT: %v2 = call i32 @f2()
+; CHECK-NEXT: store i32 1, ptr %p, align 4, !dbg [[G1R1:!.*]]
+; CHECK-NEXT: %C = add i32 %v2, 1, !dbg [[G2R2:!.*]]
+; CHECK-NEXT: store i32 %C, ptr %p, align 4, !dbg [[G2R1:!.*]]
+
+; CHECK: [[G3R2]] = !DILocation({{.*}}, atomGroup: 3, atomRank: 2)
+; CHECK: [[G3R1]] = !DILocation({{.*}}, atomGroup: 3, atomRank: 1)
+; CHECK: [[G1R1]] = !DILocation({{.*}}, atomGroup: 1, atomRank: 1)
+; CHECK: [[G2R2]] = !DILocation({{.*}}, atomGroup: 2, atomRank: 2)
+; CHECK: [[G2R1]] = !DILocation({{.*}}, atomGroup: 2, atomRank: 1)
+
+define i32 @test5(i1 %cond, i1 %cond2, ptr %p) !dbg !5 {
+  br i1 %cond, label %T1, label %F1
+
+T1:                                               ; preds = %0
+  %v1 = call i32 @f1()
+  %cond3 = icmp eq i32 %v1, 412
+  br label %Merge
+
+F1:                                               ; preds = %0
+  %v2 = call i32 @f2()
+  store i32 1, ptr %p, align 4, !dbg !8
+  br label %Merge
+
+Merge:                                            ; preds = %F1, %T1
+  %A = phi i1 [ %cond3, %T1 ], [ %cond2, %F1 ]
+  %B = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %C = add i32 %B, 1, !dbg !9
+  store i32 %C, ptr %p, align 4, !dbg !10
+  br i1 %A, label %T2, label %F2
+
+T2:                                               ; preds = %Merge
+  call void @f3()
+  ret i32 %B
+
+F2:                                               ; preds = %Merge
+  ret i32 %B
+}
+
+declare i32 @f1()
+
+declare i32 @f2()
+
+declare void @f3()
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "test.ll", directory: "/")
+!2 = !{i32 12}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test5", linkageName: "test5", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 1, column: 1, scope: !5, atomGroup: 1, atomRank: 1)
+!9 = !DILocation(line: 2, column: 1, scope: !5, atomGroup: 2, atomRank: 2)
+!10 = !DILocation(line: 2, column: 1, scope: !5, atomGroup: 2, atomRank: 1)
diff --git a/llvm/test/DebugInfo/KeyInstructions/Generic/loop-rotate.ll b/llvm/test/DebugInfo/KeyInstructions/Generic/loop-rotate.ll
new file mode 100644
index 0000000..6da7146
--- /dev/null
+++ b/llvm/test/DebugInfo/KeyInstructions/Generic/loop-rotate.ll
@@ -0,0 +1,74 @@
+; RUN: opt --passes=loop-rotate %s -S -o - | FileCheck %s
+
+;; Rotate:
+;;                +------------------> for.end.
+;;                |
+;;    entry -> for.cond -> for.body
+;;                ^           |
+;;                +-----------+
+;;
+;; Into:
+;;
+;;                               +------> for.end.
+;;                               |
+;;    entry (+ for.cond`0) -> for.body (+ for.cond) -+
+;;                               ^                   |
+;;                               +-------------------+
+;; Check for.cond's duplicated store and br have their source atoms remapped.
+
+; CHECK: entry:
+; CHECK:   store i32 0, ptr @glob, align 16, !dbg [[G3R1:![0-9]+]]
+; CHECK:   br label %for.body, !dbg [[G4R1:![0-9]+]]
+;
+; CHECK: for.body:
+; CHECK:    store i32 {{.*}}, ptr @glob, align 16, !dbg [[G1R1:![0-9]+]]
+; CHECK:    [[CMP:%.*]] = icmp slt i32 {{.*}}, 100, !dbg [[G2R2:![0-9]+]]
+; CHECK:    br i1 [[CMP]], label %for.body, label %for.end, !dbg [[G2R1:![0-9]+]]
+;
+; CHECK: [[G3R1]] = !DILocation(line: 4{{.*}}, atomGroup: 3, atomRank: 1)
+; CHECK: [[G4R1]] = !DILocation(line: 6{{.*}}, atomGroup: 4, atomRank: 1)
+; CHECK: [[G1R1]] = !DILocation(line: 4{{.*}}, atomGroup: 1, atomRank: 1)
+; CHECK: [[G2R2]] = !DILocation(line: 5{{.*}}, atomGroup: 2, atomRank: 2)
+; CHECK: [[G2R1]] = !DILocation(line: 6{{.*}}, atomGroup: 2, atomRank: 1)
+
+@glob = global i32 0
+
+define void @test1() #0 !dbg !5 {
+entry:
+  %array = alloca [20 x i32], align 16
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  store i32 %i.0, ptr @glob, align 16,         !dbg !DILocation(line: 4, scope: !5, atomGroup: 1, atomRank: 1)
+  %cmp = icmp slt i32 %i.0, 100,               !dbg !DILocation(line: 5, scope: !5, atomGroup: 2, atomRank: 2)
+  br i1 %cmp, label %for.body, label %for.end, !dbg !DILocation(line: 6, scope: !5, atomGroup: 2, atomRank: 1)
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %i.0, 1
+  store i32 0, ptr %array, align 16
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %arrayidx.lcssa = phi ptr [ %array, %for.cond ]
+  call void @g(ptr %arrayidx.lcssa)
+  ret void
+}
+
+declare void @g(ptr)
+
+attributes #0 = { nounwind ssp }
+attributes #1 = { noduplicate }
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "test.ll", directory: "/")
+!2 = !{i32 12}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test1", linkageName: "test1", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
diff --git a/llvm/test/DebugInfo/KeyInstructions/Generic/simplifycfg-thread-phi.ll b/llvm/test/DebugInfo/KeyInstructions/Generic/simplifycfg-thread-phi.ll
new file mode 100644
index 0000000..f847760
--- /dev/null
+++ b/llvm/test/DebugInfo/KeyInstructions/Generic/simplifycfg-thread-phi.ll
@@ -0,0 +1,62 @@
+; RUN: opt %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S \
+; RUN: | FileCheck %s
+
+;; Generated using:
+;;   opt -passes=debugify --debugify-atoms --debugify-level=locations \
+;;      llvm/test/Transforms/SimplifyCFG/debug-info-thread-phi.ll
+;; With unused/untested metadata nodes removed.
+
+;; Check the duplicated store gets distinct atom info in each branch.
+
+; CHECK-LABEL: @bar(
+; CHECK: if.then:
+; CHECK:   store i32 1{{.*}}, !dbg [[DBG1:!.*]]
+; CHECK: if.end.1.critedge:
+; CHECK:   store i32 1{{.*}}, !dbg [[DBG2:!.*]]
+; CHECK: [[DBG1]] = !DILocation(line: 1{{.*}}, atomGroup: 1
+; CHECK: [[DBG2]] = !DILocation(line: 1{{.*}}, atomGroup: 2
+
+define void @bar(i32 %aa) !dbg !5 {
+entry:
+  %aa.addr = alloca i32, align 4
+  %bb = alloca i32, align 4
+  store i32 %aa, ptr %aa.addr, align 4
+  store i32 0, ptr %bb, align 4
+  %tobool = icmp ne i32 %aa, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  store i32 1, ptr %bb, align 4, !dbg !8
+  br i1 %tobool, label %if.then.1, label %if.end.1
+
+if.then.1:                                        ; preds = %if.end
+  call void @foo()
+  br label %if.end.1
+
+if.end.1:                                         ; preds = %if.then.1, %if.end
+  store i32 2, ptr %bb, align 4
+  br label %for.end
+
+for.end:                                          ; preds = %if.end.1
+  ret void
+}
+
+declare void @foo()
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "llvm/test/Transforms/SimplifyCFG/debug-info-thread-phi.ll", directory: "/")
+!2 = !{i32 15}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 1, column: 1, scope: !5, atomGroup: 1, atomRank: 1)
diff --git a/llvm/test/DebugInfo/MIR/AArch64/clobber-sp.mir b/llvm/test/DebugInfo/MIR/AArch64/clobber-sp.mir
index c245684..b248ee7 100644
--- a/llvm/test/DebugInfo/MIR/AArch64/clobber-sp.mir
+++ b/llvm/test/DebugInfo/MIR/AArch64/clobber-sp.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after=livedebugvalues -filetype=obj -o - %s \
+# RUN: llc -start-before=aarch64-asm-printer -filetype=obj -o - %s \
 # RUN:   | llvm-dwarfdump - | FileCheck %s
 # CHECK: .debug_info contents:
 # CHECK: DW_TAG_formal_parameter
diff --git a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-chain.mir b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-chain.mir
index 02f4ce1..0f949f4 100644
--- a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-chain.mir
+++ b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-chain.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple aarch64-linux-gnu -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -filetype=obj -o - %s \
+# RUN: llc -mtriple aarch64-linux-gnu -emit-call-site-info -debug-entry-values -start-before=aarch64-asm-printer -filetype=obj -o - %s \
 # RUN:     | llvm-dwarfdump - | FileCheck %s --implicit-check-not=DW_TAG_GNU_call_site_parameter
 #
 # Based on the following C reproducer:
diff --git a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-entry-value.mir b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-entry-value.mir
index 6c89123..d7740e0 100644
--- a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-entry-value.mir
+++ b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-entry-value.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple aarch64-linux-gnu -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -filetype=obj -o - %s \
+# RUN: llc -mtriple aarch64-linux-gnu -emit-call-site-info -debug-entry-values -start-before=aarch64-asm-printer -filetype=obj -o - %s \
 # RUN:     | llvm-dwarfdump - | FileCheck %s --implicit-check-not=DW_TAG_GNU_call_site_parameter
 #
 # Based on the following C reproducer:
diff --git a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir
index 3891219..bf04b20 100644
--- a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir
+++ b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir
@@ -1,4 +1,4 @@
-# RUN: llc -emit-call-site-info -start-after=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s
+# RUN: llc -emit-call-site-info -start-before=aarch64-asm-printer -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s
 
 # Based on the following C reproducer:
 #
diff --git a/llvm/test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir b/llvm/test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir
index 1479b37..113a003 100644
--- a/llvm/test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir
+++ b/llvm/test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir
@@ -1,4 +1,4 @@
-# RUN: llc -emit-call-site-info -start-after=livedebugvalues -filetype=obj -o - %s \
+# RUN: llc -emit-call-site-info -start-before=aarch64-asm-printer -filetype=obj -o - %s \
 # RUN:   | llvm-dwarfdump -v - | FileCheck %s
 
 # This tests for a crash in DwarfDebug's singular DBG_VALUE range promotion when
diff --git a/llvm/test/DebugInfo/MIR/AArch64/no-dbg-value-after-terminator.mir b/llvm/test/DebugInfo/MIR/AArch64/no-dbg-value-after-terminator.mir
index 22e9668..34c099f 100644
--- a/llvm/test/DebugInfo/MIR/AArch64/no-dbg-value-after-terminator.mir
+++ b/llvm/test/DebugInfo/MIR/AArch64/no-dbg-value-after-terminator.mir
@@ -1,4 +1,4 @@
-# RUN: not --crash llc -mtriple aarch64-linux-gnu -verify-machineinstrs -start-after=livedebugvalues \
+# RUN: not --crash llc -mtriple aarch64-linux-gnu -verify-machineinstrs -start-before=aarch64-asm-printer \
 # RUN:   -filetype=obj -o /dev/null %s 2>&1 | FileCheck %s
 
 # CHECK: *** Bad machine code: Non-terminator instruction after the first terminator ***
diff --git a/llvm/test/DebugInfo/MIR/AArch64/subreg-fragment-overflow.mir b/llvm/test/DebugInfo/MIR/AArch64/subreg-fragment-overflow.mir
index d0b1384..19a90b6 100644
--- a/llvm/test/DebugInfo/MIR/AArch64/subreg-fragment-overflow.mir
+++ b/llvm/test/DebugInfo/MIR/AArch64/subreg-fragment-overflow.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple aarch64-linux-gnu -start-after=livedebugvalues -filetype=obj -o - %s \
+# RUN: llc -mtriple aarch64-linux-gnu -start-before=aarch64-asm-printer -filetype=obj -o - %s \
 # RUN:     | llvm-dwarfdump - | FileCheck %s
 # The value needs to be composed of sub-registers, but the
 # sub-registers cross the fragment boundary.
diff --git a/llvm/test/DebugInfo/X86/single-location.mir b/llvm/test/DebugInfo/X86/single-location.mir
index 79049e3..a1a167e 100644
--- a/llvm/test/DebugInfo/X86/single-location.mir
+++ b/llvm/test/DebugInfo/X86/single-location.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after=livedebugvalues --filetype=obj %s -o - \
+# RUN: llc -start-before=x86-asm-printer --filetype=obj %s -o - \
 # RUN:     | llvm-dwarfdump -v - | FileCheck %s
 #
 # Generated at -O2, stopped after livedebugvalues, with some metadata removed
diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s
index 080341a..d4cf709 100644
--- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s
+++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s
@@ -33,7 +33,28 @@ test_got:
         leal    named_data2@GOT+5, %eax
         .size   test_got, .-test_got
 
+# Test R_386_GOT32X handling.
+#
+# We want to check both the offset to the GOT entry and its contents.
+# jitlink-check: decode_operand(test_gotx_load, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data1) - _GLOBAL_OFFSET_TABLE_
+# jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_got.o, named_data1)) = named_data1
 
+        .globl test_gotx
+        .p2align      4, 0x90
+        .type   test_gotx,@function
+test_gotx:
+	calll	.L0$pb
+.L0$pb:
+	popl	%eax
+.Ltmp0:
+	addl	$_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
+        .globl test_gotx_load
+test_gotx_load:
+	movl	named_data1@GOT(%eax), %eax
+        .size   test_gotx_load, .-test_gotx_load
+	movl	(%eax), %eax
+	retl
+        .size   test_gotx, .-test_gotx
 
 # Test GOTOFF64 handling.
 # jitlink-check: decode_operand(test_gotoff, 1) = named_func - _GLOBAL_OFFSET_TABLE_ + 99
diff --git a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-callback.ll b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-callback.ll
new file mode 100644
index 0000000..e95b69c
--- /dev/null
+++ b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-callback.ll
@@ -0,0 +1,253 @@
+; This check verifies that stack depth callback instrumentation works correctly.
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -sanitizer-coverage-stack-depth-callback-min=1 -S | FileCheck %s --check-prefixes=COMMON,CB1
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -sanitizer-coverage-stack-depth-callback-min=8 -S | FileCheck %s --check-prefixes=COMMON,CB8
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -sanitizer-coverage-stack-depth-callback-min=16 -S | FileCheck %s --check-prefixes=COMMON,CB16
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -sanitizer-coverage-stack-depth-callback-min=32 -S | FileCheck %s --check-prefixes=COMMON,CB32
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -sanitizer-coverage-stack-depth-callback-min=64 -S | FileCheck %s --check-prefixes=COMMON,CB64
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -sanitizer-coverage-stack-depth-callback-min=128 -S | FileCheck %s --check-prefixes=COMMON,CB128
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; No stack, just return: our leaf function
+define i32 @foo() {
+; COMMON-LABEL: define i32 @foo() {
+; COMMON-NEXT:  entry:
+; CB1-NOT:        call void @__sanitizer_cov_stack_depth()
+; CB8-NOT:        call void @__sanitizer_cov_stack_depth()
+; CB16-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB32-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB64-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB128-NOT:      call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    ret i32 7
+;
+entry:
+
+  ret i32 7
+}
+
+; No stack, just function call
+define i32 @retcall() {
+; COMMON-LABEL: define i32 @retcall() {
+; COMMON-NEXT:  entry:
+; CB1-NOT:        call void @__sanitizer_cov_stack_depth()
+; CB8-NOT:        call void @__sanitizer_cov_stack_depth()
+; CB16-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB32-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB64-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB128-NOT:      call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    [[CALL:%.*]] = call i32 @foo()
+; COMMON-NEXT:    ret i32 [[CALL]]
+entry:
+
+  %call = call i32 @foo()
+  ret i32 %call
+}
+
+; No stack, just function call, with argument
+define i32 @witharg(i32 %input) {
+; COMMON-LABEL: define i32 @witharg(i32 %input) {
+; COMMON-NEXT:  entry:
+; CB1-NOT:        call void @__sanitizer_cov_stack_depth()
+; CB8-NOT:        call void @__sanitizer_cov_stack_depth()
+; CB16-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB32-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB64-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB128-NOT:      call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    [[CALL:%.*]] = call i32 @foo()
+; COMMON-NEXT:    ret i32 [[CALL]]
+entry:
+
+  %call = call i32 @foo()
+  ret i32 %call
+}
+
+; 4 byte stack of scalars
+define i32 @alloc4_0() {
+; COMMON-LABEL: define i32 @alloc4_0() {
+; COMMON-NEXT:  entry:
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i32, align 4
+; CB1-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB8-NOT:        call void @__sanitizer_cov_stack_depth()
+; CB16-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB32-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB64-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB128-NOT:      call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    [[CALL:%.*]] = call i32 @foo()
+; COMMON-NEXT:    ret i32 [[CALL]]
+entry:
+  %var1 = alloca i32, align 4
+
+  %call = call i32 @foo()
+  ret i32 %call
+}
+
+; 16 byte stack of scalars
+define i32 @alloc16_0() {
+; COMMON-LABEL: define i32 @alloc16_0() {
+; COMMON-NEXT:  entry:
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i32, align 4
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i32, align 4
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i32, align 4
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i32, align 4
+; CB1-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB8-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB16-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB32-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB64-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB128-NOT:      call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    [[CALL:%.*]] = call i32 @foo()
+; COMMON-NEXT:    ret i32 [[CALL]]
+entry:
+  %var1 = alloca i32, align 4
+  %var2 = alloca i32, align 4
+  %var3 = alloca i32, align 4
+  %var4 = alloca i32, align 4
+
+  %call = call i32 @foo()
+  ret i32 %call
+}
+
+; 32 byte stack of scalars
+define i32 @alloc32_0() {
+; COMMON-LABEL: define i32 @alloc32_0() {
+; COMMON-NEXT:  entry:
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i64, align 8
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i64, align 8
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i64, align 8
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i64, align 8
+; CB1-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB8-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB16-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB32-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB64-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB128-NOT:      call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    [[CALL:%.*]] = call i32 @foo()
+; COMMON-NEXT:    ret i32 [[CALL]]
+entry:
+  %var1 = alloca i64, align 8
+  %var2 = alloca i64, align 8
+  %var3 = alloca i64, align 8
+  %var4 = alloca i64, align 8
+
+  %call = call i32 @foo()
+  ret i32 %call
+}
+
+; 36 byte stack of 1 4 byte scalar and 1 32 byte array
+define i32 @alloc4_32x1() {
+; COMMON-LABEL: define i32 @alloc4_32x1() {
+; COMMON-NEXT:  entry:
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i8, i32 32, align 4
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i32, align 4
+; CB1-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB8-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB16-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB32-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB64-NOT:       call void @__sanitizer_cov_stack_depth()
+; CB128-NOT:      call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    [[CALL:%.*]] = call i32 @foo()
+; COMMON-NEXT:    ret i32 [[CALL]]
+entry:
+  %stack_array1 = alloca i8, i32 32, align 4
+  %var1 = alloca i32, align 4
+
+  %call = call i32 @foo()
+  ret i32 %call
+}
+
+; 64 byte stack of 2 32 byte arrays
+define i32 @alloc0_32x2() {
+; COMMON-LABEL: define i32 @alloc0_32x2() {
+; COMMON-NEXT:  entry:
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i8, i32 32, align 4
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i8, i32 32, align 4
+; CB1-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB8-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB16-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB32-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB64-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB128-NOT:      call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    [[CALL:%.*]] = call i32 @foo()
+; COMMON-NEXT:    ret i32 [[CALL]]
+entry:
+  %stack_array1 = alloca i8, i32 32, align 4
+  %stack_array2 = alloca i8, i32 32, align 4
+
+  %call = call i32 @foo()
+  ret i32 %call
+}
+
+; 64 byte stack of 1 64 byte array
+define i32 @alloc0_64x1() {
+; COMMON-LABEL: define i32 @alloc0_64x1() {
+; COMMON-NEXT:  entry:
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i8, i32 64, align 4
+; CB1-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB8-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB16-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB32-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB64-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB128-NOT:      call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    [[CALL:%.*]] = call i32 @foo()
+; COMMON-NEXT:    ret i32 [[CALL]]
+entry:
+  %stack_array = alloca i8, i32 64, align 4
+
+  %call = call i32 @foo()
+  ret i32 %call
+}
+
+; dynamic stack sized by i32
+define i32 @alloc0_32xDyn(i32 %input) {
+; COMMON-LABEL: define i32 @alloc0_32xDyn(i32 %input) {
+; COMMON-NEXT:  entry:
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i8, i32 %input, align 4
+; CB1-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB8-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB16-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB32-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB64-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB128-NEXT:     call void @__sanitizer_cov_stack_depth()
+; COMMON-NEXT:    [[CALL:%.*]] = call i32 @foo()
+; COMMON-NEXT:    ret i32 [[CALL]]
+entry:
+  %stack_array1 = alloca i8, i32 %input, align 4
+
+  %call = call i32 @foo()
+  ret i32 %call
+}
+
+; true dynamic stack sized by i32, from C:
+; static int dyamic_alloca(int size)
+; {
+;   int array[size];
+;   return foo();
+; }
+define dso_local i32 @dynamic_alloca(i32 noundef %0) #0 {
+  %2 = alloca i32, align 4
+  %3 = alloca ptr, align 8
+  %4 = alloca i64, align 8
+  store i32 %0, ptr %2, align 4
+  %5 = load i32, ptr %2, align 4
+  %6 = zext i32 %5 to i64
+; COMMON-LABEL:   %7 = call ptr @llvm.stacksave
+; COMMON-NEXT:    store ptr %7, ptr %3, align 8
+; COMMON-NEXT:    [[VAR:%.*]] = alloca i32, i64 %6, align 16
+; CB1-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB8-NEXT:       call void @__sanitizer_cov_stack_depth()
+; CB16-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB32-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB64-NEXT:      call void @__sanitizer_cov_stack_depth()
+; CB128-NEXT:     call void @__sanitizer_cov_stack_depth()
+  %7 = call ptr @llvm.stacksave.p0()
+  store ptr %7, ptr %3, align 8
+  %8 = alloca i32, i64 %6, align 16
+  store i64 %6, ptr %4, align 8
+  %9 = call i32 @foo()
+  %10 = load ptr, ptr %3, align 8
+; COMMON-LABEL: call void @llvm.stackrestore
+; COMMON-NEXT: ret i32 %9
+  call void @llvm.stackrestore.p0(ptr %10)
+  ret i32 %9
+}
diff --git a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard.ll b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard.ll
index 5deb74e..5d46c23 100644
--- a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard.ll
+++ b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-guard.ll
@@ -1,8 +1,9 @@
-; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -mtriple=x86_64 -S | FileCheck %s --check-prefixes=CHECK,COMDAT,ELF
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -mtriple=x86_64 -S | FileCheck %s --check-prefixes=CHECK,CHECK-CTOR,COMDAT,ELF,ELF-CTOR
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -sanitizer-coverage-drop-ctors=1 -mtriple=x86_64 -S | FileCheck %s --check-prefixes=CHECK,COMDAT,ELF
 
-; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -mtriple=aarch64-apple-darwin -S | FileCheck %s --check-prefixes=CHECK,MACHO
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -mtriple=aarch64-apple-darwin -S | FileCheck %s --check-prefixes=CHECK,CHECK-CTOR,MACHO
 
-; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -mtriple=x86_64-windows -S | FileCheck %s --check-prefixes=CHECK,COMDAT,WIN
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard -mtriple=x86_64-windows -S | FileCheck %s --check-prefixes=CHECK,CHECK-CTOR,COMDAT,WIN
 
 ; COMDAT:     $foo = comdat nodeduplicate
 ; COMDAT:     $CallViaVptr = comdat nodeduplicate
@@ -20,7 +21,7 @@
 ; WIN-NEXT:   @__sancov_gen_.1 = private global [1 x i32] zeroinitializer, section ".SCOV$GM", comdat($CallViaVptr), align 4{{$}}
 ; WIN-NEXT:   @__sancov_gen_.2 = private global [1 x i32] zeroinitializer, section ".SCOV$GM", comdat($DirectBitcastCall), align 4{{$}}
 
-; ELF:        @llvm.used = appending global [1 x ptr] [ptr @sancov.module_ctor_trace_pc_guard]
+; ELF-CTOR:   @llvm.used = appending global [1 x ptr] [ptr @sancov.module_ctor_trace_pc_guard]
 ; ELF:        @llvm.compiler.used = appending global [3 x ptr] [ptr @__sancov_gen_, ptr @__sancov_gen_.1, ptr @__sancov_gen_.2], section "llvm.metadata"
 ; MACHO:      @llvm.used = appending global [4 x ptr] [ptr @sancov.module_ctor_trace_pc_guard, ptr @__sancov_gen_, ptr @__sancov_gen_.1, ptr @__sancov_gen_.2]
 ; MACHO-NOT:  @llvm.compiler.used =
@@ -73,7 +74,7 @@ define void @DirectBitcastCall() sanitize_address {
   ret void
 }
 
-; ELF-LABEL: define internal void @sancov.module_ctor_trace_pc_guard() #2 comdat {
+; ELF-CTOR-LABEL: define internal void @sancov.module_ctor_trace_pc_guard() #2 comdat {
 ; MACHO-LABEL: define internal void @sancov.module_ctor_trace_pc_guard() #2 {
 
-; CHECK: attributes #2 = { nounwind }
+; CHECK-CTOR: attributes #2 = { nounwind }
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index 099916f..29838af 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -495,5 +495,5 @@ v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], s[0:15], v[6:21], v16
 // GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
 v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[6:21], s[0:15], v16
 
-// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v4, v5
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
 v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3], v4, v5
diff --git a/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt
index b0f3a8a..1a94342 100644
--- a/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt
+++ b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt
@@ -10,4 +10,4 @@
 0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04
 
 # GFX950: warning: invalid instruction encoding
-0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04
-\ No newline at end of file
+0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04
diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s
index bd8ecd0..b23ae44 100644
--- a/llvm/test/MC/LoongArch/Relocations/relocations.s
+++ b/llvm/test/MC/LoongArch/Relocations/relocations.s
@@ -23,7 +23,7 @@ bnez $t1, %b21(foo)
 
 bl %plt(foo)
 # RELOC: R_LARCH_B26
-# INSTR: bl %plt(foo)
+# INSTR: bl foo
 
 bl foo
 # RELOC: R_LARCH_B26
diff --git a/llvm/test/Transforms/Attributor/nofpclass-minimumnum-maximumnum.ll b/llvm/test/Transforms/Attributor/nofpclass-minimumnum-maximumnum.ll
new file mode 100644
index 0000000..f21e9af
--- /dev/null
+++ b/llvm/test/Transforms/Attributor/nofpclass-minimumnum-maximumnum.ll
@@ -0,0 +1,796 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -S < %s | FileCheck %s --check-prefixes=CHECK,TUNIT
+
+declare float @llvm.minimumnum.f32(float, float)
+declare float @llvm.maximumnum.f32(float, float)
+declare <2 x float> @llvm.minimumnum.v2f32(<2 x float>, <2 x float>)
+
+define float @ret_minimumnum(float %arg0, float %arg1) #0 {
+; CHECK-LABEL: define float @ret_minimumnum
+; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9:[0-9]+]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_noinf__noinf(float nofpclass(inf) %arg0, float nofpclass(inf) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(inf) float @ret_minimumnum_noinf__noinf
+; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(inf) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf) float @llvm.minimumnum.f32(float nofpclass(inf) [[ARG0]], float nofpclass(inf) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_noinf__nonan(float nofpclass(inf) %arg0, float nofpclass(nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_noinf__nonan
+; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(inf) [[ARG0]], float nofpclass(nan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nonan__noinf(float nofpclass(nan) %arg0, float nofpclass(inf) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_nonan__noinf
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(inf) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(nan) [[ARG0]], float nofpclass(inf) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_noinf_nonan__nonan(float nofpclass(inf nan) %arg0, float nofpclass(nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_noinf_nonan__nonan
+; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(nan inf) [[ARG0]], float nofpclass(nan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nonan__noinf_nonan(float nofpclass(nan) %arg0, float nofpclass(inf nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_nonan__noinf_nonan
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(nan) [[ARG0]], float nofpclass(nan inf) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_norm_zero__norm_sub(float nofpclass(norm zero) %arg0, float nofpclass(norm sub) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(norm) float @ret_minimumnum_norm_zero__norm_sub
+; CHECK-SAME: (float nofpclass(zero norm) [[ARG0:%.*]], float nofpclass(sub norm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(norm) float @llvm.minimumnum.f32(float nofpclass(zero norm) [[ARG0]], float nofpclass(sub norm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum(float %arg0, float %arg1) #0 {
+; CHECK-LABEL: define float @ret_maximumnum
+; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_noinf__noinf(float nofpclass(inf) %arg0, float nofpclass(inf) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(inf) float @ret_maximumnum_noinf__noinf
+; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(inf) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf) float @llvm.maximumnum.f32(float nofpclass(inf) [[ARG0]], float nofpclass(inf) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_noinf__nonan(float nofpclass(inf) %arg0, float nofpclass(nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_noinf__nonan
+; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(inf) [[ARG0]], float nofpclass(nan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nonan__noinf(float nofpclass(nan) %arg0, float nofpclass(inf) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_nonan__noinf
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(inf) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(nan) [[ARG0]], float nofpclass(inf) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_noinf_nonan__nonan(float nofpclass(inf nan) %arg0, float nofpclass(nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_noinf_nonan__nonan
+; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(nan inf) [[ARG0]], float nofpclass(nan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nonan__noinf_nonan(float nofpclass(nan) %arg0, float nofpclass(inf nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_nonan__noinf_nonan
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(nan) [[ARG0]], float nofpclass(nan inf) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_norm_zero__norm_sub(float nofpclass(norm zero) %arg0, float nofpclass(norm sub) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(norm) float @ret_maximumnum_norm_zero__norm_sub
+; CHECK-SAME: (float nofpclass(zero norm) [[ARG0:%.*]], float nofpclass(sub norm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(norm) float @llvm.maximumnum.f32(float nofpclass(zero norm) [[ARG0]], float nofpclass(sub norm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_daz_daz(float %arg0, float %arg1) #1 {
+; CHECK-LABEL: define float @ret_minimumnum_daz_daz
+; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_dapz_dappz(float %arg0, float %arg1) #2 {
+; CHECK-LABEL: define float @ret_minimumnum_dapz_dappz
+; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+define float @ret_minimumnum_dynamic_dynamic(float %arg0, float %arg1) #3 {
+; CHECK-LABEL: define float @ret_minimumnum_dynamic_dynamic
+; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_noinf_nozero__noinf_nozero(float nofpclass(inf zero) %arg0, float nofpclass(inf zero) %arg1) #1 {
+; CHECK-LABEL: define nofpclass(inf zero) float @ret_minimumnum_noinf_nozero__noinf_nozero
+; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]], float nofpclass(inf zero) [[ARG1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf zero) float @llvm.minimumnum.f32(float nofpclass(inf zero) [[ARG0]], float nofpclass(inf zero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define <2 x float> @ret_minimumnum_noinf_nozero__noinf_nozero_v2f32(<2 x float> nofpclass(inf zero) %arg0, <2 x float> nofpclass(inf zero) %arg1) #1 {
+; CHECK-LABEL: define nofpclass(inf zero) <2 x float> @ret_minimumnum_noinf_nozero__noinf_nozero_v2f32
+; CHECK-SAME: (<2 x float> nofpclass(inf zero) [[ARG0:%.*]], <2 x float> nofpclass(inf zero) [[ARG1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(inf zero) <2 x float> @llvm.minimumnum.v2f32(<2 x float> nofpclass(inf zero) [[ARG0]], <2 x float> nofpclass(inf zero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret <2 x float> [[CALL]]
+;
+  %call = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %arg0, <2 x float> %arg1)
+  ret <2 x float> %call
+}
+
+define float @ret_minimumnum_daz_daz_nozero__nozero(float nofpclass(zero) %arg0, float nofpclass(zero) %arg1) #1 {
+; CHECK-LABEL: define nofpclass(zero) float @ret_minimumnum_daz_daz_nozero__nozero
+; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero) float @llvm.minimumnum.f32(float nofpclass(zero) [[ARG0]], float nofpclass(zero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_dapz_dapz_nozero__nozero(float nofpclass(zero) %arg0, float nofpclass(zero) %arg1) #2 {
+; CHECK-LABEL: define nofpclass(zero) float @ret_minimumnum_dapz_dapz_nozero__nozero
+; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero) float @llvm.minimumnum.f32(float nofpclass(zero) [[ARG0]], float nofpclass(zero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_dynamic_dynamic_nozero__nozero(float nofpclass(zero) %arg0, float nofpclass(zero) %arg1) #3 {
+; CHECK-LABEL: define nofpclass(zero) float @ret_minimumnum_dynamic_dynamic_nozero__nozero
+; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero) float @llvm.minimumnum.f32(float nofpclass(zero) [[ARG0]], float nofpclass(zero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_daz_daz_nozero_nosub__nozero_nosub(float nofpclass(zero sub) %arg0, float nofpclass(zero sub) %arg1) #1 {
+; CHECK-LABEL: define nofpclass(zero sub) float @ret_minimumnum_daz_daz_nozero_nosub__nozero_nosub
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]], float nofpclass(zero sub) [[ARG1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero sub) float @llvm.minimumnum.f32(float nofpclass(zero sub) [[ARG0]], float nofpclass(zero sub) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_dynamic_dynamic_nozero_nosub__nozero_nosub(float nofpclass(zero sub) %arg0, float nofpclass(zero sub) %arg1) #3 {
+; CHECK-LABEL: define nofpclass(zero sub) float @ret_minimumnum_dynamic_dynamic_nozero_nosub__nozero_nosub
+; CHECK-SAME: (float nofpclass(zero sub) [[ARG0:%.*]], float nofpclass(zero sub) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero sub) float @llvm.minimumnum.f32(float nofpclass(zero sub) [[ARG0]], float nofpclass(zero sub) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_daz_daz_nopzero__nopzero(float nofpclass(pzero) %arg0, float nofpclass(pzero) %arg1) #1 {
+; CHECK-LABEL: define float @ret_minimumnum_daz_daz_nopzero__nopzero
+; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]], float nofpclass(pzero) [[ARG1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float nofpclass(pzero) [[ARG0]], float nofpclass(pzero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_daz_daz_nonzero__nonzero(float nofpclass(nzero) %arg0, float nofpclass(nzero) %arg1) #1 {
+; CHECK-LABEL: define float @ret_minimumnum_daz_daz_nonzero__nonzero
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]], float nofpclass(nzero) [[ARG1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float nofpclass(nzero) [[ARG0]], float nofpclass(nzero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_daz_daz_nonzero_nonsub__nonzero_nonsub(float nofpclass(nzero nsub) %arg0, float nofpclass(nzero nsub) %arg1) #1 {
+; CHECK-LABEL: define nofpclass(nsub) float @ret_minimumnum_daz_daz_nonzero_nonsub__nonzero_nonsub
+; CHECK-SAME: (float nofpclass(nzero nsub) [[ARG0:%.*]], float nofpclass(nzero nsub) [[ARG1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nsub) float @llvm.minimumnum.f32(float nofpclass(nzero nsub) [[ARG0]], float nofpclass(nzero nsub) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_dapz_dapz_nopzero__nopzero(float nofpclass(pzero) %arg0, float nofpclass(pzero) %arg1) #2 {
+; CHECK-LABEL: define float @ret_minimumnum_dapz_dapz_nopzero__nopzero
+; CHECK-SAME: (float nofpclass(pzero) [[ARG0:%.*]], float nofpclass(pzero) [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float nofpclass(pzero) [[ARG0]], float nofpclass(pzero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_dapz_dapz_nopzero_nopsub__nopzero_nopsub(float nofpclass(pzero psub) %arg0, float nofpclass(pzero psub) %arg1) #2 {
+; CHECK-LABEL: define nofpclass(psub) float @ret_minimumnum_dapz_dapz_nopzero_nopsub__nopzero_nopsub
+; CHECK-SAME: (float nofpclass(pzero psub) [[ARG0:%.*]], float nofpclass(pzero psub) [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(psub) float @llvm.minimumnum.f32(float nofpclass(pzero psub) [[ARG0]], float nofpclass(pzero psub) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_dapz_dapz_nonzero__nonzero(float nofpclass(nzero) %arg0, float nofpclass(nzero) %arg1) #2 {
+; CHECK-LABEL: define float @ret_minimumnum_dapz_dapz_nonzero__nonzero
+; CHECK-SAME: (float nofpclass(nzero) [[ARG0:%.*]], float nofpclass(nzero) [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float nofpclass(nzero) [[ARG0]], float nofpclass(nzero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_ieee_daz_nozero__nozero(float nofpclass(zero) %arg0, float nofpclass(zero) %arg1) #4 {
+; CHECK-LABEL: define nofpclass(zero) float @ret_minimumnum_ieee_daz_nozero__nozero
+; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero) float @llvm.minimumnum.f32(float nofpclass(zero) [[ARG0]], float nofpclass(zero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_daz_ieee_nozero__nozero(float nofpclass(zero) %arg0, float nofpclass(zero) %arg1) #5 {
+; CHECK-LABEL: define nofpclass(zero) float @ret_minimumnum_daz_ieee_nozero__nozero
+; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero) float @llvm.minimumnum.f32(float nofpclass(zero) [[ARG0]], float nofpclass(zero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_ieee_dapz_nozero__nozero(float nofpclass(zero) %arg0, float nofpclass(zero) %arg1) #6 {
+; CHECK-LABEL: define nofpclass(zero) float @ret_minimumnum_ieee_dapz_nozero__nozero
+; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero) float @llvm.minimumnum.f32(float nofpclass(zero) [[ARG0]], float nofpclass(zero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_dapz_ieee_nozero__nozero(float nofpclass(zero) %arg0, float nofpclass(zero) %arg1) #7 {
+; CHECK-LABEL: define nofpclass(zero) float @ret_minimumnum_dapz_ieee_nozero__nozero
+; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(zero) float @llvm.minimumnum.f32(float nofpclass(zero) [[ARG0]], float nofpclass(zero) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_noneg_nan__any(float nofpclass(ninf nsub nnorm nan) %arg0, float %arg1) #3 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_noneg_nan__any
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(nan ninf nsub nnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_any__noneg_nan(float %arg0, float nofpclass(ninf nsub nnorm nan) %arg1) #3 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_any__noneg_nan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float [[ARG0]], float nofpclass(nan ninf nsub nnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nopos_nan__any(float nofpclass(pinf psub pnorm nan) %arg0, float %arg1) #3 {
+; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_minimumnum_nopos_nan__any
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf psub pnorm) float @llvm.minimumnum.f32(float nofpclass(nan pinf psub pnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_any__nopos_nan(float %arg0, float nofpclass(pinf psub pnorm nan) %arg1) #3 {
+; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_minimumnum_any__nopos_nan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf psub pnorm) float @llvm.minimumnum.f32(float [[ARG0]], float nofpclass(nan pinf psub pnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_noneg__any(float nofpclass(ninf nsub nnorm) %arg0, float %arg1) #3 {
+; CHECK-LABEL: define float @ret_minimumnum_noneg__any
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float nofpclass(ninf nsub nnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_any__noneg(float %arg0, float nofpclass(ninf nsub nnorm) %arg1) #3 {
+; CHECK-LABEL: define float @ret_minimumnum_any__noneg
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float [[ARG0]], float nofpclass(ninf nsub nnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nopos__any(float nofpclass(pinf psub pnorm) %arg0, float %arg1) #3 {
+; CHECK-LABEL: define float @ret_minimumnum_nopos__any
+; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float nofpclass(pinf psub pnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_any__nopos(float %arg0, float nofpclass(pinf psub pnorm) %arg1) #3 {
+; CHECK-LABEL: define float @ret_minimumnum_any__nopos
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float [[ARG0]], float nofpclass(pinf psub pnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_noneg_nan__any(float nofpclass(ninf nsub nnorm nan) %arg0, float %arg1) #3 {
+; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_maximumnum_noneg_nan__any
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nsub nnorm) float @llvm.maximumnum.f32(float nofpclass(nan ninf nsub nnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_any__noneg_nan(float %arg0, float nofpclass(ninf nsub nnorm nan) %arg1) #3 {
+; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_maximumnum_any__noneg_nan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan ninf nsub nnorm) float @llvm.maximumnum.f32(float [[ARG0]], float nofpclass(nan ninf nsub nnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nopos_nan__any(float nofpclass(pinf psub pnorm nan) %arg0, float %arg1) #3 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_nopos_nan__any
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(nan pinf psub pnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_any__nopos_nan(float %arg0, float nofpclass(pinf psub pnorm nan) %arg1) #3 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_any__nopos_nan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float [[ARG0]], float nofpclass(nan pinf psub pnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_noneg__any(float nofpclass(ninf nsub nnorm) %arg0, float %arg1) #3 {
+; CHECK-LABEL: define float @ret_maximumnum_noneg__any
+; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float nofpclass(ninf nsub nnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_any__noneg(float %arg0, float nofpclass(ninf nsub nnorm) %arg1) #3 {
+; CHECK-LABEL: define float @ret_maximumnum_any__noneg
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float [[ARG0]], float nofpclass(ninf nsub nnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nopos__any(float nofpclass(pinf psub pnorm) %arg0, float %arg1) #3 {
+; CHECK-LABEL: define float @ret_maximumnum_nopos__any
+; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float nofpclass(pinf psub pnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_any__nopos(float %arg0, float nofpclass(pinf psub pnorm) %arg1) #3 {
+; CHECK-LABEL: define float @ret_maximumnum_any__nopos
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float [[ARG0]], float nofpclass(pinf psub pnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nopos__nonan(float nofpclass(pinf pnorm psub pzero) %arg0, float nofpclass(nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_nopos__nonan
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(pinf pzero psub pnorm) [[ARG0]], float nofpclass(nan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nonan__nopos(float nofpclass(nan) %arg0, float nofpclass(pinf pnorm psub pzero) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_nonan__nopos
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(pinf pzero psub pnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(nan) [[ARG0]], float nofpclass(pinf pzero psub pnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nopos_nonan__any(float nofpclass(nan pinf pnorm psub pzero) %arg0, float %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_minimumnum_nopos_nonan__any
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero psub pnorm) float @llvm.minimumnum.f32(float nofpclass(nan pinf pzero psub pnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_any__nopos_nonan(float %arg0, float nofpclass(nan pinf pnorm psub pzero) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_minimumnum_any__nopos_nonan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan pinf pzero psub pnorm) float @llvm.minimumnum.f32(float [[ARG0]], float nofpclass(nan pinf pzero psub pnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_noneg__nonan(float nofpclass(ninf nnorm nsub nzero) %arg0, float nofpclass(nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_noneg__nonan
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(ninf nzero nsub nnorm) [[ARG0]], float nofpclass(nan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nonan__noneg(float nofpclass(nan) %arg0, float nofpclass(ninf nnorm nsub nzero) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_nonan__noneg
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(nan) [[ARG0]], float nofpclass(ninf nzero nsub nnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_noneg_nonan__any(float nofpclass(nan ninf nnorm nsub nzero) %arg0, float %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_noneg_nonan__any
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float nofpclass(nan ninf nzero nsub nnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_any__noneg_nonan(float %arg0, float nofpclass(nan ninf nnorm nsub nzero) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_minimumnum_any__noneg_nonan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.minimumnum.f32(float [[ARG0]], float nofpclass(nan ninf nzero nsub nnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nopos__nonan(float nofpclass(pinf pnorm psub pzero) %arg0, float nofpclass(nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_nopos__nonan
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(pinf pzero psub pnorm) [[ARG0]], float nofpclass(nan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nonan__nopos(float nofpclass(nan) %arg0, float nofpclass(pinf pnorm psub pzero) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_nonan__nopos
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(pinf pzero psub pnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(nan) [[ARG0]], float nofpclass(pinf pzero psub pnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nopos_nonan__any(float nofpclass(nan pinf pnorm psub pzero) %arg0, float %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_nopos_nonan__any
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(nan pinf pzero psub pnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_any__nopos_nonan(float %arg0, float nofpclass(nan pinf pnorm psub pzero) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_any__nopos_nonan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float [[ARG0]], float nofpclass(nan pinf pzero psub pnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_noneg__nonan(float nofpclass(ninf nnorm nsub nzero) %arg0, float nofpclass(nan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_noneg__nonan
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(ninf nzero nsub nnorm) [[ARG0]], float nofpclass(nan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nonan__noneg(float nofpclass(nan) %arg0, float nofpclass(ninf nnorm nsub nzero) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(nan) float @ret_maximumnum_nonan__noneg
+; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(nan) float @llvm.maximumnum.f32(float nofpclass(nan) [[ARG0]], float nofpclass(ninf nzero nsub nnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_noneg_nonan__any(float nofpclass(ninf nnorm nsub nzero) %arg0, float %arg1) #0 {
+; CHECK-LABEL: define float @ret_maximumnum_noneg_nonan__any
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float nofpclass(ninf nzero nsub nnorm) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_any__noneg_nonan(float %arg0, float nofpclass(ninf nnorm nsub nzero) %arg1) #0 {
+; CHECK-LABEL: define float @ret_maximumnum_any__noneg_nonan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float [[ARG0]], float nofpclass(ninf nzero nsub nnorm) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nosnan__any(float nofpclass(snan) %arg0, float %arg1) #0 {
+; CHECK-LABEL: define float @ret_minimumnum_nosnan__any
+; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float nofpclass(snan) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_any__nosnan(float %arg0, float nofpclass(snan) %arg1) #0 {
+; CHECK-LABEL: define float @ret_minimumnum_any__nosnan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(snan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float [[ARG0]], float nofpclass(snan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nosnan__any(float nofpclass(snan) %arg0, float %arg1) #0 {
+; CHECK-LABEL: define float @ret_maximumnum_nosnan__any
+; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float nofpclass(snan) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_any__nosnan(float %arg0, float nofpclass(snan) %arg1) #0 {
+; CHECK-LABEL: define float @ret_maximumnum_any__nosnan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(snan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float [[ARG0]], float nofpclass(snan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_noqnan__any(float nofpclass(qnan) %arg0, float %arg1) #0 {
+; CHECK-LABEL: define float @ret_minimumnum_noqnan__any
+; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float nofpclass(qnan) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_any__noqnan(float %arg0, float nofpclass(qnan) %arg1) #0 {
+; CHECK-LABEL: define float @ret_minimumnum_any__noqnan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(qnan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float [[ARG0]], float nofpclass(qnan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_noqnan__any(float nofpclass(qnan) %arg0, float %arg1) #0 {
+; CHECK-LABEL: define float @ret_maximumnum_noqnan__any
+; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float nofpclass(qnan) [[ARG0]], float [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_any__noqnan(float %arg0, float nofpclass(qnan) %arg1) #0 {
+; CHECK-LABEL: define float @ret_maximumnum_any__noqnan
+; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(qnan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.maximumnum.f32(float [[ARG0]], float nofpclass(qnan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nosnan__nosnan(float nofpclass(snan) %arg0, float nofpclass(snan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(snan) float @ret_minimumnum_nosnan__nosnan
+; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]], float nofpclass(snan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan) float @llvm.minimumnum.f32(float nofpclass(snan) [[ARG0]], float nofpclass(snan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_nosnan__nosnan(float nofpclass(snan) %arg0, float nofpclass(snan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(snan) float @ret_maximumnum_nosnan__nosnan
+; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]], float nofpclass(snan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(snan) float @llvm.maximumnum.f32(float nofpclass(snan) [[ARG0]], float nofpclass(snan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_minimumnum_nosnan__noqnan(float nofpclass(snan) %arg0, float nofpclass(qnan) %arg1) #0 {
+; CHECK-LABEL: define float @ret_minimumnum_nosnan__noqnan
+; CHECK-SAME: (float nofpclass(snan) [[ARG0:%.*]], float nofpclass(qnan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.minimumnum.f32(float nofpclass(snan) [[ARG0]], float nofpclass(qnan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.minimumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+define float @ret_maximumnum_noqnan__nosnan(float nofpclass(qnan) %arg0, float nofpclass(qnan) %arg1) #0 {
+; CHECK-LABEL: define nofpclass(qnan) float @ret_maximumnum_noqnan__nosnan
+; CHECK-SAME: (float nofpclass(qnan) [[ARG0:%.*]], float nofpclass(qnan) [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call nofpclass(qnan) float @llvm.maximumnum.f32(float nofpclass(qnan) [[ARG0]], float nofpclass(qnan) [[ARG1]]) #[[ATTR9]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %call = call float @llvm.maximumnum.f32(float %arg0, float %arg1)
+  ret float %call
+}
+
+attributes #0 = { "denormal-fp-math"="ieee,ieee" }
+attributes #1 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
+attributes #2 = { "denormal-fp-math"="positive-zero,positive-zero" }
+attributes #3 = { "denormal-fp-math"="dynamic,dynamic" }
+attributes #4 = { "denormal-fp-math"="ieee,preserve-sign" }
+attributes #5 = { "denormal-fp-math"="preserve-sign,ieee" }
+attributes #6 = { "denormal-fp-math"="ieee,positive-zero" }
+attributes #7 = { "denormal-fp-math"="positive-zero,ieee" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; TUNIT: {{.*}}
diff --git a/llvm/test/Transforms/HipStdPar/allocation-interposition.ll b/llvm/test/Transforms/HipStdPar/allocation-interposition.ll
index 9ec284b..bdc9951 100644
--- a/llvm/test/Transforms/HipStdPar/allocation-interposition.ll
+++ b/llvm/test/Transforms/HipStdPar/allocation-interposition.ll
@@ -16,6 +16,16 @@ declare void @__hipstdpar_hidden_free(ptr)
 
 declare ptr @__hipstdpar_hidden_malloc(i64)
 
+declare ptr @__hipstdpar_hidden_memalign(i64, i64)
+
+declare ptr @__hipstdpar_hidden_mmap(ptr, i64, i32, i32, i32, i64)
+
+declare i32 @__hipstdpar_hidden_munmap(ptr, i64)
+
+declare ptr @__hipstdpar_mmap(ptr, i64, i32, i32, i32, i64)
+
+declare i32 @__hipstdpar_munmap(ptr, i64)
+
 declare ptr @__hipstdpar_realloc(ptr, i64)
 
 declare ptr @__hipstdpar_realloc_array(ptr, i64, i64)
@@ -171,7 +181,21 @@ define dso_local noundef i32 @allocs() {
   ; CHECK: call void @__hipstdpar_free(ptr noundef %28)
   call void @__libc_free(ptr noundef %28)
 
-  ret i32 0
+  ; CHECK: %29 = call ptr @__libc_malloc(i64 noundef 8)
+  %29 = call ptr @__hipstdpar_hidden_malloc(i64 noundef 8)
+  ; CHECK: call void @__libc_free(ptr noundef %29)
+  call void @__hipstdpar_hidden_free(ptr noundef %29)
+
+  ; CHECK: %30 = call ptr @__libc_memalign(i64 noundef 8, i64 noundef 4)
+  %30 = call ptr @__hipstdpar_hidden_memalign(i64 noundef 8, i64 noundef 4)
+  ; CHECK: %31 = call ptr @mmap(ptr %30, i64 8, i32 0, i32 0, i32 0, i64 0)
+  %31 = call ptr @__hipstdpar_hidden_mmap(ptr %30, i64 8, i32 0, i32 0, i32 0, i64 0)
+  ; CHECK: %32 = call i32 @munmap(ptr %31, i64 8)
+  %32 = call i32 @__hipstdpar_hidden_munmap(ptr %31, i64 8)
+  ; CHECK: call void @__libc_free(ptr noundef %30)
+  call void @__hipstdpar_hidden_free(ptr noundef %30)
+
+  ret i32 %32
 }
 
 declare noalias ptr @aligned_alloc(i64 noundef, i64 noundef)
@@ -220,4 +244,8 @@ declare void @__libc_free(ptr noundef)
 
 declare ptr @__libc_malloc(i64 noundef)
 
-declare ptr @__libc_memalign(i64 noundef, i64 noundef)
-\ No newline at end of file
+declare ptr @__libc_memalign(i64 noundef, i64 noundef)
+
+declare ptr @mmap(ptr noundef, i64 noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef)
+
+declare i32 @munmap(ptr noundef, i64 noundef)
diff --git a/llvm/test/Transforms/InstSimplify/frexp.ll b/llvm/test/Transforms/InstCombine/frexp.ll
index 34cfce9..6541f0d 100644
--- a/llvm/test/Transforms/InstSimplify/frexp.ll
+++ b/llvm/test/Transforms/InstCombine/frexp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -passes=instsimplify %s | FileCheck %s
+; RUN: opt -S -passes=instcombine %s | FileCheck %s
 
 declare { float, i32 } @llvm.frexp.f32.i32(float)
 declare { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float>)
@@ -12,7 +12,8 @@ define { float, i32 } @frexp_frexp(float %x) {
 ; CHECK-LABEL: define { float, i32 } @frexp_frexp(
 ; CHECK-SAME: float [[X:%.*]]) {
 ; CHECK-NEXT:    [[FREXP0:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]])
-; CHECK-NEXT:    ret { float, i32 } [[FREXP0]]
+; CHECK-NEXT:    [[FREXP1:%.*]] = insertvalue { float, i32 } [[FREXP0]], i32 0, 1
+; CHECK-NEXT:    ret { float, i32 } [[FREXP1]]
 ;
   %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float %x)
   %frexp0.0 = extractvalue { float, i32 } %frexp0, 0
@@ -24,7 +25,8 @@ define { <2 x float>, <2 x i32> } @frexp_frexp_vector(<2 x float> %x) {
 ; CHECK-LABEL: define { <2 x float>, <2 x i32> } @frexp_frexp_vector(
 ; CHECK-SAME: <2 x float> [[X:%.*]]) {
 ; CHECK-NEXT:    [[FREXP0:%.*]] = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> [[X]])
-; CHECK-NEXT:    ret { <2 x float>, <2 x i32> } [[FREXP0]]
+; CHECK-NEXT:    [[FREXP1:%.*]] = insertvalue { <2 x float>, <2 x i32> } [[FREXP0]], <2 x i32> zeroinitializer, 1
+; CHECK-NEXT:    ret { <2 x float>, <2 x i32> } [[FREXP1]]
 ;
   %frexp0 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %x)
   %frexp0.0 = extractvalue { <2 x float>, <2 x i32> } %frexp0, 0
@@ -47,7 +49,8 @@ define { <vscale x 2 x float>, <vscale x 2 x i32> } @frexp_frexp_scalable_vector
 ; CHECK-LABEL: define { <vscale x 2 x float>, <vscale x 2 x i32> } @frexp_frexp_scalable_vector(
 ; CHECK-SAME: <vscale x 2 x float> [[X:%.*]]) {
 ; CHECK-NEXT:    [[FREXP0:%.*]] = call { <vscale x 2 x float>, <vscale x 2 x i32> } @llvm.frexp.nxv2f32.nxv2i32(<vscale x 2 x float> [[X]])
-; CHECK-NEXT:    ret { <vscale x 2 x float>, <vscale x 2 x i32> } [[FREXP0]]
+; CHECK-NEXT:    [[FREXP1:%.*]] = insertvalue { <vscale x 2 x float>, <vscale x 2 x i32> } [[FREXP0]], <vscale x 2 x i32> zeroinitializer, 1
+; CHECK-NEXT:    ret { <vscale x 2 x float>, <vscale x 2 x i32> } [[FREXP1]]
 ;
   %frexp0 = call { <vscale x 2 x float>, <vscale x 2 x i32> } @llvm.frexp.nxv2f32.nxv2i32(<vscale x 2 x float> %x)
   %frexp0.0 = extractvalue { <vscale x 2 x float>, <vscale x 2 x i32> } %frexp0, 0
diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll
index 3ff4f9a..862853f 100644
--- a/llvm/test/Transforms/InstCombine/fsh.ll
+++ b/llvm/test/Transforms/InstCombine/fsh.ll
@@ -1084,3 +1084,72 @@ define i8 @fshl_range_trunc(i1 %x) {
   %tr = trunc nsw i32 %fshl to i8
   ret i8 %tr
 }
+
+;; Issue #138334 negative rotate amounts can be folded into the opposite direction
+define i32 @fshl_neg_amount(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_neg_amount(
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshr.i32(i32 [[X:%.*]], i32 [[X]], i32 [[Y:%.*]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %n = sub i32 0, %y
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %n)
+  ret i32 %r
+}
+
+define i32 @fshr_neg_amount(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshr_neg_amount(
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 [[Y:%.*]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %n = sub i32 0, %y
+  %r = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 %n)
+  ret i32 %r
+}
+
+;; negative test, funnel shift is not a rotate
+
+define i32 @fshl_neg_amount_non_rotate(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @fshl_neg_amount_non_rotate(
+; CHECK-NEXT:    [[N:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Z:%.*]], i32 [[N]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %n = sub i32 0, %y
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %z, i32 %n)
+  ret i32 %r
+}
+
+define i32 @fshr_neg_amount_non_rotate(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @fshr_neg_amount_non_rotate(
+; CHECK-NEXT:    [[N:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.fshr.i32(i32 [[X:%.*]], i32 [[Z:%.*]], i32 [[N]])
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %n = sub i32 0, %y
+  %r = call i32 @llvm.fshr.i32(i32 %x, i32 %z, i32 %n)
+  ret i32 %r
+}
+
+;; negative test, bitwidth is not a power of two
+
+define i31 @fshl_neg_amount_non_power_two(i31 %x, i31 %y) {
+; CHECK-LABEL: @fshl_neg_amount_non_power_two(
+; CHECK-NEXT:    [[N:%.*]] = sub i31 0, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = call i31 @llvm.fshl.i31(i31 [[X:%.*]], i31 [[X]], i31 [[N]])
+; CHECK-NEXT:    ret i31 [[R]]
+;
+  %n = sub i31 0, %y
+  %r = call i31 @llvm.fshl.i31(i31 %x, i31 %x, i31 %n)
+  ret i31 %r
+}
+
+define i31 @fshr_neg_amount_non_power_two(i31 %x, i31 %y) {
+; CHECK-LABEL: @fshr_neg_amount_non_power_two(
+; CHECK-NEXT:    [[N:%.*]] = sub i31 0, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = call i31 @llvm.fshr.i31(i31 [[X:%.*]], i31 [[X]], i31 [[N]])
+; CHECK-NEXT:    ret i31 [[R]]
+;
+  %n = sub i31 0, %y
+  %r = call i31 @llvm.fshr.i31(i31 %x, i31 %x, i31 %n)
+  ret i31 %r
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll
index 67a2cf2..91d5c52 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll
@@ -1,4 +1,4 @@
-; RUN: opt -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s
+; RUN: opt -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll
index d0d0d78..bdb89fb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-finite.ll
@@ -1,4 +1,4 @@
-; RUN: opt -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s
+; RUN: opt -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll
index 7a0e44c..e066130 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls.ll
@@ -1,4 +1,4 @@
-; RUN: opt -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s
+; RUN: opt -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index b7eef89..a1f660d 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -1,12 +1,15 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=SVML -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,SVML
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=AMDLIBM -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,AMDLIBM
 ; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -vector-library=MASSV -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,MASSV
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,LIBMVEC-X86
+; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=LIBMVEC-AARCH64
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,LIBMVEC-X86
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=Accelerate -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,ACCELERATE
 ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,SLEEFGNUABI
 ; RUN: opt -mtriple=riscv64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,SLEEFGNUABI_RISCV
 ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=ArmPL -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,ARMPL
 
+; LIBMVEC-AARCH64-NOT: llvm.compiler.used
+
 ; COMMON-LABEL: @llvm.compiler.used = appending global
 ; SVML-SAME:        [6 x ptr] [
 ; SVML-SAME:          ptr @__svml_sin2,
@@ -193,6 +196,9 @@ declare float @llvm.log10.f32(float) #0
 ; MASSV: declare <2 x double> @__sind2(<2 x double>)
 ; MASSV: declare <4 x float> @__log10f4(<4 x float>)
 
+; LIBMVEC-AARCH64-NOT: declare <2 x double> @_ZGVbN2v_sin(<2 x double>)
+; LIBMVEC-AARCH64-NOT: declare <4 x double> @_ZGVdN4v_sin(<4 x double>)
+
 ; LIBMVEC-X86: declare <2 x double> @_ZGVbN2v_sin(<2 x double>)
 ; LIBMVEC-X86: declare <4 x double> @_ZGVdN4v_sin(<4 x double>)
 
diff --git a/llvm/test/tools/llvm-diff/uselistorder-issue58629-gv.ll b/llvm/test/tools/llvm-diff/uselistorder-issue58629-gv.ll
deleted file mode 100644
index 33216bb..0000000
--- a/llvm/test/tools/llvm-diff/uselistorder-issue58629-gv.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llvm-diff %s %s | count 0
-; Make sure there is no error produced by using uselistorder with two
-; modules using the same constant/global in the same context.
-
-@gv = addrspace(4) global [2 x i64] zeroinitializer, align 16
-
-define void @func() {
-entry:
-  %gep0 = getelementptr inbounds i8, ptr addrspace(4) @gv, i64 12
-  %gep1 = getelementptr i8, ptr addrspace(4) @gv, i64 4
-  ret void
-}
-
-uselistorder ptr addrspace(4) @gv, { 1, 0 }
diff --git a/llvm/test/tools/llvm-diff/uselistorder-issue58629.ll b/llvm/test/tools/llvm-diff/uselistorder-issue58629.ll
index d50b0dc..e89fc7a 100644
--- a/llvm/test/tools/llvm-diff/uselistorder-issue58629.ll
+++ b/llvm/test/tools/llvm-diff/uselistorder-issue58629.ll
@@ -1,6 +1,5 @@
-; RUN: llvm-diff %s %s | count 0
-; Make sure there is no error produced by using uselistorder with two
-; modules using the same constant in the same context.
+; XFAIL: *
+; RUN: llvm-diff %s %s
 
 define void @func() {
 entry:
diff --git a/llvm/test/tools/llvm-reduce/bitcode-uselistorder.ll b/llvm/test/tools/llvm-reduce/bitcode-uselistorder.ll
index 4e8d1cf..ac98d75 100644
--- a/llvm/test/tools/llvm-reduce/bitcode-uselistorder.ll
+++ b/llvm/test/tools/llvm-reduce/bitcode-uselistorder.ll
@@ -11,21 +11,20 @@
 
 ; RUN: FileCheck -check-prefix=RESULT %s < %t.reduced.ll
 
-@gv = external global i32, align 4
 
-; INTERESTING: getelementptr
-; INTERESTING: getelementptr
-; INTERESTING: getelementptr
-define ptr @func(i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
+; INTERESTING: add
+; INTERESTING: add
+; INTERESTING: add
+define i32 @func(i32 %arg0, i32 %arg1) {
 entry:
-  %add0 = getelementptr i8, ptr @gv, i32 %arg0
-  %add1 = getelementptr i8, ptr @gv, i32 %arg1
-  %add2 = getelementptr i8, ptr @gv, i32 %arg2
-  %add3 = getelementptr i8, ptr @gv, i32 %arg3
-  %add4 = getelementptr i8, ptr @gv, i32 %arg4
-  ret ptr %add4
+  %add0 = add i32 %arg0, 0
+  %add1 = add i32 %add0, 0
+  %add2 = add i32 %add1, 0
+  %add3 = add i32 %arg1, 0
+  %add4 = add i32 %add2, %add3
+  ret i32 %add4
 }
 
 ; INTERESTING: uselistorder
 ; RESULT: uselistorder
-uselistorder ptr @gv, { 3, 2, 4, 1, 0 }
+uselistorder i32 0, { 3, 2, 1, 0 }
diff --git a/llvm/test/tools/llvm-reduce/uselistorder-invalid-ir-output.ll b/llvm/test/tools/llvm-reduce/uselistorder-invalid-ir-output.ll
index 0e9c321..4bc862b 100644
--- a/llvm/test/tools/llvm-reduce/uselistorder-invalid-ir-output.ll
+++ b/llvm/test/tools/llvm-reduce/uselistorder-invalid-ir-output.ll
@@ -7,11 +7,10 @@
 ; RUN:   --test-arg %s
 
 ; Check if the final output really parses
-; RUN: llvm-as -o /dev/null %t.reduced.ll
+; RUN: not llvm-as -o /dev/null %t.reduced.ll
 ; RUN: FileCheck --check-prefix=RESULT %s < %t.reduced.ll
 
 
-; RESULT-LABEL: define void @kernel_ocl_path_trace_direct_lighting(
 define void @kernel_ocl_path_trace_direct_lighting(i1 %cond.i, i1 %cmp5.i.i, i32 %arg) {
 ; INTERESTING: entry:
 ; INTERESTING: 0
@@ -49,5 +48,4 @@ kernel_direct_lighting.exit:
   ret void
 }
 
-; FIXME: Should probably fix test to use a global address
-; RESULT-NOT: uselistorder
+; RESULT: uselistorder i32 0, { 4, 0, 5, 1, 6, 2, 7, 3 }
diff --git a/llvm/tools/verify-uselistorder/verify-uselistorder.cpp b/llvm/tools/verify-uselistorder/verify-uselistorder.cpp
index be8ab47..c2810b9 100644
--- a/llvm/tools/verify-uselistorder/verify-uselistorder.cpp
+++ b/llvm/tools/verify-uselistorder/verify-uselistorder.cpp
@@ -245,9 +245,6 @@ ValueMapping::ValueMapping(const Module &M) {
 }
 
 void ValueMapping::map(const Value *V) {
-  if (!V->hasUseList())
-    return;
-
   if (IDs.lookup(V))
     return;
 
@@ -398,9 +395,6 @@ static void verifyUseListOrder(const Module &M) {
 
 static void shuffleValueUseLists(Value *V, std::minstd_rand0 &Gen,
                                  DenseSet<Value *> &Seen) {
-  if (!V->hasUseList())
-    return;
-
   if (!Seen.insert(V).second)
     return;
 
@@ -443,9 +437,6 @@ static void shuffleValueUseLists(Value *V, std::minstd_rand0 &Gen,
 }
 
 static void reverseValueUseLists(Value *V, DenseSet<Value *> &Seen) {
-  if (!V->hasUseList())
-    return;
-
   if (!Seen.insert(V).second)
     return;
 
diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp
index a4c0455..b9d519a 100644
--- a/llvm/unittests/ADT/DenseMapTest.cpp
+++ b/llvm/unittests/ADT/DenseMapTest.cpp
@@ -10,6 +10,7 @@
 #include "CountCopyAndMove.h"
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/DenseMapInfoVariant.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -359,6 +360,51 @@ TYPED_TEST(DenseMapTest, ConstIteratorTest) {
   EXPECT_TRUE(cit == cit2);
 }
 
+TYPED_TEST(DenseMapTest, KeysValuesIterator) {
+  SmallSet<typename TypeParam::key_type, 10> Keys;
+  SmallSet<typename TypeParam::mapped_type, 10> Values;
+  for (int I = 0; I < 10; ++I) {
+    auto K = this->getKey(I);
+    auto V = this->getValue(I);
+    Keys.insert(K);
+    Values.insert(V);
+    this->Map[K] = V;
+  }
+
+  SmallSet<typename TypeParam::key_type, 10> ActualKeys;
+  SmallSet<typename TypeParam::mapped_type, 10> ActualValues;
+  for (auto K : this->Map.keys())
+    ActualKeys.insert(K);
+  for (auto V : this->Map.values())
+    ActualValues.insert(V);
+
+  EXPECT_EQ(Keys, ActualKeys);
+  EXPECT_EQ(Values, ActualValues);
+}
+
+TYPED_TEST(DenseMapTest, ConstKeysValuesIterator) {
+  SmallSet<typename TypeParam::key_type, 10> Keys;
+  SmallSet<typename TypeParam::mapped_type, 10> Values;
+  for (int I = 0; I < 10; ++I) {
+    auto K = this->getKey(I);
+    auto V = this->getValue(I);
+    Keys.insert(K);
+    Values.insert(V);
+    this->Map[K] = V;
+  }
+
+  const TypeParam &ConstMap = this->Map;
+  SmallSet<typename TypeParam::key_type, 10> ActualKeys;
+  SmallSet<typename TypeParam::mapped_type, 10> ActualValues;
+  for (auto K : ConstMap.keys())
+    ActualKeys.insert(K);
+  for (auto V : ConstMap.values())
+    ActualValues.insert(V);
+
+  EXPECT_EQ(Keys, ActualKeys);
+  EXPECT_EQ(Values, ActualValues);
+}
+
 // Test initializer list construction.
 TEST(DenseMapCustomTest, InitializerList) {
   DenseMap<int, int> M({{0, 0}, {0, 1}, {1, 2}});
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 285f342..e1baa38 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -1618,6 +1618,62 @@ TEST_F(ComputeKnownFPClassTest, FMulNoZero) {
   expectKnownFPClass(fcAllFlags, std::nullopt, A7);
 }
 
+TEST_F(ComputeKnownFPClassTest, MinimumNumSignBit) {
+  parseAssembly(
+      R"(
+      define float @test(
+          float %unknown,
+          float nofpclass(nan) %nnan,
+          float nofpclass(nan pinf pnorm psub pzero) %nnan.nopos,
+          float nofpclass(nan ninf nnorm nsub nzero) %nnan.noneg,
+          float nofpclass(ninf nnorm nsub nzero) %noneg,
+          float nofpclass(pinf pnorm psub pzero) %nopos) {
+        %A = call float @llvm.minimumnum.f32(float %nnan.nopos, float %unknown)
+        %A2 = call float @llvm.minimumnum.f32(float %unknown, float %nnan.nopos)
+        %A3 = call float @llvm.minimumnum.f32(float %nnan.noneg, float %unknown)
+        %A4 = call float @llvm.minimumnum.f32(float %unknown, float %nnan.noneg)
+        %A5 = call float @llvm.minimumnum.f32(float %nnan.nopos, float %nnan.noneg)
+        %A6 = call float @llvm.minimumnum.f32(float %nopos, float %nnan.noneg)
+        %A7 = call float @llvm.minimumnum.f32(float %nnan.nopos, float %noneg)
+        ret float %A
+      })");
+  expectKnownFPClass(fcNegative, true, A);
+  expectKnownFPClass(fcNegative, true, A2);
+  expectKnownFPClass(~fcNan, std::nullopt, A3);
+  expectKnownFPClass(~fcNan, std::nullopt, A4);
+  expectKnownFPClass(fcNegative, true, A5);
+  expectKnownFPClass(~fcNan, std::nullopt, A6);
+  expectKnownFPClass(fcNegative, true, A7);
+}
+
+TEST_F(ComputeKnownFPClassTest, MaximumNumSignBit) {
+  parseAssembly(
+      R"(
+    define float @test(
+        float %unknown,
+        float nofpclass(nan) %nnan,
+        float nofpclass(nan pinf pnorm psub pzero) %nnan.nopos,
+        float nofpclass(nan ninf nnorm nsub nzero) %nnan.noneg,
+        float nofpclass(ninf nnorm nsub nzero) %noneg,
+        float nofpclass(pinf pnorm psub pzero) %nopos) {
+      %A = call float @llvm.maximumnum.f32(float %nnan.noneg, float %unknown)
+      %A2 = call float @llvm.maximumnum.f32(float %unknown, float %nnan.noneg)
+      %A3 = call float @llvm.maximumnum.f32(float %nnan.nopos, float %unknown)
+      %A4 = call float @llvm.maximumnum.f32(float %unknown, float %nnan.nopos)
+      %A5 = call float @llvm.maximumnum.f32(float %nnan.noneg, float %nnan.nopos)
+      %A6 = call float @llvm.maximumnum.f32(float %noneg, float %nnan.nopos)
+      %A7 = call float @llvm.maximumnum.f32(float %nnan.noneg, float %nopos)
+      ret float %A
+    })");
+  expectKnownFPClass(fcPositive, false, A);
+  expectKnownFPClass(fcPositive, false, A2);
+  expectKnownFPClass(~fcNan, std::nullopt, A3);
+  expectKnownFPClass(~fcNan, std::nullopt, A4);
+  expectKnownFPClass(fcPositive, false, A5);
+  expectKnownFPClass(~fcNan, std::nullopt, A6);
+  expectKnownFPClass(fcPositive, false, A7);
+}
+
 TEST_F(ComputeKnownFPClassTest, Phi) {
   parseAssembly(
       "define float @test(i1 %cond, float nofpclass(nan inf) %arg0, float nofpclass(nan) %arg1) {\n"
diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp
index 41cc212..a46178a 100644
--- a/llvm/unittests/IR/ConstantsTest.cpp
+++ b/llvm/unittests/IR/ConstantsTest.cpp
@@ -21,44 +21,6 @@
 namespace llvm {
 namespace {
 
-// Check that use count checks treat ConstantData like they have no uses.
-TEST(ConstantsTest, UseCounts) {
-  LLVMContext Context;
-  Type *Int32Ty = Type::getInt32Ty(Context);
-  Constant *Zero = ConstantInt::get(Int32Ty, 0);
-
-  EXPECT_TRUE(Zero->use_empty());
-  EXPECT_EQ(Zero->getNumUses(), 0u);
-  EXPECT_TRUE(Zero->hasNUses(0));
-  EXPECT_FALSE(Zero->hasOneUse());
-  EXPECT_FALSE(Zero->hasOneUser());
-  EXPECT_FALSE(Zero->hasNUses(1));
-  EXPECT_FALSE(Zero->hasNUsesOrMore(1));
-  EXPECT_FALSE(Zero->hasNUses(2));
-  EXPECT_FALSE(Zero->hasNUsesOrMore(2));
-
-  std::unique_ptr<Module> M(new Module("MyModule", Context));
-
-  // Introduce some uses
-  new GlobalVariable(*M, Int32Ty, /*isConstant=*/false,
-                     GlobalValue::ExternalLinkage, /*Initializer=*/Zero,
-                     "gv_user0");
-  new GlobalVariable(*M, Int32Ty, /*isConstant=*/false,
-                     GlobalValue::ExternalLinkage, /*Initializer=*/Zero,
-                     "gv_user1");
-
-  // Still looks like use_empty with uses.
-  EXPECT_TRUE(Zero->use_empty());
-  EXPECT_EQ(Zero->getNumUses(), 0u);
-  EXPECT_TRUE(Zero->hasNUses(0));
-  EXPECT_FALSE(Zero->hasOneUse());
-  EXPECT_FALSE(Zero->hasOneUser());
-  EXPECT_FALSE(Zero->hasNUses(1));
-  EXPECT_FALSE(Zero->hasNUsesOrMore(1));
-  EXPECT_FALSE(Zero->hasNUses(2));
-  EXPECT_FALSE(Zero->hasNUsesOrMore(2));
-}
-
 TEST(ConstantsTest, Integer_i1) {
   LLVMContext Context;
   IntegerType *Int1 = IntegerType::get(Context, 1);
diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp
index e409a3b..bbd12e6 100644
--- a/llvm/unittests/TargetParser/TripleTest.cpp
+++ b/llvm/unittests/TargetParser/TripleTest.cpp
@@ -1354,6 +1354,24 @@ TEST(TripleTest, ParsedIDs) {
   EXPECT_EQ(Triple::UnknownOS, T.getOS());
   EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment());
 
+  T = Triple("aarch64-unknown-managarm-mlibc");
+  EXPECT_EQ(Triple::aarch64, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::Managarm, T.getOS());
+  EXPECT_EQ(Triple::Mlibc, T.getEnvironment());
+
+  T = Triple("x86_64-unknown-managarm-mlibc");
+  EXPECT_EQ(Triple::x86_64, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::Managarm, T.getOS());
+  EXPECT_EQ(Triple::Mlibc, T.getEnvironment());
+
+  T = Triple("riscv64-unknown-managarm-mlibc");
+  EXPECT_EQ(Triple::riscv64, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::Managarm, T.getOS());
+  EXPECT_EQ(Triple::Mlibc, T.getEnvironment());
+
   T = Triple("huh");
   EXPECT_EQ(Triple::UnknownArch, T.getArch());
 }
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.h b/llvm/utils/TableGen/Common/CodeGenTarget.h
index 682cc4e..da2f3e0 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.h
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.h
@@ -252,7 +252,7 @@ public:
   const Record *getValueType() const { return Ty; }
   unsigned getNumOperands() const { return NumOperands; }
   const std::string &getSelectFunc() const { return SelectFunc; }
-  const ArrayRef<const Record *> getRootNodes() const { return RootNodes; }
+  ArrayRef<const Record *> getRootNodes() const { return RootNodes; }
   bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); }
   unsigned getComplexity() const { return Complexity; }
   bool wantsRoot() const { return WantsRoot; }
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 41b01a1..ca56403 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2216,6 +2216,21 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     // values should be integral constants, with the '*' represented as a '-1'.
     void setTileForDeviceTypes(MLIRContext *, llvm::ArrayRef<DeviceType>,
                                mlir::ValueRange);
+
+    // Add a value to the 'vector' list with a current list of device_types.
+    void addVectorOperand(MLIRContext *, mlir::Value,
+                          llvm::ArrayRef<DeviceType>);
+    // Add an empty value to the 'vector' list with a current list of
+    // device_types. This is for the case where there is no expression specified
+    // in a 'vector'.
+    void addEmptyVector(MLIRContext *, llvm::ArrayRef<DeviceType>);
+    // Add a value to the 'worker' list with a current list of device_types.
+    void addWorkerNumOperand(MLIRContext *, mlir::Value,
+                             llvm::ArrayRef<DeviceType>);
+    // Add an empty value to the 'worker' list with a current list of
+    // device_types. This is for the case where there is no expression specified
+    // in a 'worker'.
+    void addEmptyWorker(MLIRContext *, llvm::ArrayRef<DeviceType>);
   }];
 
   let hasCustomAssemblyFormat = 1;
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
index fca2629..d0a3f01 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
@@ -122,7 +122,7 @@ public:
     unsigned m = llvm::countr_zero(storage);
     return m == 64 ? -1 : m;
   }
-  unsigned max() const { return 64 - llvm::countl_zero(storage); }
+  unsigned max() const { return llvm::bit_width(storage); }
   unsigned count() const { return llvm::popcount(storage); }
   bool empty() const { return storage == 0; }
 };
diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td
index b72f0df..771de01 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.td
+++ b/mlir/include/mlir/IR/BuiltinTypes.td
@@ -807,6 +807,11 @@ def Builtin_MemRef : Builtin_Type<"MemRef", "memref", [
       "ArrayRef<int64_t>":$shape, "Type":$elementType,
       CArg<"AffineMap">:$map,
       CArg<"Attribute", "{}">:$memorySpace)>,
+    /// [deprecated] `Attribute`-based form should be used instead.
+    TypeBuilderWithInferredContext<(ins
+      "ArrayRef<int64_t>":$shape, "Type":$elementType,
+      "AffineMap":$map,
+      "unsigned":$memorySpaceInd)>
   ];
   let extraClassDeclaration = [{
     using BaseMemRefType::clone;
@@ -1175,6 +1180,14 @@ def Builtin_UnrankedMemRef : Builtin_Type<"UnrankedMemRef", "unranked_memref", [
       Attribute nonDefaultMemorySpace = skipDefaultMemorySpace(memorySpace);
       return $_get(elementType.getContext(), elementType, nonDefaultMemorySpace);
     }]>,
+    /// [deprecated] `Attribute`-based form should be used instead.
+    TypeBuilderWithInferredContext<(ins "Type":$elementType,
+                                        "unsigned":$memorySpace), [{
+      // Convert deprecated integer-like memory space to Attribute.
+      Attribute memorySpaceAttr =
+          wrapIntegerMemorySpace(memorySpace, elementType.getContext());
+      return UnrankedMemRefType::get(elementType, memorySpaceAttr);
+    }]>
   ];
   let extraClassDeclaration = [{
     using BaseMemRefType::clone;
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index df1753a..0d4ba39 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -2097,12 +2097,10 @@ static LogicalResult generateCopy(
   // Check if a buffer was already created.
   bool existingBuf = fastBufferMap.count(memref) > 0;
   if (!existingBuf) {
-    Attribute fastMemorySpace;
-    if (copyOptions.fastMemorySpace != 0)
-      fastMemorySpace = prologue.getI64IntegerAttr(copyOptions.fastMemorySpace);
+    AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
     auto fastMemRefType =
         MemRefType::get(fastBufferShape, memRefType.getElementType(),
-                        MemRefLayoutAttrInterface{}, fastMemorySpace);
+                        fastBufferLayout, copyOptions.fastMemorySpace);
 
     // Create the fast memory space buffer just before the 'affine.for'
     // operation.
@@ -2177,12 +2175,8 @@ static LogicalResult generateCopy(
   } else {
     // DMA generation.
     // Create a tag (single element 1-d memref) for the DMA.
-    Attribute tagMemorySpace;
-    if (copyOptions.tagMemorySpace != 0)
-      tagMemorySpace = prologue.getI64IntegerAttr(copyOptions.tagMemorySpace);
-    auto tagMemRefType =
-        MemRefType::get({1}, top.getIntegerType(32),
-                        MemRefLayoutAttrInterface{}, tagMemorySpace);
+    auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
+                                         copyOptions.tagMemorySpace);
     auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
 
     SmallVector<Value, 4> tagIndices({zeroIndex});
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
index f2a64f5..26904f1 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
@@ -298,20 +298,42 @@ getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo,
   return std::make_tuple(packedOperand, indexingMap);
 }
 
-/// Pack a genericOp and return it.
+/// This function is a helper subroutine to pack a genericOp and return it. It
+/// will create a new generic op with the packed operand and the packed output
+/// according to packInfo when we attempt to push down unpack or bubble up pack
+/// around it. Implicitly this will only work when a packInfo can be obtained.
+/// This make sure that we are only using this function on parallel permuted
+/// dimensions.
 static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp,
                                Value dest, AffineMap packedOutIndexingMap,
-                               const PackInfo &packInfo) {
+                               const PackInfo &packInfo,
+                               bool isFoldableUnpackPack) {
   Location loc = genericOp.getLoc();
   SmallVector<Value> inputOperands;
+  SmallVector<Value> inputOperandsFromUnpackedSource;
   SmallVector<AffineMap> indexingMaps;
   for (OpOperand *inputOperand : genericOp.getDpsInputOperands()) {
     auto [packedOperand, packedIndexingMap] = getOrCreatePackedViewOfOperand(
         rewriter, loc, packInfo, genericOp, inputOperand);
+    if (auto unpackOp = inputOperand->get().getDefiningOp<linalg::UnPackOp>()) {
+      inputOperandsFromUnpackedSource.push_back(unpackOp.getSource());
+    } else {
+      inputOperandsFromUnpackedSource.push_back(packedOperand);
+    }
     inputOperands.push_back(packedOperand);
     indexingMaps.push_back(packedIndexingMap);
   }
 
+  // If the pack and unpack op can be folded:
+  // 1) use unpack op source op for operand to fold unpack -> pack sequence.
+  // 2) init tensor of the generic op can be replaced by the destination of the
+  // pack op.
+  if (isFoldableUnpackPack) {
+    inputOperands = inputOperandsFromUnpackedSource;
+    if (auto destPack = dest.getDefiningOp<linalg::PackOp>())
+      dest = destPack.getDest();
+  }
+
   int64_t numInnerLoops = packInfo.getNumTiledLoops();
   SmallVector<utils::IteratorType> iterTypes =
       genericOp.getIteratorTypesArray();
@@ -447,8 +469,10 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, linalg::PackOp packOp,
                             .getDefiningOp<tensor::EmptyOp>()) {
     dest = packOpDest;
   }
+  // pack(unpack) isn't naively foldable because the unpack op can be from
+  // an arbitrary domain so we need to keep both.
   return packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap,
-                       *packInfo);
+                       *packInfo, /*isFoldableUnpackPack=*/false);
 }
 
 /// Wrapper pattern that applies bubbleUpPackOpThroughGenericOp method.
@@ -1085,8 +1109,12 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp,
   }
 
   // Pack the genericOp.
+  // pack(unpack) is foldable in this case. This is because in pushing down the
+  // unpack, by default we will populate an additional pack op after the unpack.
+  // This guarantees them to be foldable.
   GenericOp newGenericOp =
-      packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap, *packInfo);
+      packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap, *packInfo,
+                    /*isFoldableUnpackPack=*/true);
   Value newResult =
       newGenericOp.getTiedOpResult(newGenericOp.getDpsInitOperand(0));
 
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index f26b3a5..9f4645a 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2720,6 +2720,34 @@ void acc::LoopOp::setTileForDeviceTypes(
   setTileOperandsSegments(segments);
 }
 
+void acc::LoopOp::addVectorOperand(
+    MLIRContext *context, mlir::Value newValue,
+    llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+  setVectorOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper(
+      context, getVectorOperandsDeviceTypeAttr(), effectiveDeviceTypes,
+      newValue, getVectorOperandsMutable()));
+}
+
+void acc::LoopOp::addEmptyVector(
+    MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+  setVectorAttr(addDeviceTypeAffectedOperandHelper(context, getVectorAttr(),
+                                                   effectiveDeviceTypes));
+}
+
+void acc::LoopOp::addWorkerNumOperand(
+    MLIRContext *context, mlir::Value newValue,
+    llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+  setWorkerNumOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper(
+      context, getWorkerNumOperandsDeviceTypeAttr(), effectiveDeviceTypes,
+      newValue, getWorkerNumOperandsMutable()));
+}
+
+void acc::LoopOp::addEmptyWorker(
+    MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+  setWorkerAttr(addDeviceTypeAffectedOperandHelper(context, getWorkerAttr(),
+                                                   effectiveDeviceTypes));
+}
+
 //===----------------------------------------------------------------------===//
 // DataOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index e202bb7..d47e360 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -484,6 +484,14 @@ bool mlir::detail::isSupportedMemorySpace(Attribute memorySpace) {
   return false;
 }
 
+Attribute mlir::detail::wrapIntegerMemorySpace(unsigned memorySpace,
+                                               MLIRContext *ctx) {
+  if (memorySpace == 0)
+    return nullptr;
+
+  return IntegerAttr::get(IntegerType::get(ctx, 64), memorySpace);
+}
+
 Attribute mlir::detail::skipDefaultMemorySpace(Attribute memorySpace) {
   IntegerAttr intMemorySpace = llvm::dyn_cast_or_null<IntegerAttr>(memorySpace);
   if (intMemorySpace && intMemorySpace.getValue() == 0)
@@ -575,6 +583,46 @@ MemRefType::getChecked(function_ref<InFlightDiagnostic()> emitErrorFn,
                           elementType, layout, memorySpace);
 }
 
+MemRefType MemRefType::get(ArrayRef<int64_t> shape, Type elementType,
+                           AffineMap map, unsigned memorySpaceInd) {
+
+  // Use default layout for empty map.
+  if (!map)
+    map = AffineMap::getMultiDimIdentityMap(shape.size(),
+                                            elementType.getContext());
+
+  // Wrap AffineMap into Attribute.
+  auto layout = AffineMapAttr::get(map);
+
+  // Convert deprecated integer-like memory space to Attribute.
+  Attribute memorySpace =
+      wrapIntegerMemorySpace(memorySpaceInd, elementType.getContext());
+
+  return Base::get(elementType.getContext(), shape, elementType, layout,
+                   memorySpace);
+}
+
+MemRefType
+MemRefType::getChecked(function_ref<InFlightDiagnostic()> emitErrorFn,
+                       ArrayRef<int64_t> shape, Type elementType, AffineMap map,
+                       unsigned memorySpaceInd) {
+
+  // Use default layout for empty map.
+  if (!map)
+    map = AffineMap::getMultiDimIdentityMap(shape.size(),
+                                            elementType.getContext());
+
+  // Wrap AffineMap into Attribute.
+  auto layout = AffineMapAttr::get(map);
+
+  // Convert deprecated integer-like memory space to Attribute.
+  Attribute memorySpace =
+      wrapIntegerMemorySpace(memorySpaceInd, elementType.getContext());
+
+  return Base::getChecked(emitErrorFn, elementType.getContext(), shape,
+                          elementType, layout, memorySpace);
+}
+
 LogicalResult MemRefType::verify(function_ref<InFlightDiagnostic()> emitError,
                                  ArrayRef<int64_t> shape, Type elementType,
                                  MemRefLayoutAttrInterface layout,
diff --git a/mlir/lib/IR/TypeDetail.h b/mlir/lib/IR/TypeDetail.h
index 938cd9f..1d65fcc 100644
--- a/mlir/lib/IR/TypeDetail.h
+++ b/mlir/lib/IR/TypeDetail.h
@@ -140,6 +140,9 @@ struct TupleTypeStorage final
 /// Checks if the memorySpace has supported Attribute type.
 bool isSupportedMemorySpace(Attribute memorySpace);
 
+/// Wraps deprecated integer memory space to the new Attribute form.
+Attribute wrapIntegerMemorySpace(unsigned memorySpace, MLIRContext *ctx);
+
 /// Replaces default memorySpace (integer == `0`) with empty Attribute.
 Attribute skipDefaultMemorySpace(Attribute memorySpace);
 
diff --git a/mlir/lib/TableGen/Pattern.cpp b/mlir/lib/TableGen/Pattern.cpp
index d83df3e..ab60539 100644
--- a/mlir/lib/TableGen/Pattern.cpp
+++ b/mlir/lib/TableGen/Pattern.cpp
@@ -303,6 +303,12 @@ std::string SymbolInfoMap::SymbolInfo::getValueAndRangeUse(
   case Kind::Operand: {
     assert(index < 0);
     auto *operand = cast<NamedTypeConstraint *>(op->getArg(getArgIndex()));
+    if (operand->isOptional()) {
+      auto repl =
+          formatv(fmt, formatv("({0}.empty() ? Value() : *{0}.begin())", name));
+      LLVM_DEBUG(dbgs() << repl << " (OptionalOperand)\n");
+      return std::string(repl);
+    }
     // If this operand is variadic and this SymbolInfo doesn't have a range
     // index, then return the full variadic operand_range. Otherwise, return
     // the value itself.
diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
index 19d4524..63f068d 100644
--- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
+++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
@@ -455,13 +455,10 @@ func.func @unpack_on_output(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56
 // CHECK:         %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]]
 // CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:      into %[[ARG0_EMPTY_UNPACK]]
-// CHECK:         %[[ARG0_EMPTY_PACK:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32>
-// CHECK:         %[[PACKED_ARG0:.+]] = linalg.pack %[[UNPACKED_ARG0]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
-// CHECK-SAME:      into %[[ARG0_EMPTY_PACK]]
+// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32>
 // CHECK:         %[[RES:.+]] = linalg.generic
 // CHECK-SAME:      indexing_maps = [#[[$MAP]]]
-// CHECK-SAME:      outs(%[[PACKED_ARG0]]
+// CHECK-SAME:      outs(%[[EMPTY]]
 // CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[RES]]
 // CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:      into %[[UNPACKED_ARG0]]
@@ -485,22 +482,11 @@ func.func @unpack_on_input(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56
 // CHECK-LABEL: func.func @unpack_on_input
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]
-// CHECK:         %[[ARG0_UNPACK_EMPTY:.+]] = tensor.empty() : tensor<12x56x56x64xf32>
-// CHECK:         %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
-// CHECK-SAME:      into %[[ARG0_UNPACK_EMPTY]]
-// CHECK:         %[[ARG1_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32>
-// CHECK:         %[[ARG1_PACK:.+]] = linalg.pack %[[ARG1]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
-// CHECK-SAME:      into %[[ARG1_PACK_EMPTY]]
-// CHECK:         %[[ARG0_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32>
-// CHECK:         %[[ARG0_PACK:.+]] = linalg.pack %[[UNPACKED_ARG0]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
-// CHECK-SAME:      into %[[ARG0_PACK_EMPTY]]
+// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32>
 // CHECK:         %[[RES:.+]] = linalg.generic
 // CHECK-SAME:      indexing_maps = [#[[$MAP]], #[[$MAP]]]
-// CHECK-SAME:      ins(%[[ARG0_PACK]]
-// CHECK-SAME:      outs(%[[ARG1_PACK]]
+// CHECK-SAME:      ins(%[[ARG0]]
+// CHECK-SAME:      outs(%[[EMPTY]]
 // CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[RES]]
 // CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:      into %[[ARG1]]
@@ -524,22 +510,11 @@ func.func @unpack_element_type_change(%arg0: tensor<12x2x56x56x32xf32>, %init: t
 // CHECK-LABEL: func.func @unpack_element_type_change
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]
-// CHECK:         %[[ARG0_UNPACK_EMPTY:.+]] = tensor.empty() : tensor<12x56x56x64xf32>
-// CHECK:         %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
-// CHECK-SAME:      into %[[ARG0_UNPACK_EMPTY]]
-// CHECK:         %[[ARG1_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf16>
-// CHECK:         %[[ARG1_PACK:.+]] = linalg.pack %[[ARG1]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
-// CHECK-SAME:      into %[[ARG1_PACK_EMPTY]]
-// CHECK:         %[[ARG0_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32>
-// CHECK:         %[[ARG0_PACK:.+]] = linalg.pack %[[UNPACKED_ARG0]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
-// CHECK-SAME:      into %[[ARG0_PACK_EMPTY]]
+// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf16>
 // CHECK:         %[[RES:.+]] = linalg.generic
 // CHECK-SAME:      indexing_maps = [#[[$MAP]], #[[$MAP]]]
-// CHECK-SAME:      ins(%[[ARG0_PACK]]
-// CHECK-SAME:      outs(%[[ARG1_PACK]]
+// CHECK-SAME:      ins(%[[ARG0]]
+// CHECK-SAME:      outs(%[[EMPTY]]
 // CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[RES]]
 // CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:      into %[[ARG1]]
@@ -564,19 +539,11 @@ func.func @forward_tensor_empty(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x5
 // CHECK-LABEL: func.func @forward_tensor_empty
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]
 // CHECK:         %[[FINAL_RES:.+]] = tensor.empty() : tensor<12x56x56x64xf32>
-// CHECK:         %[[ARG0_UNPACK_EMPTY:.+]] = tensor.empty() : tensor<12x56x56x64xf32>
-// CHECK:         %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
-// CHECK-SAME:      into %[[ARG0_UNPACK_EMPTY]]
-// CHECK:         %[[DEST:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32>
-// CHECK:         %[[ARG0_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32>
-// CHECK:         %[[PACKED_ARG0:.+]] = linalg.pack %[[UNPACKED_ARG0]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
-// CHECK-SAME:      into %[[ARG0_PACK_EMPTY]]
+// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32>
 // CHECK:         %[[RES:.+]] = linalg.generic
 // CHECK-SAME:      indexing_maps = [#[[$MAP]], #[[$MAP]]]
-// CHECK-SAME:      ins(%[[PACKED_ARG0]]
-// CHECK-SAME:      outs(%[[DEST]]
+// CHECK-SAME:      ins(%[[ARG0]]
+// CHECK-SAME:      outs(%[[EMPTY]]
 // CHECK:         %[[UNPACKED:.+]] = linalg.unpack %[[RES]]
 // CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:      into %[[FINAL_RES]]
@@ -810,12 +777,9 @@ func.func @unpack_empty_inner_dims(%arg0: tensor<12x64x56x56xf32>) -> tensor<12x
 }
 
 // CHECK-LABEL: func.func @unpack_empty_inner_dims
-// CHECK:         %[[UNPACKED_ARG0:.+]] = linalg.unpack
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = []
-// CHECK:         %[[PACKED_ARG0:.+]] = linalg.pack %[[UNPACKED_ARG0]]
-// CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = []
+// CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]: tensor<12x64x56x56xf32>)
 // CHECK:         %[[RES:.+]] = linalg.generic
-// CHECK-SAME:      ins(%[[PACKED_ARG0]]
+// CHECK-SAME:      ins(%[[ARG0]]
 // CHECK:         %[[UNPACKED:.+]] = linalg.unpack %[[RES]]
 // CHECK-SAME:      outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = []
 
@@ -943,14 +907,10 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]
 // CHECK:         %[[FINAL_RES:.+]] = tensor.empty() : tensor<16x540x960xi32>
 // CHECK:         %[[INIT:.+]] = tensor.empty() : tensor<1x540x960x16xi32>
-// CHECK:         %[[PACK_EMPTY:.+]] = tensor.empty() : tensor<1x1x1080x1920x16xi32>
-// CHECK:         %[[PACK_ARG0:.+]] = linalg.pack
-// CHECK-SAME:      inner_dims_pos = [1] inner_tiles = [16]
-// CHECK-SAME:      into %[[PACK_EMPTY]]
 // CHECK:         %[[POOL:.+]] = linalg.generic
 // CHECK-SAME:      indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]]
 // CHECK-SAME:      iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "parallel"]
-// CHECK-SAME:      ins(%[[PACK_ARG0]], %[[ARG1]]
+// CHECK-SAME:      ins(%[[ARG0]], %[[ARG1]]
 // CHECK-SAME:      outs(%[[INIT]]
 // CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[POOL]]
 // CHECK-SAME:      inner_dims_pos = [0] inner_tiles = [16]
@@ -1421,3 +1381,48 @@ func.func @no_push_down_unpack_through_non_divisible_expand(%5: tensor<384x32x8x
 // CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[ARG0]]
 // CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[UNPACK]] {{\[}}[0, 1], [2]] output_shape [256, 12, 256] : tensor<3072x256xf32> into tensor<256x12x256xf32>
 // CHECK:         return %[[EXPANDED]] : tensor<256x12x256xf32>
+
+// -----
+
+func.func @push_unpack_in_padded_domain_foldable(%arg0: tensor<8x8x4x8xf32>, %dest: tensor<?x64xf32>, %arg1: tensor<?x64xbf16>) -> tensor<?x64xbf16> {
+  %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %dest : tensor<8x8x4x8xf32> -> tensor<?x64xf32>
+  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack : tensor<?x64xf32>) outs(%arg1 : tensor<?x64xbf16>) {
+  ^bb0(%in: f32, %out: bf16):
+    %1 = arith.truncf %in : f32 to bf16
+    linalg.yield %1 : bf16
+  } -> tensor<?x64xbf16>
+  return %0 : tensor<?x64xbf16>
+}
+// CHECK-LABEL: func.func @push_unpack_in_padded_domain_foldable
+// CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[ARG1:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[ARG2:[a-zA-Z0-9]+]]
+// CHECK:         %[[EMPTY:.+]] = tensor.empty
+// CHECK:         %[[GENERIC:.+]] = linalg.generic
+// CHECK-SAME:    ins(%[[ARG0]] : tensor<8x8x4x8xf32>)
+// CHECK-SAME:    outs(%[[EMPTY]] : tensor<?x8x4x8xbf16>)
+// CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[GENERIC]]
+// CHECK-SAME:    into %[[ARG2]]
+// CHECK:         return %[[UNPACK]] : tensor<?x64xbf16>
+
+// -----
+
+func.func @push_unpack_in_padded_domain_out_used(%arg0: tensor<8x8x4x8xf32>, %arg1: tensor<?x64xf32>) -> tensor<?x64xf32> {
+  %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %arg1 : tensor<8x8x4x8xf32> -> tensor<?x64xf32>
+  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack : tensor<?x64xf32>) outs(%arg1 : tensor<?x64xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %1 = arith.addf %in, %out : f32
+    linalg.yield %1 : f32
+  } -> tensor<?x64xf32>
+  return %0 : tensor<?x64xf32>
+}
+// CHECK-LABEL: func.func @push_unpack_in_padded_domain_out_used
+// CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[ARG1:[a-zA-Z0-9]+]]
+// CHECK:         %[[EMPTY:.+]] = tensor.empty
+// CHECK:         %[[GENERIC:.+]] = linalg.generic
+// CHECK-SAME:    ins(%[[ARG0]] : tensor<8x8x4x8xf32>)
+// CHECK-SAME:    outs(%[[EMPTY]] : tensor<?x8x4x8xf32>)
+// CHECK:         %[[UNPACK2:.+]] = linalg.unpack %[[GENERIC]]
+// CHECK-SAME:    into %[[ARG1]]
+// CHECK:         return %[[UNPACK2]] : tensor<?x64xf32>
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index b0a5527..38771f2 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -19,10 +19,10 @@
 // CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)>
 // CHECK-SCF-IF-DAG: #[[$TIMES4:.*]] = affine_map<()[s0] -> (s0 * 4)>
 // CHECK-SCF-IF-DAG: #[[$TIMES8:.*]] = affine_map<()[s0] -> (s0 * 8)>
-// CHECK-SCF-IF-DAG: memref.global "private" @__shared_32xf32 : memref<32xf32, #gpu.address_space<workgroup>>
-// CHECK-SCF-IF-DAG: memref.global "private" @__shared_64xf32 : memref<64xf32, #gpu.address_space<workgroup>>
-// CHECK-SCF-IF-DAG: memref.global "private" @__shared_128xf32 : memref<128xf32, #gpu.address_space<workgroup>>
-// CHECK-SCF-IF-DAG: memref.global "private" @__shared_256xf32 : memref<256xf32, #gpu.address_space<workgroup>>
+// CHECK-SCF-IF-DAG: memref.global "private" @__shared_32xf32 : memref<32xf32, 3>
+// CHECK-SCF-IF-DAG: memref.global "private" @__shared_64xf32 : memref<64xf32, 3>
+// CHECK-SCF-IF-DAG: memref.global "private" @__shared_128xf32 : memref<128xf32, 3>
+// CHECK-SCF-IF-DAG: memref.global "private" @__shared_256xf32 : memref<256xf32, 3>
 
 // CHECK-SCF-IF-LABEL: func @rewrite_warp_op_to_scf_if(
 //  CHECK-SCF-IF-SAME:     %[[laneid:.*]]: index,
@@ -47,8 +47,8 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
   %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
       args(%v0, %v1 : vector<4xf32>, vector<8xf32>) -> (vector<1xf32>, vector<2xf32>) {
     ^bb0(%arg0: vector<128xf32>, %arg1: vector<256xf32>):
-//       CHECK-SCF-IF:     %[[arg1:.*]] = vector.transfer_read %[[buffer_v1]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<256xf32, #gpu.address_space<workgroup>>, vector<256xf32>
-//       CHECK-SCF-IF:     %[[arg0:.*]] = vector.transfer_read %[[buffer_v0]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, #gpu.address_space<workgroup>>, vector<128xf32>
+//       CHECK-SCF-IF:     %[[arg1:.*]] = vector.transfer_read %[[buffer_v1]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<256xf32, 3>, vector<256xf32>
+//       CHECK-SCF-IF:     %[[arg0:.*]] = vector.transfer_read %[[buffer_v0]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, 3>, vector<128xf32>
 //       CHECK-SCF-IF:     %[[def_0:.*]] = "some_def"(%[[arg0]]) : (vector<128xf32>) -> vector<32xf32>
 //       CHECK-SCF-IF:     %[[def_1:.*]] = "some_def"(%[[arg1]]) : (vector<256xf32>) -> vector<64xf32>
     %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32>
@@ -60,8 +60,8 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
 //       CHECK-SCF-IF:   }
 //       CHECK-SCF-IF:   gpu.barrier
 //       CHECK-SCF-IF:   %[[o1:.*]] = affine.apply #[[$TIMES2]]()[%[[laneid]]]
-//       CHECK-SCF-IF:   %[[r1:.*]] = vector.transfer_read %[[buffer_def_1]][%[[o1]]], %{{.*}} {in_bounds = [true]} : memref<64xf32, #gpu.address_space<workgroup>>, vector<2xf32>
-//       CHECK-SCF-IF:   %[[r0:.*]] = vector.transfer_read %[[buffer_def_0]][%[[laneid]]], %{{.*}} {in_bounds = [true]} : memref<32xf32, #gpu.address_space<workgroup>>, vector<1xf32>
+//       CHECK-SCF-IF:   %[[r1:.*]] = vector.transfer_read %[[buffer_def_1]][%[[o1]]], %{{.*}} {in_bounds = [true]} : memref<64xf32, 3>, vector<2xf32>
+//       CHECK-SCF-IF:   %[[r0:.*]] = vector.transfer_read %[[buffer_def_0]][%[[laneid]]], %{{.*}} {in_bounds = [true]} : memref<32xf32, 3>, vector<1xf32>
 //       CHECK-SCF-IF:   "some_use"(%[[r0]]) : (vector<1xf32>) -> ()
 //       CHECK-SCF-IF:   "some_use"(%[[r1]]) : (vector<2xf32>) -> ()
   "some_use"(%r#0) : (vector<1xf32>) -> ()
@@ -1065,18 +1065,18 @@ func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: v
       args(%s0, %v0, %v1, %v2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) -> (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) {
     ^bb0(%bs0: f32, %bv0: vector<f32>, %bv1: vector<1xf32>, %bv2: vector<1x1xf32>):
 
-      // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, #gpu.address_space<workgroup>>, vector<1x1xf32>
-      // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, #gpu.address_space<workgroup>>, vector<1xf32>
-      // CHECK-SCF-IF: vector.transfer_read {{.*}}[]{{.*}} : memref<f32, #gpu.address_space<workgroup>>, vector<f32>
-      // CHECK-SCF-IF: memref.load {{.*}}[%[[C0]]] : memref<1xf32, #gpu.address_space<workgroup>>
+      // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
+      // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
+      // CHECK-SCF-IF: vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
+      // CHECK-SCF-IF: memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>
       // CHECK-SCF-IF: "some_def_0"(%{{.*}}) : (f32) -> f32
       // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<f32>) -> vector<f32>
       // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32>
       // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1x1xf32>) -> vector<1x1xf32>
-      // CHECK-SCF-IF: memref.store {{.*}}[%[[C0]]] : memref<1xf32, #gpu.address_space<workgroup>>
-      // CHECK-SCF-IF: vector.transfer_write {{.*}}[] : vector<f32>, memref<f32, #gpu.address_space<workgroup>>
-      // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]]] {in_bounds = [true]} : vector<1xf32>, memref<1xf32, #gpu.address_space<workgroup>>
-      // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<1x1xf32>, memref<1x1xf32, #gpu.address_space<workgroup>>
+      // CHECK-SCF-IF: memref.store {{.*}}[%[[C0]]] : memref<1xf32, 3>
+      // CHECK-SCF-IF: vector.transfer_write {{.*}}[] : vector<f32>, memref<f32, 3>
+      // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]]] {in_bounds = [true]} : vector<1xf32>, memref<1xf32, 3>
+      // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<1x1xf32>, memref<1x1xf32, 3>
 
       %rs0 = "some_def_0"(%bs0) : (f32) -> f32
       %rv0 = "some_def_1"(%bv0) : (vector<f32>) -> vector<f32>
@@ -1088,10 +1088,10 @@ func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: v
   }
 
   // CHECK-SCF-IF: gpu.barrier
-  // CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, #gpu.address_space<workgroup>>, vector<1x1xf32>
-  // CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, #gpu.address_space<workgroup>>, vector<1xf32>
-  // CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref<f32, #gpu.address_space<workgroup>>, vector<f32>
-  // CHECK-SCF-IF: %[[RS0:.*]] = memref.load {{.*}}[%[[C0]]] : memref<1xf32, #gpu.address_space<workgroup>>
+  // CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
+  // CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
+  // CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
+  // CHECK-SCF-IF: %[[RS0:.*]] = memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>
   // CHECK-SCF-IF: return %[[RS0]], %[[RV0]], %[[RV1]], %[[RV2]] : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
   return %r#0, %r#1, %r#2, %r#3 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
 }
@@ -1106,9 +1106,9 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
     -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
   // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index
 
-  // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEID]], %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x1xf32>, memref<32x64x1xf32, #gpu.address_space<workgroup>>
+  // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEID]], %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x1xf32>, memref<32x64x1xf32, 3>
   // CHECK-SCF-IF:  %[[RID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
-  // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[C0]], %[[RID]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x128xf32>, memref<1x64x128xf32, #gpu.address_space<workgroup>>
+  // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[C0]], %[[RID]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x128xf32>, memref<1x64x128xf32, 3>
   // CHECK-SCF-IF:  gpu.barrier
 
   // CHECK-SCF-IF: scf.if{{.*}}{
@@ -1116,12 +1116,12 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
       args(%v0, %v1 : vector<1x64x1xf32>, vector<1x2x128xf32>) -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
     ^bb0(%arg0: vector<32x64x1xf32>, %arg1: vector<1x64x128xf32>):
 
-  // CHECK-SCF-IF-DAG: %[[SR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, #gpu.address_space<workgroup>>, vector<32x64x1xf32>
-  // CHECK-SCF-IF-DAG: %[[SR1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, #gpu.address_space<workgroup>>, vector<1x64x128xf32>
+  // CHECK-SCF-IF-DAG: %[[SR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<32x64x1xf32>
+  // CHECK-SCF-IF-DAG: %[[SR1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x64x128xf32>
   //     CHECK-SCF-IF: %[[W0:.*]] = "some_def_0"(%[[SR0]]) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
   //     CHECK-SCF-IF: %[[W1:.*]] = "some_def_1"(%[[SR1]]) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>
-  // CHECK-SCF-IF-DAG: vector.transfer_write %[[W0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<32x64x1xf32>, memref<32x64x1xf32, #gpu.address_space<workgroup>>
-  // CHECK-SCF-IF-DAG: vector.transfer_write %[[W1]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x128xf32>, memref<1x64x128xf32, #gpu.address_space<workgroup>>
+  // CHECK-SCF-IF-DAG: vector.transfer_write %[[W0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<32x64x1xf32>, memref<32x64x1xf32, 3>
+  // CHECK-SCF-IF-DAG: vector.transfer_write %[[W1]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x128xf32>, memref<1x64x128xf32, 3>
 
       %r0 = "some_def_0"(%arg0) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
       %r1 = "some_def_1"(%arg1) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>
@@ -1132,8 +1132,8 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
 
   //     CHECK-SCF-IF: gpu.barrier
   //     CHECK-SCF-IF: %[[WID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
-  // CHECK-SCF-IF-DAG: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]], %cst {in_bounds = [true, true, true]} : memref<32x64x1xf32, #gpu.address_space<workgroup>>, vector<1x64x1xf32>
-  // CHECK-SCF-IF-DAG: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[WID]], %[[C0]]], %cst {in_bounds = [true, true, true]} : memref<1x64x128xf32, #gpu.address_space<workgroup>>, vector<1x2x128xf32>
+  // CHECK-SCF-IF-DAG: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]], %cst {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<1x64x1xf32>
+  // CHECK-SCF-IF-DAG: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[WID]], %[[C0]]], %cst {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x2x128xf32>
   //     CHECK-SCF-IF: return %[[R0]], %[[R1]] : vector<1x64x1xf32>, vector<1x2x128xf32>
   return %r#0, %r#1 : vector<1x64x1xf32>, vector<1x2x128xf32>
 }
diff --git a/mlir/test/IR/invalid-custom-print-parse.mlir b/mlir/test/IR/invalid-custom-print-parse.mlir
index 00da145e..7bc6644 100644
--- a/mlir/test/IR/invalid-custom-print-parse.mlir
+++ b/mlir/test/IR/invalid-custom-print-parse.mlir
@@ -19,3 +19,10 @@ test.custom_dimension_list_attr dimension_list = [2x3]
 
 // expected-error @below {{expected attribute value}}
 test.optional_custom_attr foo
+
+// -----
+
+// expected-error @below {{unknown key '"foo"' when parsing properties dictionary}}
+test.op_with_enum_prop_attr_form <{value = 0 : i32, foo}>
+
+
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 85a49e0..3e46199 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -1850,6 +1850,20 @@ def : Pat<
   (MixedVOperandOp5 $input2a, $input2b, $input1b, $attr1,
                     ConstantStrAttr<StrAttr, "MatchMultiVariadicSubSymbol">)>;
 
+def MixedVOperandOp7 : TEST_Op<"mixed_variadic_optional_in7",
+                               [AttrSizedOperandSegments]> {
+  let arguments = (ins
+    Variadic<I32>:$input1,
+    Optional<I32>:$input2,
+    I32Attr:$attr1
+  );
+}
+
+def : Pat<
+  (MixedVOperandOp7 $input1, $input2, ConstantAttr<I32Attr, "2">:$attr1),
+  (MixedVOperandOp6 $input1, (variadic $input2), $attr1),
+  [(Constraint<CPred<"$0 != Value()">> $input2)]>;
+
 //===----------------------------------------------------------------------===//
 // Test Patterns (either)
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index b73c40a..eda2594 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -518,17 +518,15 @@ struct TestVectorScanLowering
 static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder,
                                         gpu::WarpExecuteOnLane0Op warpOp,
                                         Type type) {
-  Attribute sharedMemorySpaceAttr =
-      builder.getAttr<gpu::AddressSpaceAttr>(gpu::AddressSpace::Workgroup);
+  static constexpr int64_t kSharedMemorySpace = 3;
   // Compute type of shared memory buffer.
   MemRefType memrefType;
   if (auto vectorType = dyn_cast<VectorType>(type)) {
     memrefType =
-        MemRefType::get(vectorType.getShape(), vectorType.getElementType(),
-                        MemRefLayoutAttrInterface{}, sharedMemorySpaceAttr);
+        MemRefType::get(vectorType.getShape(), vectorType.getElementType(), {},
+                        kSharedMemorySpace);
   } else {
-    memrefType = MemRefType::get({1}, type, MemRefLayoutAttrInterface{},
-                                 sharedMemorySpaceAttr);
+    memrefType = MemRefType::get({1}, type, {}, kSharedMemorySpace);
   }
 
   // Get symbol table holding all shared memory globals.
diff --git a/mlir/test/mlir-tblgen/pattern.mlir b/mlir/test/mlir-tblgen/pattern.mlir
index 60d46e6..9090528 100644
--- a/mlir/test/mlir-tblgen/pattern.mlir
+++ b/mlir/test/mlir-tblgen/pattern.mlir
@@ -584,6 +584,16 @@ func.func @testMatchMultiVariadicSubSymbol(%arg0: i32, %arg1: i32, %arg2: i32, %
   return
 }
 
+// CHECK-LABEL: @testMatchMixedVaradicOptional
+func.func @testMatchMixedVaradicOptional(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32) -> () {
+  // CHECK: "test.mixed_variadic_in6"(%arg0, %arg1, %arg2) <{attr1 = 2 : i32}> : (i32, i32, i32) -> () 
+  "test.mixed_variadic_optional_in7"(%arg0, %arg1, %arg2) {attr1 = 2 : i32, operandSegmentSizes = array<i32: 2, 1>} : (i32, i32, i32) -> ()
+  // CHECK: test.mixed_variadic_optional_in7
+  "test.mixed_variadic_optional_in7"(%arg0, %arg1) {attr1 = 2 : i32, operandSegmentSizes = array<i32: 2, 0>} : (i32, i32) -> ()
+
+  return
+}
+
 //===----------------------------------------------------------------------===//
 // Test that natives calls are only called once during rewrites.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index a0d947f..0a9d14d 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -1300,6 +1300,11 @@ if (!dict) {
   emitError() << "expected DictionaryAttr to set properties";
   return ::mlir::failure();
 }
+// keep track of used keys in the input dictionary to be able to error out
+// if there are some unknown ones.
+DenseSet<StringAttr> usedKeys;
+MLIRContext *ctx = dict.getContext();
+(void)ctx;
 )decl";
 
   // {0}: fromAttribute call
@@ -1310,7 +1315,9 @@ auto setFromAttr = [] (auto &propStorage, ::mlir::Attribute propAttr,
          ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) -> ::mlir::LogicalResult {{
   {0};
 };
-auto attr = dict.get("{1}");
+auto {1}AttrName = StringAttr::get(ctx, "{1}");
+usedKeys.insert({1}AttrName);
+auto attr = dict.get({1}AttrName);
 if (!attr && {2}) {{
   emitError() << "expected key entry for {1} in DictionaryAttr to set "
              "Properties.";
@@ -1356,7 +1363,9 @@ if (attr && ::mlir::failed(setFromAttr(prop.{1}, attr, emitError)))
     bool isRequired = !attr.isOptional() && !attr.hasDefaultValue();
     body << formatv(R"decl(
 auto &propStorage = prop.{0};
-auto attr = dict.get("{0}");
+auto {0}AttrName = StringAttr::get(ctx, "{0}");
+auto attr = dict.get({0}AttrName);
+usedKeys.insert(StringAttr::get(ctx, "{1}"));
 if (attr || /*isRequired=*/{1}) {{
   if (!attr) {{
     emitError() << "expected key entry for {0} in DictionaryAttr to set "
@@ -1374,7 +1383,14 @@ if (attr || /*isRequired=*/{1}) {{
 )decl",
                     namedAttr.name, isRequired);
   }
-  body << "return ::mlir::success();\n";
+  body << R"decl(
+for (NamedAttribute attr : dict) {
+  if (!usedKeys.contains(attr.getName()))
+    return emitError() << "unknown key '" << attr.getName() <<
+        "' when parsing properties dictionary";
+}
+return ::mlir::success();
+)decl";
 }
 
 void OperationFormat::genParser(Operator &op, OpClass &opClass) {
diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S
index eba7bab..de422f8 100644
--- a/openmp/runtime/src/z_Linux_asm.S
+++ b/openmp/runtime/src/z_Linux_asm.S
@@ -2482,7 +2482,7 @@ __kmp_invoke_microtask:
 KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
     .4byte .gomp_critical_user_
 #ifdef __ELF__
-    .type KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),@object
+    .type KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),%object
     .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),4
 #endif
 #endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC32 */
@@ -2501,7 +2501,7 @@ KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
 KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
     .8byte .gomp_critical_user_
 #ifdef __ELF__
-    .type KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),@object
+    .type KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),%object
     .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8
 #endif
 #endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp
index a2328d1..73c5d95 100644
--- a/polly/lib/Support/ScopHelper.cpp
+++ b/polly/lib/Support/ScopHelper.cpp
@@ -601,9 +601,6 @@ bool polly::isHoistableLoad(LoadInst *LInst, Region &R, LoopInfo &LI,
     L = L->getParentLoop();
   }
 
-  if (!Ptr->hasUseList())
-    return true;
-
   for (auto *User : Ptr->users()) {
     auto *UserI = dyn_cast<Instruction>(User);
     if (!UserI || UserI->getFunction() != LInst->getFunction() ||
author	Vitaly Buka <vitalybuka@google.com>	2025-05-08 00:08:20 -0700
committer	Vitaly Buka <vitalybuka@google.com>	2025-05-08 00:08:20 -0700
commit	9ffd039f04cfa9bc5d3371b9c8cf86376a74c21e (patch)
tree	b16242994b810591bb9121e7a3fc272f6fb7a9c6
parent	d3d32f47328770ca72997086d44a59397d1ea015 (diff)
parent	c1f0e68cec4218c9d51a4ad0a6f6d878ed573dfe (diff)
download	llvm-users/vitalybuka/spr/main.nfcubsan_minimal-clang-format-a-file.zip llvm-users/vitalybuka/spr/main.nfcubsan_minimal-clang-format-a-file.tar.gz llvm-users/vitalybuka/spr/main.nfcubsan_minimal-clang-format-a-file.tar.bz2