-rwxr-xr-x  .ci/monolithic-linux.sh | 4
-rwxr-xr-x  .ci/monolithic-windows.sh | 2
-rw-r--r--  .ci/premerge_advisor_upload.py | 2
-rw-r--r--  .ci/utils.sh | 2
-rw-r--r--  bolt/lib/Core/CallGraph.cpp | 4
-rw-r--r--  bolt/lib/Core/DebugData.cpp | 2
-rw-r--r--  bolt/lib/Rewrite/DWARFRewriter.cpp | 2
-rw-r--r--  clang-tools-extra/test/clang-tidy/checkers/misc/misplaced-const-cxx17.cpp | 2
-rw-r--r--  clang/docs/ReleaseNotes.rst | 3
-rw-r--r--  clang/docs/analyzer/checkers.rst | 6
-rw-r--r--  clang/include/clang/AST/TypeBase.h | 12
-rw-r--r--  clang/include/clang/Basic/DiagnosticParseKinds.td | 2
-rw-r--r--  clang/include/clang/Basic/DiagnosticSemaKinds.td | 46
-rw-r--r--  clang/include/clang/Basic/LangOptions.def | 1
-rw-r--r--  clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h | 8
-rw-r--r--  clang/include/clang/CIR/Dialect/IR/CIRAttrConstraints.td | 29
-rw-r--r--  clang/include/clang/CIR/Dialect/IR/CIRAttrs.td | 15
-rw-r--r--  clang/include/clang/CIR/Dialect/IR/CIROps.td | 124
-rw-r--r--  clang/include/clang/CIR/MissingFeatures.h | 1
-rw-r--r--  clang/include/clang/StaticAnalyzer/Checkers/Checkers.td | 3
-rw-r--r--  clang/lib/AST/ASTContext.cpp | 4
-rw-r--r--  clang/lib/Basic/LangOptions.cpp | 8
-rw-r--r--  clang/lib/Basic/Targets/X86.cpp | 2
-rw-r--r--  clang/lib/CIR/CodeGen/CIRGenBuilder.h | 10
-rw-r--r--  clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 4
-rw-r--r--  clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 6
-rw-r--r--  clang/lib/CIR/CodeGen/CIRGenFunction.h | 8
-rw-r--r--  clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp | 15
-rw-r--r--  clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 124
-rw-r--r--  clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp | 48
-rw-r--r--  clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 48
-rw-r--r--  clang/lib/CodeGen/CGBuiltin.cpp | 31
-rw-r--r--  clang/lib/Headers/avx512dqintrin.h | 60
-rw-r--r--  clang/lib/Headers/avx512fintrin.h | 126
-rw-r--r--  clang/lib/Headers/avx512vldqintrin.h | 50
-rw-r--r--  clang/lib/Headers/avx512vlintrin.h | 361
-rw-r--r--  clang/lib/Sema/HLSLExternalSemaSource.cpp | 3
-rw-r--r--  clang/lib/Sema/SemaCast.cpp | 3
-rw-r--r--  clang/lib/Sema/SemaChecking.cpp | 4
-rw-r--r--  clang/lib/Sema/SemaTemplateInstantiateDecl.cpp | 18
-rw-r--r--  clang/lib/Sema/SemaType.cpp | 10
-rw-r--r--  clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp | 8
-rw-r--r--  clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm | 29
-rw-r--r--  clang/test/Analysis/zero-size-non-pod-array.cpp | 2
-rw-r--r--  clang/test/CIR/CodeGen/dynamic-cast.cpp | 56
-rw-r--r--  clang/test/CIR/CodeGen/ptrdiff.c | 51
-rw-r--r--  clang/test/CIR/CodeGen/ptrdiff.cpp | 33
-rw-r--r--  clang/test/CIR/CodeGen/throws.cpp | 52
-rw-r--r--  clang/test/CIR/IR/invalid-try-catch.cir | 156
-rw-r--r--  clang/test/CIR/IR/try-catch.cir | 84
-rw-r--r--  clang/test/CXX/dcl.decl/dcl.decomp/p2.cpp | 4
-rw-r--r--  clang/test/CXX/dcl.decl/dcl.decomp/p3.cpp | 10
-rw-r--r--  clang/test/CXX/dcl.decl/dcl.decomp/p4.cpp | 26
-rw-r--r--  clang/test/CXX/drs/cwg22xx.cpp | 2
-rw-r--r--  clang/test/CXX/drs/cwg23xx.cpp | 2
-rw-r--r--  clang/test/CXX/drs/cwg26xx.cpp | 8
-rw-r--r--  clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp | 2
-rw-r--r--  clang/test/CXX/temp/temp.res/temp.local/p6.cpp | 2
-rw-r--r--  clang/test/CodeGen/X86/avx512dq-builtins.c | 13
-rw-r--r--  clang/test/CodeGen/X86/avx512f-builtins.c | 14
-rw-r--r--  clang/test/CodeGen/X86/avx512vl-builtins.c | 44
-rw-r--r--  clang/test/CodeGen/X86/avx512vldq-builtins.c | 18
-rw-r--r--  clang/test/CodeGen/arm-mve-intrinsics/load-store.c | 420
-rw-r--r--  clang/test/CodeGen/attr-target-mv.c | 5
-rw-r--r--  clang/test/CodeGen/distributed-thin-lto/supports-hot-cold-new.ll | 9
-rw-r--r--  clang/test/CodeGen/target-builtin-noerror.c | 1
-rw-r--r--  clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl | 116
-rw-r--r--  clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl | 74
-rw-r--r--  clang/test/Driver/x86-march.c | 4
-rw-r--r--  clang/test/FixIt/fixit-constrained-structured-binding.cpp | 8
-rw-r--r--  clang/test/Misc/target-invalid-cpu-note/x86.c | 4
-rw-r--r--  clang/test/PCH/cxx1z-decomposition.cpp | 4
-rw-r--r--  clang/test/Parser/cxx1z-class-template-argument-deduction.cpp | 6
-rw-r--r--  clang/test/Parser/cxx1z-decomposition.cpp | 14
-rw-r--r--  clang/test/Parser/cxx2c-binding-pack.cpp | 2
-rw-r--r--  clang/test/Preprocessor/predefined-arch-macros.c | 18
-rw-r--r--  clang/test/Sema/attr-cpuspecific-cpus.c | 1
-rw-r--r--  clang/test/SemaCXX/builtin-structured-binding-size.cpp | 40
-rw-r--r--  clang/test/SemaCXX/cxx17-compat.cpp | 12
-rw-r--r--  clang/test/SemaCXX/cxx1z-decomposition.cpp | 30
-rw-r--r--  clang/test/SemaCXX/cxx2c-binding-pack-nontemplate.cpp | 2
-rw-r--r--  clang/test/SemaCXX/cxx2c-binding-pack.cpp | 6
-rw-r--r--  clang/test/SemaCXX/matrix-type.cpp | 1
-rw-r--r--  clang/test/SemaCXX/sizeless-1.cpp | 2
-rw-r--r--  clang/test/SemaCXX/warn-shadow-in-lambdas.cpp | 4
-rw-r--r--  clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl | 23
-rw-r--r--  clang/test/SemaHLSL/Language/AggregateSplatCasts.hlsl | 7
-rw-r--r--  clang/test/SemaHLSL/Language/ElementwiseCasts.hlsl | 24
-rw-r--r--  clang/test/SemaHLSL/Language/TemplateOutArg.hlsl | 83
-rw-r--r--  clang/test/SemaTemplate/cxx1z-decomposition.cpp | 2
-rw-r--r--  clang/unittests/Frontend/CompilerInvocationTest.cpp | 20
-rw-r--r--  compiler-rt/cmake/Modules/CompilerRTAIXUtils.cmake | 2
-rw-r--r--  compiler-rt/lib/builtins/cpu_model/x86.c | 14
-rw-r--r--  compiler-rt/lib/sanitizer_common/sanitizer_common.h | 3
-rw-r--r--  compiler-rt/lib/sanitizer_common/sanitizer_flags.inc | 5
-rw-r--r--  compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp | 19
-rw-r--r--  compiler-rt/lib/sanitizer_common/sanitizer_signal_interceptors.inc | 42
-rw-r--r--  compiler-rt/test/asan/TestCases/wcscat.cpp | 6
-rw-r--r--  compiler-rt/test/asan/TestCases/wcscpy.cpp | 6
-rw-r--r--  compiler-rt/test/asan/TestCases/wcsncat.cpp | 6
-rw-r--r--  compiler-rt/test/asan/TestCases/wcsncpy.cpp | 6
-rw-r--r--  compiler-rt/test/sanitizer_common/TestCases/Linux/allow_user_segv.cpp | 4
-rw-r--r--  compiler-rt/test/sanitizer_common/TestCases/Linux/cloak_sigaction.cpp | 53
-rw-r--r--  compiler-rt/test/sanitizer_common/TestCases/Linux/cloak_signal.cpp | 48
-rw-r--r--  flang-rt/cmake/modules/AddFlangRT.cmake | 21
-rw-r--r--  flang/include/flang/Evaluate/tools.h | 10
-rw-r--r--  flang/include/flang/Lower/CUDA.h | 8
-rw-r--r--  flang/include/flang/Lower/OpenMP/Clauses.h | 1
-rw-r--r--  flang/include/flang/Parser/dump-parse-tree.h | 5
-rw-r--r--  flang/include/flang/Parser/parse-tree.h | 51
-rw-r--r--  flang/include/flang/Parser/tools.h | 6
-rw-r--r--  flang/lib/Lower/Bridge.cpp | 5
-rw-r--r--  flang/lib/Lower/CUDA.cpp | 27
-rw-r--r--  flang/lib/Lower/IO.cpp | 3
-rw-r--r--  flang/lib/Lower/OpenMP/Atomic.cpp | 97
-rw-r--r--  flang/lib/Lower/OpenMP/Clauses.cpp | 68
-rw-r--r--  flang/lib/Parser/openmp-parsers.cpp | 16
-rw-r--r--  flang/lib/Parser/parse-tree.cpp | 2
-rw-r--r--  flang/lib/Semantics/assignment.cpp | 3
-rw-r--r--  flang/lib/Semantics/check-allocate.cpp | 13
-rw-r--r--  flang/lib/Semantics/check-case.cpp | 2
-rw-r--r--  flang/lib/Semantics/check-coarray.cpp | 9
-rw-r--r--  flang/lib/Semantics/check-data.cpp | 10
-rw-r--r--  flang/lib/Semantics/check-deallocate.cpp | 3
-rw-r--r--  flang/lib/Semantics/check-do-forall.cpp | 50
-rw-r--r--  flang/lib/Semantics/check-io.cpp | 6
-rw-r--r--  flang/lib/Semantics/check-omp-structure.cpp | 54
-rw-r--r--  flang/lib/Semantics/data-to-inits.cpp | 17
-rw-r--r--  flang/lib/Semantics/expression.cpp | 107
-rw-r--r--  flang/lib/Semantics/resolve-directives.cpp | 24
-rw-r--r--  flang/lib/Semantics/resolve-names-utils.cpp | 9
-rw-r--r--  flang/lib/Semantics/resolve-names.cpp | 32
-rw-r--r--  flang/test/Lower/CUDA/cuda-data-transfer.cuf | 37
-rw-r--r--  flang/test/Lower/CUDA/cuda-managed.cuf | 20
-rw-r--r--  flang/test/Lower/OpenMP/atomic-requires-conflict-v50-1.f90 | 11
-rw-r--r--  flang/test/Lower/OpenMP/atomic-requires-conflict-v50-2.f90 | 11
-rw-r--r--  flang/test/Lower/OpenMP/atomic-requires-conflict-v50-3.f90 | 11
-rw-r--r--  flang/test/Lower/OpenMP/atomic-requires-conflict-v50-4.f90 | 13
-rw-r--r--  flang/test/Lower/OpenMP/atomic-requires-conflict-v60-1.f90 | 11
-rw-r--r--  flang/test/Lower/OpenMP/atomic-requires-conflict-v60-2.f90 | 11
-rw-r--r--  flang/test/Parser/OpenMP/allocate-align-tree.f90 | 4
-rw-r--r--  flang/test/Parser/OpenMP/requires.f90 | 14
-rw-r--r--  flang/test/Semantics/OpenMP/allocate-align01.f90 | 4
-rw-r--r--  flang/test/Semantics/OpenMP/omp-common-fp-lp.f90 | 20
-rw-r--r--  flang/test/Semantics/OpenMP/requires-modfile.f90 | 17
-rw-r--r--  flang/test/Semantics/OpenMP/requires10.f90 | 13
-rw-r--r--  flang/test/Semantics/bug163242.f90 | 5
-rw-r--r--  flang/test/Semantics/bug163255.f90 | 21
-rw-r--r--  flang/test/Semantics/resolve63.f90 | 5
-rw-r--r--  libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 10
-rw-r--r--  libc/cmake/modules/LLVMLibCTestRules.cmake | 5
-rw-r--r--  libc/config/baremetal/config.json | 2
-rw-r--r--  libc/src/__support/FPUtil/double_double.h | 6
-rw-r--r--  libcxx/docs/ReleaseNotes/22.rst | 3
-rw-r--r--  libcxx/include/__algorithm/fill.h | 36
-rw-r--r--  libcxx/include/__algorithm/fill_n.h | 48
-rw-r--r--  libcxx/include/__algorithm/ranges_fill.h | 13
-rw-r--r--  libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp | 23
-rw-r--r--  libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp | 23
-rw-r--r--  libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp | 22
-rw-r--r--  libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp | 22
-rw-r--r--  libunwind/src/Unwind-seh.cpp | 3
-rw-r--r--  lld/ELF/Symbols.cpp | 2
-rw-r--r--  lld/MachO/Driver.cpp | 2
-rw-r--r--  lldb/include/lldb/Utility/AnsiTerminal.h | 11
-rw-r--r--  lldb/tools/lldb-dap/tool/lldb-dap.cpp | 11
-rw-r--r--  llvm/docs/HowToReleaseLLVM.rst | 82
-rw-r--r--  llvm/docs/ReleaseNotes.md | 1
-rw-r--r--  llvm/include/llvm/Analysis/DXILResource.h | 19
-rw-r--r--  llvm/include/llvm/Analysis/LoopInfo.h | 2
-rw-r--r--  llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h | 14
-rw-r--r--  llvm/include/llvm/BinaryFormat/ELF.h | 2
-rw-r--r--  llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 57
-rw-r--r--  llvm/include/llvm/Frontend/OpenMP/OMP.td | 10
-rw-r--r--  llvm/include/llvm/LTO/LTO.h | 6
-rw-r--r--  llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h | 1
-rw-r--r--  llvm/include/llvm/TargetParser/X86TargetParser.def | 1
-rw-r--r--  llvm/include/llvm/TargetParser/X86TargetParser.h | 1
-rw-r--r--  llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h | 10
-rw-r--r--  llvm/lib/Analysis/DXILResource.cpp | 47
-rw-r--r--  llvm/lib/Analysis/LoopInfo.cpp | 4
-rw-r--r--  llvm/lib/Analysis/ScalarEvolution.cpp | 27
-rw-r--r--  llvm/lib/IR/Type.cpp | 4
-rw-r--r--  llvm/lib/LTO/LTO.cpp | 34
-rw-r--r--  llvm/lib/LTO/LTOBackend.cpp | 1
-rw-r--r--  llvm/lib/ObjectYAML/ELFYAML.cpp | 18
-rw-r--r--  llvm/lib/Passes/PassBuilderPipelines.cpp | 22
-rw-r--r--  llvm/lib/Passes/PassRegistry.def | 1
-rw-r--r--  llvm/lib/Support/Windows/Signals.inc | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 180
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3
-rw-r--r--  llvm/lib/Target/Mips/MipsFastISel.cpp | 7
-rw-r--r--  llvm/lib/Target/Mips/MipsISelLowering.cpp | 7
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 28
-rw-r--r--  llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86.td | 7
-rw-r--r--  llvm/lib/TargetParser/Host.cpp | 14
-rw-r--r--  llvm/lib/TargetParser/RISCVISAInfo.cpp | 2
-rw-r--r--  llvm/lib/TargetParser/Unix/Host.inc | 22
-rw-r--r--  llvm/lib/TargetParser/X86TargetParser.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp | 22
-rw-r--r--  llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 39
-rw-r--r--  llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 54
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopFuse.cpp | 34
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Scalar/NewGVN.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Scalar/Reassociate.cpp | 42
-rw-r--r--  llvm/lib/Transforms/Scalar/Scalarizer.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 5
-rw-r--r--  llvm/test/Analysis/DXILResource/buffer-frombinding.ll | 4
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll (renamed from llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll) | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll (renamed from llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll) | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 1854
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 96
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 204
-rw-r--r--  llvm/test/CodeGen/AMDGPU/readsteadycounter.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 239
-rw-r--r--  llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir | 32
-rw-r--r--  llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll | 12
-rw-r--r--  llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll | 12
-rw-r--r--  llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll | 2
-rw-r--r--  llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll | 2
-rw-r--r--  llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll | 2
-rw-r--r--  llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll | 20
-rw-r--r--  llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll (renamed from llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll) | 3
-rw-r--r--  llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll | 89
-rw-r--r--  llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll | 2
-rw-r--r--  llvm/test/CodeGen/NVPTX/i32x2-instructions.ll | 167
-rw-r--r--  llvm/test/CodeGen/RISCV/attributes.ll | 4
-rw-r--r--  llvm/test/CodeGen/RISCV/idiv_large.ll | 2311
-rw-r--r--  llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll | 104
-rw-r--r--  llvm/test/CodeGen/X86/cpus-intel.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/isel-fpclass.ll | 433
-rw-r--r--  llvm/test/CodeGen/X86/min-legal-vector-width.ll | 57
-rw-r--r--  llvm/test/DebugInfo/AArch64/callsite.mir | 68
-rw-r--r--  llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll | 21
-rw-r--r--  llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt | 8
-rw-r--r--  llvm/test/MC/X86/apx/pushp-popp-att.s | 8
-rw-r--r--  llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll | 18
-rw-r--r--  llvm/test/Transforms/GVN/PRE/pre-load.ll | 48
-rw-r--r--  llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll | 16
-rw-r--r--  llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll | 738
-rw-r--r--  llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll | 230
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll | 13
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll | 19
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll | 5
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll | 40
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll | 32
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/cost-model.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll | 22
-rw-r--r--  llvm/test/Transforms/LoopVectorize/assume.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/operand-bundles.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scalable-assume.ll | 6
-rw-r--r--  llvm/test/tools/llvm-mca/RISCV/SiFive7/vrgather-vcompress.s (renamed from llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s) | 0
-rw-r--r--  llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vrgather-vcompress.s (renamed from llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s) | 0
-rw-r--r--  llvm/unittests/Analysis/DXILResourceTest.cpp | 4
-rw-r--r--  llvm/utils/TableGen/X86DisassemblerTables.cpp | 11
-rw-r--r--  llvm/utils/TableGen/X86RecognizableInstr.cpp | 2
-rw-r--r--  mlir/include/mlir/Dialect/Affine/LoopUtils.h | 2
-rw-r--r--  mlir/include/mlir/Dialect/OpenACC/OpenACC.h | 4
-rw-r--r--  mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td | 7
-rw-r--r--  mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td | 14
-rw-r--r--  mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 6
-rw-r--r--  mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 65
-rw-r--r--  mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 62
-rw-r--r--  mlir/lib/Analysis/FlatLinearValueConstraints.cpp | 9
-rw-r--r--  mlir/lib/Analysis/Presburger/Simplex.cpp | 2
-rw-r--r--  mlir/lib/Bindings/Python/IRCore.cpp | 56
-rw-r--r--  mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt | 1
-rw-r--r--  mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 168
-rw-r--r--  mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 8
-rw-r--r--  mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 150
-rw-r--r--  mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 5
-rw-r--r--  mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp | 6
-rw-r--r--  mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 20
-rw-r--r--  mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp | 4
-rw-r--r--  mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp | 4
-rw-r--r--  mlir/lib/Dialect/Transform/IR/TransformTypes.cpp | 6
-rw-r--r--  mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 146
-rw-r--r--  mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 115
-rw-r--r--  mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 7
-rw-r--r--  mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 3
-rw-r--r--  mlir/lib/IR/MLIRContext.cpp | 2
-rw-r--r--  mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp | 2
-rw-r--r--  mlir/python/CMakeLists.txt | 10
-rw-r--r--  mlir/python/mlir/dialects/OpenACCOps.td | 14
-rw-r--r--  mlir/python/mlir/dialects/gpu/__init__.py | 146
-rw-r--r--  mlir/python/mlir/dialects/openacc.py | 5
-rw-r--r--  mlir/test/Conversion/XeGPUToXeVM/dpas.mlir | 8
-rw-r--r--  mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir | 201
-rw-r--r--  mlir/test/Dialect/Affine/canonicalize.mlir | 130
-rw-r--r--  mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir | 4
-rw-r--r--  mlir/test/Dialect/OpenACC/recipe-populate-firstprivate.mlir | 10
-rw-r--r--  mlir/test/Dialect/OpenACC/recipe-populate-private.mlir | 10
-rw-r--r--  mlir/test/Dialect/XeGPU/invalid.mlir | 36
-rw-r--r--  mlir/test/Dialect/XeGPU/ops.mlir | 60
-rw-r--r--  mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp | 6
-rw-r--r--  mlir/test/mlir-pdll/CodeGen/CPP/general.pdll | 2
-rw-r--r--  mlir/test/python/dialects/gpu/dialect.py | 93
-rw-r--r--  mlir/test/python/dialects/openacc.py | 171
-rw-r--r--  mlir/test/python/ir/operation.py | 28
-rw-r--r--  mlir/tools/mlir-tblgen/LLVMIRConversionGen.cpp | 13
-rw-r--r--  mlir/tools/mlir-tblgen/RewriterGen.cpp | 2
-rw-r--r--  mlir/unittests/Dialect/SparseTensor/MergerTest.cpp | 3
-rw-r--r--  utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel | 28
326 files changed, 10652 insertions, 3360 deletions
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index c8f3312..4a8418d 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -66,11 +66,13 @@ start-group "ninja"
# Targets are not escaped as they are passed as separate arguments.
ninja -C "${BUILD_DIR}" -k 0 ${targets} |& tee ninja.log
+cp ${BUILD_DIR}/.ninja_log ninja.ninja_log
if [[ "${runtime_targets}" != "" ]]; then
start-group "ninja Runtimes"
ninja -C "${BUILD_DIR}" ${runtime_targets} |& tee ninja_runtimes.log
+ cp ${BUILD_DIR}/.ninja_log ninja_runtimes.ninja_log
fi
# Compiling runtimes with just-built Clang and running their tests
@@ -87,6 +89,7 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then
ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} \
|& tee ninja_runtimes_needs_reconfig1.log
+ cp ${BUILD_DIR}/.ninja_log ninja_runtimes_needs_reconfig1.ninja_log
start-group "CMake Runtimes Clang Modules"
@@ -99,4 +102,5 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then
ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} \
|& tee ninja_runtimes_needs_reconfig2.log
+ cp ${BUILD_DIR}/.ninja_log ninja_runtimes_needs_reconfig2.ninja_log
fi
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index f85d6e3..219979dd 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -55,9 +55,11 @@ start-group "ninja"
# Targets are not escaped as they are passed as separate arguments.
ninja -C "${BUILD_DIR}" -k 0 ${targets} |& tee ninja.log
+cp ${BUILD_DIR}/.ninja_log ninja.ninja_log
if [[ "${runtime_targets}" != "" ]]; then
start-group "ninja runtimes"
ninja -C "${BUILD_DIR}" -k 0 ${runtimes_targets} |& tee ninja_runtimes.log
+ cp ${BUILD_DIR}/.ninja_log ninja_runtimes.ninja_log
fi
diff --git a/.ci/premerge_advisor_upload.py b/.ci/premerge_advisor_upload.py
index 84214f8..dda4ad2 100644
--- a/.ci/premerge_advisor_upload.py
+++ b/.ci/premerge_advisor_upload.py
@@ -23,11 +23,13 @@ def main(commit_sha, workflow_run_number, build_log_files):
)
test_failures = generate_test_report_lib.get_failures(junit_objects)
source = "pull_request" if "GITHUB_ACTIONS" in os.environ else "postcommit"
+ current_platform = f"{platform.system()}-{platform.machine()}".lower()
failure_info = {
"source_type": source,
"base_commit_sha": commit_sha,
"source_id": workflow_run_number,
"failures": [],
+ "platform": current_platform,
}
if test_failures:
for name, failure_message in test_failures:
diff --git a/.ci/utils.sh b/.ci/utils.sh
index 9aefcf2..375ed29 100644
--- a/.ci/utils.sh
+++ b/.ci/utils.sh
@@ -26,7 +26,7 @@ function at-exit {
mkdir -p artifacts
sccache --show-stats
sccache --show-stats >> artifacts/sccache_stats.txt
- cp "${BUILD_DIR}"/.ninja_log artifacts/.ninja_log
+ cp "${MONOREPO_ROOT}"/*.ninja_log artifacts/ || :
cp "${MONOREPO_ROOT}"/*.log artifacts/ || :
cp "${BUILD_DIR}"/test-results.*.xml artifacts/ || :
diff --git a/bolt/lib/Core/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp
index f1d5273..f07add3 100644
--- a/bolt/lib/Core/CallGraph.cpp
+++ b/bolt/lib/Core/CallGraph.cpp
@@ -22,7 +22,7 @@
#undef USE_SSECRC
#endif
-static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64_fallback(int64_t k) {
+[[maybe_unused]] static inline size_t hash_int64_fallback(int64_t k) {
uint64_t key = (unsigned long long)k;
// "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function."
// http://www.concentric.net/~ttwang/tech/inthash.htm
@@ -35,7 +35,7 @@ static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64_fallback(int64_t k) {
return static_cast<size_t>(static_cast<uint32_t>(key));
}
-static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64(int64_t k) {
+[[maybe_unused]] static inline size_t hash_int64(int64_t k) {
#if defined(USE_SSECRC) && defined(__SSE4_2__)
size_t h = 0;
__asm("crc32q %1, %0\n" : "+r"(h) : "rm"(k));
diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp
index 24a4c85..2687788 100644
--- a/bolt/lib/Core/DebugData.cpp
+++ b/bolt/lib/Core/DebugData.cpp
@@ -101,7 +101,7 @@ std::optional<AttrInfo> findAttributeInfo(const DWARFDie DIE,
return findAttributeInfo(DIE, AbbrevDecl, *Index);
}
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
static void printLE64(const std::string &S) {
for (uint32_t I = 0, Size = S.size(); I < Size; ++I) {
errs() << Twine::utohexstr(S[I]);
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index a7a4b66..5e3fa93 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -69,7 +69,7 @@ static void printDie(const DWARFDie &DIE) {
}
/// Lazily parse DWARF DIE and print it out.
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
static void printDie(DWARFUnit &DU, uint64_t DIEOffset) {
uint64_t OriginalOffsets = DIEOffset;
uint64_t NextCUOffset = DU.getNextUnitOffset();
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/misplaced-const-cxx17.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/misplaced-const-cxx17.cpp
index 7816a09..5602932 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/misplaced-const-cxx17.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/misplaced-const-cxx17.cpp
@@ -3,7 +3,7 @@
// This test previously would cause a failed assertion because the structured
// binding declaration had no valid type associated with it. This ensures the
// expected clang diagnostic is generated instead.
-// CHECK-MESSAGES: :[[@LINE+1]]:6: error: decomposition declaration '[x]' requires an initializer [clang-diagnostic-error]
+// CHECK-MESSAGES: :[[@LINE+1]]:6: error: structured binding declaration '[x]' requires an initializer [clang-diagnostic-error]
auto [x];
struct S { int a; };
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index afe3d46..4f62a67 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -304,6 +304,8 @@ Attribute Changes in Clang
Improvements to Clang's diagnostics
-----------------------------------
+- Diagnostic messages now refer to ``structured binding`` instead of ``decomposition``,
+ to align with `P0615R0 <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0615r0.html>`_ changing the term. (#GH157880)
- Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic
diagnostics for function effects (``[[clang::nonblocking]]`` and ``[[clang::nonallocating]]``).
Moved the warning for a missing (though implied) attribute on a redeclaration into this group.
@@ -518,6 +520,7 @@ X86 Support
- Remove `[no-]evex512` feature request from intrinsics and builtins.
- Change features `avx10.x-[256,512]` to `avx10.x`.
- `-march=wildcatlake` is now supported.
+- `-march=novalake` is now supported.
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index dcfa4e3..fd0b304 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -3465,12 +3465,6 @@ Check for an out-of-bound pointer being returned to callers.
return x; // warn: undefined or garbage returned
}
-
-alpha.security.cert
-^^^^^^^^^^^^^^^^^^^
-
-SEI CERT checkers which tries to find errors based on their `C coding rules <https://wiki.sei.cmu.edu/confluence/display/c/2+Rules>`_.
-
alpha.unix
^^^^^^^^^^
diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h
index 5892566..f07861f 100644
--- a/clang/include/clang/AST/TypeBase.h
+++ b/clang/include/clang/AST/TypeBase.h
@@ -4378,8 +4378,6 @@ protected:
unsigned NumRows;
unsigned NumColumns;
- static constexpr unsigned MaxElementsPerDimension = (1 << 20) - 1;
-
ConstantMatrixType(QualType MatrixElementType, unsigned NRows,
unsigned NColumns, QualType CanonElementType);
@@ -4398,16 +4396,6 @@ public:
return getNumRows() * getNumColumns();
}
- /// Returns true if \p NumElements is a valid matrix dimension.
- static constexpr bool isDimensionValid(size_t NumElements) {
- return NumElements > 0 && NumElements <= MaxElementsPerDimension;
- }
-
- /// Returns the maximum number of elements per dimension.
- static constexpr unsigned getMaxElementsPerDimension() {
- return MaxElementsPerDimension;
- }
-
void Profile(llvm::FoldingSetNodeID &ID) {
Profile(ID, getElementType(), getNumRows(), getNumColumns(),
getTypeClass());
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index c724136..e5e071f 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -505,7 +505,7 @@ def err_expected_coloncolon_after_super : Error<
"expected '::' after '__super'">;
def ext_decomp_decl_empty : ExtWarn<
- "ISO C++17 does not allow a decomposition group to be empty">,
+ "ISO C++17 does not allow a structured binding group to be empty">,
InGroup<DiagGroup<"empty-decomposition">>;
def err_function_parameter_limit_exceeded : Error<
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 40bc7b9..12fd7b08 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -31,12 +31,12 @@ defm constexpr_body_multiple_return : CXX14Compat<
defm variable_template : CXX14Compat<"variable templates are">;
// C++17 compatibility with C++14 and earlier.
-defm decomp_decl : CXX17Compat<"decomposition declarations are">;
+defm decomp_decl : CXX17Compat<"structured binding declarations are">;
defm inline_variable : CXX17Compat<"inline variables are">;
// C++20 compatibility with C++17 and earlier.
defm decomp_decl_spec
- : CXX20Compat<"decomposition declaration declared '%0' is">;
+ : CXX20Compat<"structured binding declaration declared '%0' is">;
defm constexpr_local_var_no_init : CXX20Compat<
"uninitialized variable in a constexpr %select{function|constructor}0 is">;
defm constexpr_function_try_block : CXX20Compat<
@@ -591,58 +591,58 @@ def warn_modifying_shadowing_decl :
// C++ decomposition declarations
def err_decomp_decl_context : Error<
- "decomposition declaration not permitted in this context">;
+ "structured binding declaration not permitted in this context">;
def err_decomp_decl_spec
- : Error<"decomposition declaration cannot be declared '%0'">;
+ : Error<"structured binding declaration cannot be declared '%0'">;
def err_decomp_decl_type : Error<
- "decomposition declaration cannot be declared with type %0; "
+ "structured binding declaration cannot be declared with type %0; "
"declared type must be 'auto' or reference to 'auto'">;
def err_decomp_decl_constraint : Error<
- "decomposition declaration cannot be declared with constrained 'auto'">;
+ "structured binding declaration cannot be declared with constrained 'auto'">;
def err_decomp_decl_parens : Error<
- "decomposition declaration cannot be declared with parentheses">;
+ "structured binding declaration cannot be declared with parentheses">;
def err_decomp_decl_template : Error<
- "decomposition declaration cannot be a template">;
+ "structured binding declaration cannot be a template">;
def err_decomp_decl_not_alone : Error<
- "decomposition declaration must be the only declaration in its group">;
+ "structured binding declaration must be the only declaration in its group">;
def err_decomp_decl_requires_init : Error<
- "decomposition declaration %0 requires an initializer">;
+ "structured binding declaration %0 requires an initializer">;
def err_decomp_decl_wrong_number_bindings : Error<
- "type %0 decomposes into %3 %plural{1:element|:elements}2, but "
+ "type %0 binds to %3 %plural{1:element|:elements}2, but "
"%select{%plural{0:no|:only %1}1|%1}4 "
"%plural{1:name was|:names were}1 provided">;
def err_decomp_decl_unbindable_type : Error<
- "cannot decompose %select{union|non-class, non-array}1 type %2">;
+ "cannot bind %select{union|non-class, non-array}1 type %2">;
def err_decomp_decl_multiple_bases_with_members : Error<
- "cannot decompose class type %1: "
+ "cannot bind class type %1: "
"%select{its base classes %2 and|both it and its base class}0 %3 "
"have non-static data members">;
def err_decomp_decl_ambiguous_base : Error<
- "cannot decompose members of ambiguous base class %1 of %0:%2">;
+ "cannot bind members of ambiguous base class %1 of %0:%2">;
def err_decomp_decl_inaccessible_base : Error<
- "cannot decompose members of inaccessible base class %1 of %0">,
+ "cannot bind members of inaccessible base class %1 of %0">,
AccessControl;
def err_decomp_decl_inaccessible_field : Error<
- "cannot decompose %select{private|protected}0 member %1 of %3">,
+ "cannot bind %select{private|protected}0 member %1 of %3">,
AccessControl;
def err_decomp_decl_lambda : Error<
- "cannot decompose lambda closure type">;
+ "cannot bind lambda closure type">;
def err_decomp_decl_anon_union_member : Error<
- "cannot decompose class type %0 because it has an anonymous "
+ "cannot bind class type %0 because it has an anonymous "
"%select{struct|union}1 member">;
def err_decomp_decl_std_tuple_element_not_specialized : Error<
- "cannot decompose this type; 'std::tuple_element<%0>::type' "
+ "cannot bind this type; 'std::tuple_element<%0>::type' "
"does not name a type">;
def err_decomp_decl_std_tuple_size_not_constant : Error<
- "cannot decompose this type; 'std::tuple_size<%0>::value' "
+ "cannot bind this type; 'std::tuple_size<%0>::value' "
"is not a valid integral constant expression">;
def err_decomp_decl_std_tuple_size_invalid
- : Error<"cannot decompose this type; 'std::tuple_size<%0>::value' "
+ : Error<"cannot bind this type; 'std::tuple_size<%0>::value' "
"is not a valid size: %1">;
def note_in_binding_decl_init : Note<
"in implicit initialization of binding declaration %0">;
def err_arg_is_not_destructurable : Error<
- "type %0 cannot be decomposed">;
+ "type %0 cannot be bound">;
def err_std_type_trait_not_class_template : Error<
"unsupported standard library implementation: "
@@ -2620,7 +2620,7 @@ def err_auto_variable_cannot_appear_in_own_initializer
"declared with deduced type %2 cannot appear in its own initializer">;
def err_binding_cannot_appear_in_own_initializer : Error<
"binding %0 cannot appear in the initializer of its own "
- "decomposition declaration">;
+ "structured binding declaration">;
def err_new_array_of_auto : Error<
"cannot allocate array of 'auto'">;
def err_auto_not_allowed : Error<
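
For reference, the reworded diagnostics surface like this in ordinary C++17
code; a minimal sketch (the union case exercises
err_decomp_decl_unbindable_type above; identifiers are illustrative):

```cpp
union U { int a; };

void f(U u) {
  auto [x];     // error: structured binding declaration '[x]' requires an
                // initializer (previously "decomposition declaration")
  auto [a] = u; // error: cannot bind union type 'U'
                // (previously "cannot decompose union type 'U'")
}
```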
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 5f70b51..8d6b8a1 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -433,6 +433,7 @@ ENUM_LANGOPT(RegisterStaticDestructors, RegisterStaticDestructorsKind, 2,
LANGOPT(RegCall4, 1, 0, NotCompatible, "Set __regcall4 as a default calling convention to respect __regcall ABI v.4")
LANGOPT(MatrixTypes, 1, 0, NotCompatible, "Enable or disable the builtin matrix type")
+VALUE_LANGOPT(MaxMatrixDimension, 32, (1 << 20) - 1, NotCompatible, "maximum allowed matrix dimension")
LANGOPT(CXXAssumptions, 1, 1, NotCompatible, "Enable or disable codegen and compile-time checks for C++23's [[assume]] attribute")
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index b4c24d7..3ac8987 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -127,6 +127,14 @@ public:
cir::BoolType getBoolTy() { return cir::BoolType::get(getContext()); }
cir::VoidType getVoidTy() { return cir::VoidType::get(getContext()); }
+ cir::IntType getUIntNTy(int n) {
+ return cir::IntType::get(getContext(), n, false);
+ }
+
+ cir::IntType getSIntNTy(int n) {
+ return cir::IntType::get(getContext(), n, true);
+ }
+
cir::PointerType getPointerTo(mlir::Type ty) {
return cir::PointerType::get(ty);
}
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrConstraints.td
index 8f72ff4..2548d46 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrConstraints.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrConstraints.td
@@ -39,13 +39,31 @@ def CIR_AnyIntOrFloatAttr : AnyAttrOf<[CIR_AnyIntAttr, CIR_AnyFPAttr],
}
//===----------------------------------------------------------------------===//
+// Exceptions constraints
+//===----------------------------------------------------------------------===//
+
+def CIR_AnyCatchAllAttr
+ : CIR_AttrConstraint<"::cir::CatchAllAttr", "catch all attribute">;
+
+def CIR_AnyUnwindAttr
+ : CIR_AttrConstraint<"::cir::UnwindAttr", "unwind attribute">;
+
+//===----------------------------------------------------------------------===//
// GlobalViewAttr constraints
//===----------------------------------------------------------------------===//
-def CIR_AnyGlobalViewAttr : CIR_AttrConstraint<"::cir::GlobalViewAttr", "GlobalView attribute">;
+def CIR_AnyGlobalViewAttr
+ : CIR_AttrConstraint<"::cir::GlobalViewAttr", "GlobalView attribute">;
-def CIR_AnyIntOrGlobalViewAttr : AnyAttrOf<[CIR_AnyIntAttr, CIR_AnyGlobalViewAttr],
- "integer or global view attribute"> {
+def CIR_AnyIntOrGlobalViewAttr
+ : AnyAttrOf<[CIR_AnyIntAttr, CIR_AnyGlobalViewAttr],
+ "integer or global view attribute"> {
+ string cppType = "::mlir::TypedAttr";
+}
+
+def CIR_TryHandlerAttr
+ : AnyAttrOf<[CIR_AnyGlobalViewAttr, CIR_AnyCatchAllAttr, CIR_AnyUnwindAttr],
+ "catch all or unwind or global view attribute"> {
string cppType = "::mlir::TypedAttr";
}
@@ -61,4 +79,7 @@ def CIR_IntOrGlobalViewArrayAttr : TypedArrayAttrBase<CIR_AnyIntOrGlobalViewAttr
string cppType = "::mlir::ArrayAttr";
}
-#endif // CLANG_CIR_DIALECT_IR_CIRATTRCONSTRAINTS_TD \ No newline at end of file
+def CIR_TryHandlerArrayAttr : TypedArrayAttrBase<CIR_TryHandlerAttr,
+ "catch all or unwind or global view array attribute">;
+
+#endif // CLANG_CIR_DIALECT_IR_CIRATTRCONSTRAINTS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 610e349..69dbad3 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -968,4 +968,19 @@ def CIR_TypeInfoAttr : CIR_Attr<"TypeInfo", "typeinfo", [TypedAttrInterface]> {
}];
}
+//===----------------------------------------------------------------------===//
+// CatchAllAttr & UnwindAttr
+//===----------------------------------------------------------------------===//
+
+// Represents the catch_all region.
+def CIR_CatchAllAttr : CIR_UnitAttr<"CatchAll", "all"> {
+ let storageType = [{ CatchAllAttr }];
+}
+
+// Represents the unwind region, where unwinding continues or the program
+// calls std::terminate.
+def CIR_UnwindAttr : CIR_UnitAttr<"Unwind", "unwind"> {
+ let storageType = [{ CatchUnwind }];
+}
+
#endif // CLANG_CIR_DIALECT_IR_CIRATTRS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index baab156..3988a6d 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -644,7 +644,7 @@ def CIR_StoreOp : CIR_Op<"store", [
defvar CIR_ReturnableScopes = [
"FuncOp", "ScopeOp", "IfOp", "SwitchOp", "CaseOp",
- "DoWhileOp", "WhileOp", "ForOp"
+ "DoWhileOp", "WhileOp", "ForOp", "TryOp"
];
def CIR_ReturnOp : CIR_Op<"return", [
@@ -791,7 +791,7 @@ def CIR_ConditionOp : CIR_Op<"condition", [
defvar CIR_YieldableScopes = [
"ArrayCtor", "ArrayDtor", "CaseOp", "DoWhileOp", "ForOp", "GlobalOp", "IfOp",
- "ScopeOp", "SwitchOp", "TernaryOp", "WhileOp"
+ "ScopeOp", "SwitchOp", "TernaryOp", "WhileOp", "TryOp"
];
def CIR_YieldOp : CIR_Op<"yield", [
@@ -4045,6 +4045,43 @@ def CIR_ExpectOp : CIR_Op<"expect", [
}
//===----------------------------------------------------------------------===//
+// PtrDiffOp
+//===----------------------------------------------------------------------===//
+
+def CIR_PtrDiffOp : CIR_Op<"ptr_diff", [Pure, SameTypeOperands]> {
+ let summary = "Pointer subtraction arithmetic";
+ let description = [{
+ The cir.ptr_diff operation computes the difference between two pointers that
+ have the same element type.
+
+ The result reflects the ABI-defined size of the pointed-to type. For example,
+ subtracting two !cir.ptr<!u64i> values may yield 1, representing an 8-byte
+ difference. In contrast, for pointers to void or function types, a result of
+ 8 corresponds to an 8-byte difference.
+
+ For pointers to types whose size is not aligned with the target data
+ layout, the size is generally rounded to the next power of 2 bits. For
+ example, subtracting two !cir.ptr<!s24i> values for the _BitInt(24) type may
+ yield 1, representing a 4-byte difference (as opposed to a 3-byte
+ difference).
+
+ Example:
+
+ ```mlir
+ %7 = cir.ptr_diff %0, %1 : !cir.ptr<!u64i> -> !u64i
+ ```
+ }];
+
+ let arguments = (ins CIR_PointerType:$lhs, CIR_PointerType:$rhs);
+ let results = (outs CIR_AnyFundamentalIntType:$result);
+
+ let assemblyFormat = [{
+ $lhs `,` $rhs `:` qualified(type($lhs)) `->` qualified(type($result))
+ attr-dict
+ }];
+}
+
+//===----------------------------------------------------------------------===//
// Floating Point Ops
//===----------------------------------------------------------------------===//
@@ -4326,6 +4363,89 @@ def CIR_AllocExceptionOp : CIR_Op<"alloc.exception"> {
}
//===----------------------------------------------------------------------===//
+// TryOp
+//===----------------------------------------------------------------------===//
+
+def CIR_TryOp : CIR_Op<"try", [
+ DeclareOpInterfaceMethods<RegionBranchOpInterface>,
+ RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments
+]> {
+ let summary = "C++ try block";
+ let description = [{
+ Holds the lexical scope of `try {}`. Note that resources used on catch
+ clauses are usually allocated in the same parent as `cir.try`.
+
+ `synthetic`: use `cir.try` to represent try/catches not originally
+ present in the source code. For example, a synthetic `cir.try` region
+ is created around the constructor call when `operator new` is used
+ so that the memory allocated will be freed if the constructor throws
+ an exception.
+
+ `cleanup`: indicates that there are cleanups that must be performed
+ when exiting the try region via exception, even if the exception is not
+ caught.
+
+ Example:
+
+ ```mlir
+ cir.try {
+ cir.call exception @function() : () -> ()
+ cir.yield
+ } catch [type #cir.global_view<@_ZTIPf> : !cir.ptr<!u8i>] {
+ ...
+ cir.yield
+ } unwind {
+ cir.resume
+ }
+ ```
+ }];
+
+ let arguments = (ins
+ UnitAttr:$synthetic,
+ UnitAttr:$cleanup,
+ CIR_TryHandlerArrayAttr:$handler_types
+ );
+
+ let regions = (region
+ AnyRegion:$try_region,
+ VariadicRegion<MinSizedRegion<1>>:$handler_regions
+ );
+
+ let assemblyFormat = [{
+ (`synthetic` $synthetic^)?
+ (`cleanup` $cleanup^)?
+ $try_region
+ custom<TryHandlerRegions>($handler_regions, $handler_types)
+ attr-dict
+ }];
+
+ let builders = [
+ OpBuilder<(ins
+ "llvm::function_ref<void(mlir::OpBuilder &, "
+ "mlir::Location)>":$tryBuilder,
+ "llvm::function_ref<void(mlir::OpBuilder &, mlir::Location, "
+ "mlir::OperationState &)>":$handlersBuilder),
+ [{
+ assert(tryBuilder && "expected builder callback for 'cir.try' body");
+ assert(handlersBuilder
+ && "expected builder callback for 'handlers' body");
+
+ OpBuilder::InsertionGuard guard($_builder);
+
+ // Try body region
+ mlir::Region *tryBodyRegion = $_state.addRegion();
+
+ // Create try body region and set insertion point
+ $_builder.createBlock(tryBodyRegion);
+ tryBuilder($_builder, $_state.location);
+ handlersBuilder($_builder, $_state.location, $_state);
+ }]>
+ ];
+
+ let hasLLVMLowering = false;
+}
+
+//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
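
For orientation, a hedged sketch of how C++ source might map onto the new
`cir.try` regions, assuming one handler region per catch clause (mangled
names are illustrative):

```cpp
void may_throw();

void f() {
  try {
    may_throw();    // lowered to a call marked as throwing inside $try_region
  } catch (float *) {
    // a handler region keyed by the RTTI global view, e.g.
    // #cir.global_view<@_ZTIPf>, as in the op description above
  } catch (...) {
    // the `catch all` handler region; the custom parser rejects a second
    // catch-all and rejects combining it with an `unwind` region
  }
}
```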
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 4fbae15..de3bc94 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -322,6 +322,7 @@ struct MissingFeatures {
static bool invokeOp() { return false; }
static bool labelOp() { return false; }
static bool ptrDiffOp() { return false; }
+ static bool llvmLoweringPtrDiffConsidersPointee() { return false; }
static bool ptrStrideOp() { return false; }
static bool switchOp() { return false; }
static bool throwOp() { return false; }
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index b83bbcd..ffae3b9 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -76,9 +76,6 @@ def SecurityAlpha : Package<"security">, ParentPackage<Alpha>;
def CERT : Package<"cert">, ParentPackage<Security>;
def ENV : Package<"env">, ParentPackage<CERT>;
-def CERTAlpha : Package<"cert">, ParentPackage<SecurityAlpha>;
-def POSAlpha : Package<"pos">, ParentPackage<CERTAlpha>;
-
def Unix : Package<"unix">;
def UnixAlpha : Package<"unix">, ParentPackage<Alpha>;
def CString : Package<"cstring">, ParentPackage<Unix>;
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index e403b3e..32c8f62 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -4712,8 +4712,8 @@ QualType ASTContext::getConstantMatrixType(QualType ElementTy, unsigned NumRows,
assert(MatrixType::isValidElementType(ElementTy) &&
"need a valid element type");
- assert(ConstantMatrixType::isDimensionValid(NumRows) &&
- ConstantMatrixType::isDimensionValid(NumColumns) &&
+ assert(NumRows > 0 && NumRows <= LangOpts.MaxMatrixDimension &&
+ NumColumns > 0 && NumColumns <= LangOpts.MaxMatrixDimension &&
"need valid matrix dimensions");
void *InsertPos = nullptr;
if (ConstantMatrixType *MTP = MatrixTypes.FindNodeOrInsertPos(ID, InsertPos))
diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp
index 641a3db..19b5576 100644
--- a/clang/lib/Basic/LangOptions.cpp
+++ b/clang/lib/Basic/LangOptions.cpp
@@ -132,8 +132,12 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang,
Opts.NamedLoops = Std.isC2y();
Opts.HLSL = Lang == Language::HLSL;
- if (Opts.HLSL && Opts.IncludeDefaultHeader)
- Includes.push_back("hlsl.h");
+ if (Opts.HLSL) {
+ if (Opts.IncludeDefaultHeader)
+ Includes.push_back("hlsl.h");
+ // Set maximum matrix dimension to 4 for HLSL
+ Opts.MaxMatrixDimension = 4;
+ }
// Set OpenCL Version.
Opts.OpenCL = Std.isOpenCL();
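
The new cap is easiest to see through the matrix extension; a hedged sketch
assuming `-fenable-matrix` and the default limit of (1 << 20) - 1 from
LangOptions.def above:

```cpp
// Within the default per-dimension limit of 1048575: accepted.
using ok_t = float __attribute__((matrix_type(1048575, 2)));

// One past the limit: expected to be rejected now that the check reads
// LangOpts.MaxMatrixDimension, which setLangDefaults lowers to 4 for HLSL.
using bad_t = float __attribute__((matrix_type(1048576, 2)));
```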
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index ef4973c..e71f10c 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -626,6 +626,7 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
case CK_Lunarlake:
case CK_Pantherlake:
case CK_Wildcatlake:
+ case CK_Novalake:
case CK_Sierraforest:
case CK_Grandridge:
case CK_Graniterapids:
@@ -1615,6 +1616,7 @@ std::optional<unsigned> X86TargetInfo::getCPUCacheLineSize() const {
case CK_Lunarlake:
case CK_Pantherlake:
case CK_Wildcatlake:
+ case CK_Novalake:
case CK_Sierraforest:
case CK_Grandridge:
case CK_Graniterapids:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 84acc74..50d585d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -380,6 +380,16 @@ public:
/*relative_layout=*/false);
}
+ mlir::Value createDynCastToVoid(mlir::Location loc, mlir::Value src,
+ bool vtableUseRelativeLayout) {
+ // TODO(cir): consider address space here.
+ assert(!cir::MissingFeatures::addressSpace());
+ cir::PointerType destTy = getVoidPtrTy();
+ return cir::DynamicCastOp::create(
+ *this, loc, destTy, cir::DynamicCastKind::Ptr, src,
+ cir::DynamicCastInfoAttr{}, vtableUseRelativeLayout);
+ }
+
Address createBaseClassAddr(mlir::Location loc, Address addr,
mlir::Type destType, unsigned offset,
bool assumeNotNull) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 4cfa91e..ea31871 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -463,7 +463,9 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
return emitLibraryCall(*this, fd, e,
cgm.getBuiltinLibFunction(fd, builtinID));
- cgm.errorNYI(e->getSourceRange(), "unimplemented builtin call");
+ cgm.errorNYI(e->getSourceRange(),
+ std::string("unimplemented builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
return getUndefRValue(e->getType());
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 637f9ef..138082b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -1734,9 +1734,9 @@ mlir::Value ScalarExprEmitter::emitSub(const BinOpInfo &ops) {
// LLVM we shall take VLA's, division by element size, etc.
//
// See more in `EmitSub` in CGExprScalar.cpp.
- assert(!cir::MissingFeatures::ptrDiffOp());
- cgf.cgm.errorNYI("ptrdiff");
- return {};
+ assert(!cir::MissingFeatures::llvmLoweringPtrDiffConsidersPointee());
+ return cir::PtrDiffOp::create(builder, cgf.getLoc(ops.loc), cgf.PtrDiffTy,
+ ops.lhs, ops.rhs);
}
mlir::Value ScalarExprEmitter::emitShl(const BinOpInfo &ops) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 0d64c31..3c36f5c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1313,10 +1313,10 @@ public:
mlir::Value emitCXXNewExpr(const CXXNewExpr *e);
- void emitNewArrayInitializer(const CXXNewExpr *E, QualType ElementType,
- mlir::Type ElementTy, Address BeginPtr,
- mlir::Value NumElements,
- mlir::Value AllocSizeWithoutCookie);
+ void emitNewArrayInitializer(const CXXNewExpr *e, QualType elementType,
+ mlir::Type elementTy, Address beginPtr,
+ mlir::Value numElements,
+ mlir::Value allocSizeWithoutCookie);
RValue emitCXXOperatorMemberCallExpr(const CXXOperatorCallExpr *e,
const CXXMethodDecl *md,
diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
index 1b85a53..d54d2e9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
@@ -1945,6 +1945,15 @@ static cir::FuncOp getItaniumDynamicCastFn(CIRGenFunction &cgf) {
return cgf.cgm.createRuntimeFunction(FTy, "__dynamic_cast");
}
+static Address emitDynamicCastToVoid(CIRGenFunction &cgf, mlir::Location loc,
+ QualType srcRecordTy, Address src) {
+ bool vtableUsesRelativeLayout =
+ cgf.cgm.getItaniumVTableContext().isRelativeLayout();
+ mlir::Value ptr = cgf.getBuilder().createDynCastToVoid(
+ loc, src.getPointer(), vtableUsesRelativeLayout);
+ return Address{ptr, src.getAlignment()};
+}
+
static cir::DynamicCastInfoAttr emitDynamicCastInfo(CIRGenFunction &cgf,
mlir::Location loc,
QualType srcRecordTy,
@@ -1979,10 +1988,8 @@ mlir::Value CIRGenItaniumCXXABI::emitDynamicCast(CIRGenFunction &cgf,
bool isCastToVoid = destRecordTy.isNull();
assert((!isCastToVoid || !isRefCast) && "cannot cast to void reference");
- if (isCastToVoid) {
- cgm.errorNYI(loc, "emitDynamicCastToVoid");
- return {};
- }
+ if (isCastToVoid)
+ return emitDynamicCastToVoid(cgf, loc, srcRecordTy, src).getPointer();
// If the destination is effectively final, the cast succeeds if and only
// if the dynamic type of the pointer is exactly the destination type.
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 7af3dc1..0712de2 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -2915,6 +2915,130 @@ LogicalResult cir::TypeInfoAttr::verify(
}
//===----------------------------------------------------------------------===//
+// TryOp
+//===----------------------------------------------------------------------===//
+
+void cir::TryOp::getSuccessorRegions(
+ mlir::RegionBranchPoint point,
+ llvm::SmallVectorImpl<mlir::RegionSuccessor> &regions) {
+ // The `try` and the `catchers` region branch back to the parent operation.
+ if (!point.isParent()) {
+ regions.push_back(mlir::RegionSuccessor());
+ return;
+ }
+
+ regions.push_back(mlir::RegionSuccessor(&getTryRegion()));
+
+ // TODO(CIR): If we know a target function never throws a specific type, we
+ // can remove the catch handler.
+ for (mlir::Region &handlerRegion : this->getHandlerRegions())
+ regions.push_back(mlir::RegionSuccessor(&handlerRegion));
+}
+
+static void
+printTryHandlerRegions(mlir::OpAsmPrinter &printer, cir::TryOp op,
+ mlir::MutableArrayRef<mlir::Region> handlerRegions,
+ mlir::ArrayAttr handlerTypes) {
+ if (!handlerTypes)
+ return;
+
+ for (const auto [typeIdx, typeAttr] : llvm::enumerate(handlerTypes)) {
+ if (typeIdx)
+ printer << " ";
+
+ if (mlir::isa<cir::CatchAllAttr>(typeAttr)) {
+ printer << "catch all ";
+ } else if (mlir::isa<cir::UnwindAttr>(typeAttr)) {
+ printer << "unwind ";
+ } else {
+ printer << "catch [type ";
+ printer.printAttribute(typeAttr);
+ printer << "] ";
+ }
+
+ printer.printRegion(handlerRegions[typeIdx],
+ /*printEntryBlockArgs=*/false,
+ /*printBlockTerminators=*/true);
+ }
+}
+
+static mlir::ParseResult parseTryHandlerRegions(
+ mlir::OpAsmParser &parser,
+ llvm::SmallVectorImpl<std::unique_ptr<mlir::Region>> &handlerRegions,
+ mlir::ArrayAttr &handlerTypes) {
+
+ auto parseCheckedCatcherRegion = [&]() -> mlir::ParseResult {
+ handlerRegions.emplace_back(new mlir::Region);
+
+ mlir::Region &currRegion = *handlerRegions.back();
+ mlir::SMLoc regionLoc = parser.getCurrentLocation();
+ if (parser.parseRegion(currRegion)) {
+ handlerRegions.clear();
+ return failure();
+ }
+
+ if (!currRegion.empty() && !(currRegion.back().mightHaveTerminator() &&
+ currRegion.back().getTerminator()))
+ return parser.emitError(
+ regionLoc, "blocks are expected to be explicitly terminated");
+
+ return success();
+ };
+
+ bool hasCatchAll = false;
+ llvm::SmallVector<mlir::Attribute, 4> catcherAttrs;
+ while (parser.parseOptionalKeyword("catch").succeeded()) {
+ bool hasLSquare = parser.parseOptionalLSquare().succeeded();
+
+ llvm::StringRef attrStr;
+ if (parser.parseOptionalKeyword(&attrStr, {"all", "type"}).failed())
+ return parser.emitError(parser.getCurrentLocation(),
+ "expected 'all' or 'type' keyword");
+
+ bool isCatchAll = attrStr == "all";
+ if (isCatchAll) {
+ if (hasCatchAll)
+ return parser.emitError(parser.getCurrentLocation(),
+ "can't have more than one catch all");
+ hasCatchAll = true;
+ }
+
+ mlir::Attribute exceptionRTTIAttr;
+ if (!isCatchAll && parser.parseAttribute(exceptionRTTIAttr).failed())
+ return parser.emitError(parser.getCurrentLocation(),
+ "expected valid RTTI info attribute");
+
+ catcherAttrs.push_back(isCatchAll
+ ? cir::CatchAllAttr::get(parser.getContext())
+ : exceptionRTTIAttr);
+
+ if (hasLSquare && isCatchAll)
+ return parser.emitError(parser.getCurrentLocation(),
+ "catch all dosen't need RTTI info attribute");
+
+ if (hasLSquare && parser.parseRSquare().failed())
+ return parser.emitError(parser.getCurrentLocation(),
+ "expected `]` after RTTI info attribute");
+
+ if (parseCheckedCatcherRegion().failed())
+ return mlir::failure();
+ }
+
+ if (parser.parseOptionalKeyword("unwind").succeeded()) {
+ if (hasCatchAll)
+ return parser.emitError(parser.getCurrentLocation(),
+ "unwind can't be used with catch all");
+
+ catcherAttrs.push_back(cir::UnwindAttr::get(parser.getContext()));
+ if (parseCheckedCatcherRegion().failed())
+ return mlir::failure();
+ }
+
+ handlerTypes = parser.getBuilder().getArrayAttr(catcherAttrs);
+ return mlir::success();
+}
+
+//===----------------------------------------------------------------------===//
// TableGen'd op method definitions
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp
index 7d3c711..11ce2a8 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp
@@ -92,7 +92,53 @@ static mlir::Value
buildDynamicCastToVoidAfterNullCheck(cir::CIRBaseBuilderTy &builder,
clang::ASTContext &astCtx,
cir::DynamicCastOp op) {
- llvm_unreachable("dynamic cast to void is NYI");
+ mlir::Location loc = op.getLoc();
+ bool vtableUsesRelativeLayout = op.getRelativeLayout();
+
+ // TODO(cir): consider address space in this function.
+ assert(!cir::MissingFeatures::addressSpace());
+
+ mlir::Type vtableElemTy;
+ uint64_t vtableElemAlign;
+ if (vtableUsesRelativeLayout) {
+ vtableElemTy = builder.getSIntNTy(32);
+ vtableElemAlign = 4;
+ } else {
+ const auto &targetInfo = astCtx.getTargetInfo();
+ auto ptrdiffTy = targetInfo.getPtrDiffType(clang::LangAS::Default);
+ bool ptrdiffTyIsSigned = clang::TargetInfo::isTypeSigned(ptrdiffTy);
+ uint64_t ptrdiffTyWidth = targetInfo.getTypeWidth(ptrdiffTy);
+
+ vtableElemTy = cir::IntType::get(builder.getContext(), ptrdiffTyWidth,
+ ptrdiffTyIsSigned);
+ vtableElemAlign =
+ llvm::divideCeil(targetInfo.getPointerAlign(clang::LangAS::Default), 8);
+ }
+
+ // Access vtable to get the offset from the given object to its containing
+ // complete object.
+ // TODO: Add a specialized operation to get the object offset?
+ auto vptrTy = cir::VPtrType::get(builder.getContext());
+ cir::PointerType vptrPtrTy = builder.getPointerTo(vptrTy);
+ auto vptrPtr =
+ cir::VTableGetVPtrOp::create(builder, loc, vptrPtrTy, op.getSrc());
+ mlir::Value vptr = builder.createLoad(loc, vptrPtr);
+ mlir::Value elementPtr =
+ builder.createBitcast(vptr, builder.getPointerTo(vtableElemTy));
+ mlir::Value minusTwo = builder.getSignedInt(loc, -2, 64);
+ auto offsetToTopSlotPtr = cir::PtrStrideOp::create(
+ builder, loc, builder.getPointerTo(vtableElemTy), elementPtr, minusTwo);
+ mlir::Value offsetToTop =
+ builder.createAlignedLoad(loc, offsetToTopSlotPtr, vtableElemAlign);
+
+ // Add the offset to the given pointer to get the cast result.
+ // Cast the input pointer to a uint8_t* to allow pointer arithmetic.
+ cir::PointerType u8PtrTy = builder.getPointerTo(builder.getUIntNTy(8));
+ mlir::Value srcBytePtr = builder.createBitcast(op.getSrc(), u8PtrTy);
+ auto dstBytePtr =
+ cir::PtrStrideOp::create(builder, loc, u8PtrTy, srcBytePtr, offsetToTop);
+ // Cast the result to a void*.
+ return builder.createBitcast(dstBytePtr, builder.getVoidPtrTy());
}
mlir::Value
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index e61b65f..1fc98ec 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1499,6 +1499,54 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
return mlir::success();
}
+static uint64_t getTypeSize(mlir::Type type, mlir::Operation &op) {
+ mlir::DataLayout layout(op.getParentOfType<mlir::ModuleOp>());
+ // For LLVM purposes we treat void as u8.
+ if (isa<cir::VoidType>(type))
+ type = cir::IntType::get(type.getContext(), 8, /*isSigned=*/false);
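+ // The data layout reports sizes in bits; round up to whole bytes.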
+ return llvm::divideCeil(layout.getTypeSizeInBits(type), 8);
+}
+
+mlir::LogicalResult CIRToLLVMPtrDiffOpLowering::matchAndRewrite(
+ cir::PtrDiffOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ auto dstTy = mlir::cast<cir::IntType>(op.getType());
+ mlir::Type llvmDstTy = getTypeConverter()->convertType(dstTy);
+
+ auto lhs = rewriter.create<mlir::LLVM::PtrToIntOp>(op.getLoc(), llvmDstTy,
+ adaptor.getLhs());
+ auto rhs = rewriter.create<mlir::LLVM::PtrToIntOp>(op.getLoc(), llvmDstTy,
+ adaptor.getRhs());
+
+ auto diff =
+ rewriter.create<mlir::LLVM::SubOp>(op.getLoc(), llvmDstTy, lhs, rhs);
+
+ cir::PointerType ptrTy = op.getLhs().getType();
+ assert(!cir::MissingFeatures::llvmLoweringPtrDiffConsidersPointee());
+ uint64_t typeSize = getTypeSize(ptrTy.getPointee(), *op);
+
+ // Avoid silly division by 1.
+ mlir::Value resultVal = diff.getResult();
+ if (typeSize != 1) {
+ auto typeSizeVal = rewriter.create<mlir::LLVM::ConstantOp>(
+ op.getLoc(), llvmDstTy, typeSize);
+
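+ // The byte difference between pointers into the same object is a multiple
+ // of the element size, so an exact division is legal and folds better.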
+ if (dstTy.isUnsigned()) {
+ auto uDiv =
+ rewriter.create<mlir::LLVM::UDivOp>(op.getLoc(), diff, typeSizeVal);
+ uDiv.setIsExact(true);
+ resultVal = uDiv.getResult();
+ } else {
+ auto sDiv =
+ rewriter.create<mlir::LLVM::SDivOp>(op.getLoc(), diff, typeSizeVal);
+ sDiv.setIsExact(true);
+ resultVal = sDiv.getResult();
+ }
+ }
+ rewriter.replaceOp(op, resultVal);
+ return mlir::success();
+}
+
mlir::LogicalResult CIRToLLVMExpectOpLowering::matchAndRewrite(
cir::ExpectOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9ee810c..92dba32 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4283,15 +4283,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
CharUnits Align = CGM.getNaturalTypeAlignment(
E->getType()->getAs<VectorType>()->getElementType(), nullptr);
- llvm::Value *AlignVal =
- llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
llvm::Value *Result;
if (BuiltinID == Builtin::BI__builtin_masked_load) {
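+ // Let IRBuilder emit llvm.masked.load with the proper signature and
+ // alignment operand.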
- Function *F =
- CGM.getIntrinsic(Intrinsic::masked_load, {RetTy, Ptr->getType()});
- Result =
- Builder.CreateCall(F, {Ptr, AlignVal, Mask, PassThru}, "masked_load");
+ Result = Builder.CreateMaskedLoad(RetTy, Ptr, Align.getAsAlign(), Mask,
+ PassThru, "masked_load");
} else {
Function *F = CGM.getIntrinsic(Intrinsic::masked_expandload, {RetTy});
Result =
@@ -4307,8 +4303,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
llvm::Type *RetTy = CGM.getTypes().ConvertType(E->getType());
CharUnits Align = CGM.getNaturalTypeAlignment(
E->getType()->getAs<VectorType>()->getElementType(), nullptr);
- llvm::Value *AlignVal =
- llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
llvm::Value *PassThru = llvm::PoisonValue::get(RetTy);
if (E->getNumArgs() > 3)
@@ -4318,12 +4312,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
E->getType()->getAs<VectorType>()->getElementType());
llvm::Value *PtrVec = Builder.CreateGEP(ElemTy, Ptr, Idx);
- llvm::Value *Result;
- Function *F =
- CGM.getIntrinsic(Intrinsic::masked_gather, {RetTy, PtrVec->getType()});
-
- Result = Builder.CreateCall(F, {PtrVec, AlignVal, Mask, PassThru},
- "masked_gather");
+ llvm::Value *Result = Builder.CreateMaskedGather(
+ RetTy, PtrVec, Align.getAsAlign(), Mask, PassThru, "masked_gather");
return RValue::get(Result);
}
case Builtin::BI__builtin_masked_store:
@@ -4338,13 +4328,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
CharUnits Align = CGM.getNaturalTypeAlignment(
E->getArg(1)->getType()->getAs<VectorType>()->getElementType(),
nullptr);
- llvm::Value *AlignVal =
- llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
if (BuiltinID == Builtin::BI__builtin_masked_store) {
- llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::masked_store,
- {ValLLTy, Ptr->getType()});
- Builder.CreateCall(F, {Val, Ptr, AlignVal, Mask});
+ Builder.CreateMaskedStore(Val, Ptr, Align.getAsAlign(), Mask);
} else {
llvm::Function *F =
CGM.getIntrinsic(llvm::Intrinsic::masked_compressstore, {ValLLTy});
@@ -4361,17 +4347,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
CharUnits Align = CGM.getNaturalTypeAlignment(
E->getArg(2)->getType()->getAs<VectorType>()->getElementType(),
nullptr);
- llvm::Value *AlignVal =
- llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
llvm::Type *ElemTy = CGM.getTypes().ConvertType(
E->getArg(1)->getType()->getAs<VectorType>()->getElementType());
llvm::Value *PtrVec = Builder.CreateGEP(ElemTy, Ptr, Idx);
- Function *F = CGM.getIntrinsic(Intrinsic::masked_scatter,
- {Val->getType(), PtrVec->getType()});
-
- Builder.CreateCall(F, {Val, PtrVec, AlignVal, Mask});
+ Builder.CreateMaskedScatter(Val, PtrVec, Align.getAsAlign(), Mask);
return RValue();
}
case Builtin::BI__builtin_isinf_sign: {
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
index fb65bf9..3681cca 100644
--- a/clang/lib/Headers/avx512dqintrin.h
+++ b/clang/lib/Headers/avx512dqintrin.h
@@ -1083,17 +1083,15 @@ _mm512_broadcast_f32x2(__m128 __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x2(__A),
(__v16sf)__O);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x2(__A),
(__v16sf)_mm512_setzero_ps());
@@ -1106,17 +1104,15 @@ _mm512_broadcast_f32x8(__m256 __A) {
0, 1, 2, 3, 4, 5, 6, 7);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x8(__A),
(__v16sf)__O);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x8(__A),
(__v16sf)_mm512_setzero_ps());
@@ -1128,17 +1124,15 @@ _mm512_broadcast_f64x2(__m128d __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
-{
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) {
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
(__v8df)_mm512_broadcast_f64x2(__A),
(__v8df)__O);
}
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
-{
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
(__v8df)_mm512_broadcast_f64x2(__A),
(__v8df)_mm512_setzero_pd());
@@ -1151,17 +1145,15 @@ _mm512_broadcast_i32x2(__m128i __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x2(__A),
(__v16si)__O);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x2(__A),
(__v16si)_mm512_setzero_si512());
@@ -1174,17 +1166,15 @@ _mm512_broadcast_i32x8(__m256i __A) {
0, 1, 2, 3, 4, 5, 6, 7);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x8(__A),
(__v16si)__O);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x8(__A),
(__v16si)_mm512_setzero_si512());
@@ -1196,17 +1186,15 @@ _mm512_broadcast_i64x2(__m128i __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
(__v8di)_mm512_broadcast_i64x2(__A),
(__v8di)__O);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
(__v8di)_mm512_broadcast_i64x2(__A),
(__v8di)_mm512_setzero_si512());
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 80e5842..07de036 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -225,17 +225,15 @@ _mm512_broadcastd_epi32(__m128i __A) {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512(__M,
(__v16si) _mm512_broadcastd_epi32(__A),
(__v16si) __O);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512(__M,
(__v16si) _mm512_broadcastd_epi32(__A),
(__v16si) _mm512_setzero_si512());
@@ -247,18 +245,14 @@ _mm512_broadcastq_epi64(__m128i __A) {
0, 0, 0, 0, 0, 0, 0, 0);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di) _mm512_broadcastq_epi64(__A),
- (__v8di) __O);
-
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) {
+ return (__m512i)__builtin_ia32_selectq_512(
+ __M, (__v8di)_mm512_broadcastq_epi64(__A), (__v8di)__O);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di) _mm512_broadcastq_epi64(__A),
(__v8di) _mm512_setzero_si512());
@@ -321,9 +315,8 @@ _mm512_set1_epi32(int __s)
__s, __s, __s, __s, __s, __s, __s, __s };
}
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
-{
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_set1_epi32(__mmask16 __M, int __A) {
return (__m512i)__builtin_ia32_selectd_512(__M,
(__v16si)_mm512_set1_epi32(__A),
(__v16si)_mm512_setzero_si512());
@@ -335,9 +328,8 @@ _mm512_set1_epi64(long long __d)
return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
-{
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) {
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_set1_epi64(__A),
(__v8di)_mm512_setzero_si512());
@@ -6552,17 +6544,15 @@ _mm512_broadcast_f32x4(__m128 __A) {
0, 1, 2, 3, 0, 1, 2, 3);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x4(__A),
(__v16sf)__O);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x4(__A),
(__v16sf)_mm512_setzero_ps());
@@ -6597,17 +6587,15 @@ _mm512_broadcast_i32x4(__m128i __A) {
0, 1, 2, 3, 0, 1, 2, 3);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x4(__A),
(__v16si)__O);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x4(__A),
(__v16si)_mm512_setzero_si512());
@@ -6635,33 +6623,29 @@ _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
(__v8di)_mm512_setzero_si512());
}
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
-{
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) {
return (__m512d)__builtin_ia32_selectpd_512(__M,
(__v8df) _mm512_broadcastsd_pd(__A),
(__v8df) __O);
}
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
-{
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) {
return (__m512d)__builtin_ia32_selectpd_512(__M,
(__v8df) _mm512_broadcastsd_pd(__A),
(__v8df) _mm512_setzero_pd());
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512(__M,
(__v16sf) _mm512_broadcastss_ps(__A),
(__v16sf) __O);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512(__M,
(__v16sf) _mm512_broadcastss_ps(__A),
(__v16sf) _mm512_setzero_ps());
@@ -8381,17 +8365,15 @@ _mm512_movehdup_ps (__m512 __A)
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_movehdup_ps(__A),
(__v16sf)__W);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_movehdup_ps(__A),
(__v16sf)_mm512_setzero_ps());
@@ -8404,44 +8386,38 @@ _mm512_moveldup_ps (__m512 __A)
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_moveldup_ps(__A),
(__v16sf)__W);
}
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_moveldup_ps(__A),
(__v16sf)_mm512_setzero_ps());
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_move_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_move_ss(__mmask8 __U, __m128 __A, __m128 __B) {
return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
_mm_setzero_ps());
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_move_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_move_sd(__mmask8 __U, __m128d __A, __m128d __B) {
return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
_mm_setzero_pd());
}
@@ -8884,17 +8860,15 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
}
#endif
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_set1_epi32(__m512i __O, __mmask16 __M, int __A) {
return (__m512i) __builtin_ia32_selectd_512(__M,
(__v16si) _mm512_set1_epi32(__A),
(__v16si) __O);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_set1_epi64(__m512i __O, __mmask8 __M, long long __A) {
return (__m512i) __builtin_ia32_selectq_512(__M,
(__v8di) _mm512_set1_epi64(__A),
(__v8di) __O);
diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h
index 68bd52e..ee7974e 100644
--- a/clang/lib/Headers/avx512vldqintrin.h
+++ b/clang/lib/Headers/avx512vldqintrin.h
@@ -968,17 +968,15 @@ _mm256_broadcast_f32x2(__m128 __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
(__v8sf)_mm256_broadcast_f32x2(__A),
(__v8sf)__O);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
(__v8sf)_mm256_broadcast_f32x2(__A),
(__v8sf)_mm256_setzero_ps());
@@ -990,17 +988,15 @@ _mm256_broadcast_f64x2(__m128d __A) {
0, 1, 0, 1);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) {
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
(__v4df)_mm256_broadcast_f64x2(__A),
(__v4df)__O);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
(__v4df)_mm256_broadcast_f64x2(__A),
(__v4df)_mm256_setzero_pd());
@@ -1012,17 +1008,15 @@ _mm_broadcast_i32x2(__m128i __A) {
0, 1, 0, 1);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
(__v4si)_mm_broadcast_i32x2(__A),
(__v4si)__O);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
(__v4si)_mm_broadcast_i32x2(__A),
(__v4si)_mm_setzero_si128());
@@ -1034,17 +1028,15 @@ _mm256_broadcast_i32x2(__m128i __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
(__v8si)_mm256_broadcast_i32x2(__A),
(__v8si)__O);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
(__v8si)_mm256_broadcast_i32x2(__A),
(__v8si)_mm256_setzero_si256());
@@ -1056,17 +1048,15 @@ _mm256_broadcast_i64x2(__m128i __A) {
0, 1, 0, 1);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
(__v4di)_mm256_broadcast_i64x2(__A),
(__v4di)__O);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
(__v4di)_mm256_broadcast_i64x2(__A),
(__v4di)_mm256_setzero_si256());
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 965741f..676b5a0 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -5101,69 +5101,55 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
(__v4df)_mm256_setzero_pd());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A)
-{
- return (__m128i)__builtin_ia32_selectd_128(__M,
- (__v4si) _mm_set1_epi32(__A),
- (__v4si)__O);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) {
+ return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si)_mm_set1_epi32(__A),
+ (__v4si)__O);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi32( __mmask8 __M, int __A)
-{
- return (__m128i)__builtin_ia32_selectd_128(__M,
- (__v4si) _mm_set1_epi32(__A),
- (__v4si)_mm_setzero_si128());
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_set1_epi32(__mmask8 __M, int __A) {
+ return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si)_mm_set1_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A)
-{
- return (__m256i)__builtin_ia32_selectd_256(__M,
- (__v8si) _mm256_set1_epi32(__A),
- (__v8si)__O);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __M, (__v8si)_mm256_set1_epi32(__A), (__v8si)__O);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi32( __mmask8 __M, int __A)
-{
- return (__m256i)__builtin_ia32_selectd_256(__M,
- (__v8si) _mm256_set1_epi32(__A),
- (__v8si)_mm256_setzero_si256());
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_set1_epi32(__mmask8 __M, int __A) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __M, (__v8si)_mm256_set1_epi32(__A), (__v8si)_mm256_setzero_si256());
}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
return (__m128i) __builtin_ia32_selectq_128(__M,
(__v2di) _mm_set1_epi64x(__A),
(__v2di) __O);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
return (__m128i) __builtin_ia32_selectq_128(__M,
(__v2di) _mm_set1_epi64x(__A),
(__v2di) _mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) {
return (__m256i) __builtin_ia32_selectq_256(__M,
(__v4di) _mm256_set1_epi64x(__A),
(__v4di) __O) ;
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
-{
- return (__m256i) __builtin_ia32_selectq_256(__M,
- (__v4di) _mm256_set1_epi64x(__A),
- (__v4di) _mm256_setzero_si256());
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
+ return (__m256i)__builtin_ia32_selectq_256(
+ __M, (__v4di)_mm256_set1_epi64x(__A), (__v4di)_mm256_setzero_si256());
}
#define _mm_fixupimm_pd(A, B, C, imm) \
@@ -5610,130 +5596,113 @@ _mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
(__mmask8) __U);
}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
(__v2df)_mm_unpackhi_pd(__A, __B),
(__v2df)__W);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
(__v2df)_mm_unpackhi_pd(__A, __B),
(__v2df)_mm_setzero_pd());
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
(__v4df)_mm256_unpackhi_pd(__A, __B),
(__v4df)__W);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) {
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
(__v4df)_mm256_unpackhi_pd(__A, __B),
(__v4df)_mm256_setzero_pd());
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_unpackhi_ps(__A, __B),
(__v4sf)__W);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_unpackhi_ps(__A, __B),
(__v4sf)_mm_setzero_ps());
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_unpackhi_ps(__A, __B),
(__v8sf)__W);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_unpackhi_ps(__A, __B),
(__v8sf)_mm256_setzero_ps());
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
(__v2df)_mm_unpacklo_pd(__A, __B),
(__v2df)__W);
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
(__v2df)_mm_unpacklo_pd(__A, __B),
(__v2df)_mm_setzero_pd());
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
(__v4df)_mm256_unpacklo_pd(__A, __B),
(__v4df)__W);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) {
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
(__v4df)_mm256_unpacklo_pd(__A, __B),
(__v4df)_mm256_setzero_pd());
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_unpacklo_ps(__A, __B),
(__v4sf)__W);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_unpacklo_ps(__A, __B),
(__v4sf)_mm_setzero_ps());
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_unpacklo_ps(__A, __B),
(__v8sf)__W);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_unpacklo_ps(__A, __B),
(__v8sf)_mm256_setzero_ps());
@@ -6055,129 +6024,117 @@ _mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
_mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_unpackhi_epi32(__A, __B),
(__v4si)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_unpackhi_epi32(__A, __B),
(__v4si)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_unpackhi_epi32(__A, __B),
(__v8si)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_unpackhi_epi32(__A, __B),
(__v8si)_mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_unpackhi_epi64(__A, __B),
(__v2di)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_unpackhi_epi64(__A, __B),
(__v2di)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B) {
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_unpackhi_epi64(__A, __B),
(__v4di)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_unpackhi_epi64(__A, __B),
(__v4di)_mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_unpacklo_epi32(__A, __B),
(__v4si)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_unpacklo_epi32(__A, __B),
(__v4si)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_unpacklo_epi32(__A, __B),
(__v8si)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_unpacklo_epi32(__A, __B),
(__v8si)_mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_unpacklo_epi64(__A, __B),
(__v2di)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_unpacklo_epi64(__A, __B),
(__v2di)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B) {
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_unpacklo_epi64(__A, __B),
(__v4di)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_unpacklo_epi64(__A, __B),
(__v4di)_mm256_setzero_si256());
@@ -6594,17 +6551,15 @@ _mm256_broadcast_f32x4(__m128 __A) {
0, 1, 2, 3, 0, 1, 2, 3);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
(__v8sf)_mm256_broadcast_f32x4(__A),
(__v8sf)__O);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
(__v8sf)_mm256_broadcast_f32x4(__A),
(__v8sf)_mm256_setzero_ps());
@@ -6616,129 +6571,113 @@ _mm256_broadcast_i32x4(__m128i __A) {
0, 1, 2, 3, 0, 1, 2, 3);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
(__v8si)_mm256_broadcast_i32x4(__A),
(__v8si)__O);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
(__v8si)_mm256_broadcast_i32x4(__A),
(__v8si)_mm256_setzero_si256());
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) {
return (__m256d)__builtin_ia32_selectpd_256(__M,
(__v4df) _mm256_broadcastsd_pd(__A),
(__v4df) __O);
}
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) {
return (__m256d)__builtin_ia32_selectpd_256(__M,
(__v4df) _mm256_broadcastsd_pd(__A),
(__v4df) _mm256_setzero_pd());
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A) {
return (__m128)__builtin_ia32_selectps_128(__M,
(__v4sf) _mm_broadcastss_ps(__A),
(__v4sf) __O);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) {
return (__m128)__builtin_ia32_selectps_128(__M,
(__v4sf) _mm_broadcastss_ps(__A),
(__v4sf) _mm_setzero_ps());
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A) {
return (__m256)__builtin_ia32_selectps_256(__M,
(__v8sf) _mm256_broadcastss_ps(__A),
(__v8sf) __O);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) {
return (__m256)__builtin_ia32_selectps_256(__M,
(__v8sf) _mm256_broadcastss_ps(__A),
(__v8sf) _mm256_setzero_ps());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
return (__m128i)__builtin_ia32_selectd_128(__M,
(__v4si) _mm_broadcastd_epi32(__A),
(__v4si) __O);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) {
return (__m128i)__builtin_ia32_selectd_128(__M,
(__v4si) _mm_broadcastd_epi32(__A),
(__v4si) _mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectd_256(__M,
(__v8si) _mm256_broadcastd_epi32(__A),
(__v8si) __O);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectd_256(__M,
(__v8si) _mm256_broadcastd_epi32(__A),
(__v8si) _mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, __m128i __A) {
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di) _mm_broadcastq_epi64(__A),
(__v2di) __O);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di) _mm_broadcastq_epi64(__A),
(__v2di) _mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di) _mm256_broadcastq_epi64(__A),
(__v4di) __O);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di) _mm256_broadcastq_epi64(__A),
(__v4di) _mm256_setzero_si256());
@@ -8003,65 +7942,57 @@ _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
(__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
(__v4di)_mm256_setzero_si256()))
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_movehdup_ps(__A),
(__v4sf)__W);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_movehdup_ps(__A),
(__v4sf)_mm_setzero_ps());
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_movehdup_ps(__A),
(__v8sf)__W);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_movehdup_ps(__A),
(__v8sf)_mm256_setzero_ps());
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_moveldup_ps(__A),
(__v4sf)__W);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_moveldup_ps(__A),
(__v4sf)_mm_setzero_ps());
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_moveldup_ps(__A),
(__v8sf)__W);
}
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_moveldup_ps(__A),
(__v8sf)_mm256_setzero_ps());
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index e118dda..f28a037 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -159,7 +159,8 @@ void HLSLExternalSemaSource::defineHLSLMatrixAlias() {
SourceLocation(), ColsParam));
TemplateParams.emplace_back(ColsParam);
- const unsigned MaxMatDim = 4;
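+ // Take the bound from the configurable language option instead of
+ // hard-coding it to 4.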
+ const unsigned MaxMatDim = SemaPtr->getLangOpts().MaxMatrixDimension;
+
auto *MaxRow = IntegerLiteral::Create(
AST, llvm::APInt(AST.getIntWidth(AST.IntTy), MaxMatDim), AST.IntTy,
SourceLocation());
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index ddf17d8..5360f8a 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -2927,6 +2927,8 @@ bool CastOperation::CheckHLSLCStyleCast(CheckedConversionKind CCK) {
SrcExpr = Self.ImpCastExprToType(
SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy),
CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK);
+ else
+ SrcExpr = Self.DefaultLvalueConversion(SrcExpr.get());
Kind = CK_HLSLElementwiseCast;
return true;
}
@@ -2935,6 +2937,7 @@ bool CastOperation::CheckHLSLCStyleCast(CheckedConversionKind CCK) {
// If the relative order of this and the HLSLElementWise cast checks
// are changed, it might change which cast handles what in a few cases
if (Self.HLSL().CanPerformAggregateSplatCast(SrcExpr.get(), DestType)) {
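+ // Perform lvalue-to-rvalue conversion first so the splat consumes a prvalue.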
+ SrcExpr = Self.DefaultLvalueConversion(SrcExpr.get());
const VectorType *VT = SrcTy->getAs<VectorType>();
// change splat from vec1 case to splat from scalar
if (VT && VT->getNumElements() == 1)
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 4f409ca..652527a 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -16239,9 +16239,9 @@ getAndVerifyMatrixDimension(Expr *Expr, StringRef Name, Sema &S) {
return {};
}
uint64_t Dim = Value->getZExtValue();
- if (!ConstantMatrixType::isDimensionValid(Dim)) {
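+ // Validate against the configurable upper bound rather than the fixed
+ // ConstantMatrixType limit.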
+ if (Dim == 0 || Dim > S.Context.getLangOpts().MaxMatrixDimension) {
S.Diag(Expr->getBeginLoc(), diag::err_builtin_matrix_invalid_dimension)
- << Name << ConstantMatrixType::getMaxElementsPerDimension();
+ << Name << S.Context.getLangOpts().MaxMatrixDimension;
return {};
}
return Dim;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 468bc1d..4ace03d 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -765,10 +765,18 @@ static bool isRelevantAttr(Sema &S, const Decl *D, const Attr *A) {
static void instantiateDependentHLSLParamModifierAttr(
Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
- const HLSLParamModifierAttr *Attr, Decl *New) {
- ParmVarDecl *P = cast<ParmVarDecl>(New);
- P->addAttr(Attr->clone(S.getASTContext()));
- P->setType(S.HLSL().getInoutParameterType(P->getType()));
+ const HLSLParamModifierAttr *Attr, const Decl *Old, Decl *New) {
+ ParmVarDecl *NewParm = cast<ParmVarDecl>(New);
+ NewParm->addAttr(Attr->clone(S.getASTContext()));
+
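+ // Only rewrap the type when it was dependent in the template; non-dependent
+ // out/inout parameters already received their reference type when first
+ // processed.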
+ const Type *OldParmTy = cast<ParmVarDecl>(Old)->getType().getTypePtr();
+ if (OldParmTy->isDependentType() && Attr->isAnyOut())
+ NewParm->setType(S.HLSL().getInoutParameterType(NewParm->getType()));
+
+ assert(
+ (!Attr->isAnyOut() || (NewParm->getType().isRestrictQualified() &&
+ NewParm->getType()->isReferenceType())) &&
+ "out or inout parameter type must be a reference and restrict qualified");
}
void Sema::InstantiateAttrsForDecl(
@@ -923,7 +931,7 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
- New);
+ Tmpl, New);
continue;
}
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 638904d..7c1fb12 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -2517,12 +2517,18 @@ QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols,
Diag(AttrLoc, diag::err_attribute_zero_size) << "matrix" << ColRange;
return QualType();
}
- if (!ConstantMatrixType::isDimensionValid(MatrixRows)) {
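+ // Diagnose oversized rows and columns together so a single combined error is
+ // emitted.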
+ if (MatrixRows > Context.getLangOpts().MaxMatrixDimension &&
+ MatrixColumns > Context.getLangOpts().MaxMatrixDimension) {
+ Diag(AttrLoc, diag::err_attribute_size_too_large)
+ << RowRange << ColRange << "matrix row and column";
+ return QualType();
+ }
+ if (MatrixRows > Context.getLangOpts().MaxMatrixDimension) {
Diag(AttrLoc, diag::err_attribute_size_too_large)
<< RowRange << "matrix row";
return QualType();
}
- if (!ConstantMatrixType::isDimensionValid(MatrixColumns)) {
+ if (MatrixColumns > Context.getLangOpts().MaxMatrixDimension) {
Diag(AttrLoc, diag::err_attribute_size_too_large)
<< ColRange << "matrix column";
return QualType();
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
index 419d263..84adbf3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
@@ -173,6 +173,14 @@ bool tryToFindPtrOrigin(
if (isSingleton(E->getFoundDecl()))
return callback(E, true);
}
+
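+ // In dependent template code the callee is not yet resolved; treat get() and
+ // ptr() on a known-safe smart pointer type as a safe pointer origin.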
+ if (auto *MemberExpr = dyn_cast<CXXDependentScopeMemberExpr>(CalleeE)) {
+ auto *Base = MemberExpr->getBase();
+ auto MemberName = MemberExpr->getMember().getAsString();
+ bool IsGetter = MemberName == "get" || MemberName == "ptr";
+ if (Base && isSafePtrType(Base->getType()) && IsGetter)
+ return callback(E, true);
+ }
}
// Sometimes, canonical type erroneously turns Ref<T> into T.
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
index 4f231ee..8bef24f 100644
--- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
@@ -596,6 +596,35 @@ void foo() {
} // namespace sel_string
+namespace template_function {
+
+class Base {
+public:
+ virtual ~Base() = default;
+ void send(dispatch_queue_t) const;
+ void ref() const;
+ void deref() const;
+};
+
+template<typename Traits>
+class Derived : public Base {
+public:
+ virtual ~Derived() = default;
+
+ void send(typename Traits::MessageType) const;
+
+ virtual OSObjectPtr<dispatch_queue_t> msg(typename Traits::MessageType) const = 0;
+};
+
+template<typename Traits>
+void Derived<Traits>::send(typename Traits::MessageType messageType) const
+{
+ OSObjectPtr dictionary = msg(messageType);
+ Base::send(dictionary.get());
+}
+
+} // namespace template_function
+
@interface TestObject : NSObject
- (void)doWork:(NSString *)msg, ...;
- (void)doWorkOnSelf;
diff --git a/clang/test/Analysis/zero-size-non-pod-array.cpp b/clang/test/Analysis/zero-size-non-pod-array.cpp
index 628be0d7..8a32a49 100644
--- a/clang/test/Analysis/zero-size-non-pod-array.cpp
+++ b/clang/test/Analysis/zero-size-non-pod-array.cpp
@@ -95,7 +95,7 @@ void zeroSizeArrayBinding() {
// Note: This is an error in gcc but a warning in clang.
// In MSVC the declaration of 'S arr[0]' is already an error
// and it doesn't recognize this syntax as a structured binding.
- auto [] = arr; //expected-warning{{ISO C++17 does not allow a decomposition group to be empty}}
+ auto [] = arr; //expected-warning{{ISO C++17 does not allow a structured binding group to be empty}}
clang_analyzer_eval(S::CtorInvocationCount == 0); //expected-warning{{TRUE}}
}
diff --git a/clang/test/CIR/CodeGen/dynamic-cast.cpp b/clang/test/CIR/CodeGen/dynamic-cast.cpp
index b493840..5d010d2 100644
--- a/clang/test/CIR/CodeGen/dynamic-cast.cpp
+++ b/clang/test/CIR/CodeGen/dynamic-cast.cpp
@@ -101,3 +101,59 @@ Derived &ref_cast(Base &b) {
// OGCG: br i1 %[[IS_NULL]], label %[[BAD_CAST:.*]], label %[[DONE:.*]]
// OGCG: [[BAD_CAST]]:
// OGCG: call void @__cxa_bad_cast()
+
+void *ptr_cast_to_complete(Base *ptr) {
+ return dynamic_cast<void *>(ptr);
+}
+
+// CIR-BEFORE: cir.func dso_local @_Z20ptr_cast_to_completeP4Base
+// CIR-BEFORE: %{{.+}} = cir.dyn_cast ptr %{{.+}} : !cir.ptr<!rec_Base> -> !cir.ptr<!void>
+// CIR-BEFORE: }
+
+// CIR-AFTER: cir.func dso_local @_Z20ptr_cast_to_completeP4Base
+// CIR-AFTER: %[[SRC:.*]] = cir.load{{.*}} %{{.+}} : !cir.ptr<!cir.ptr<!rec_Base>>, !cir.ptr<!rec_Base>
+// CIR-AFTER-NEXT: %[[SRC_IS_NOT_NULL:.*]] = cir.cast ptr_to_bool %[[SRC]] : !cir.ptr<!rec_Base> -> !cir.bool
+// CIR-AFTER-NEXT: %{{.+}} = cir.ternary(%[[SRC_IS_NOT_NULL]], true {
+// CIR-AFTER-NEXT: %[[VPTR_PTR:.*]] = cir.vtable.get_vptr %[[SRC]] : !cir.ptr<!rec_Base> -> !cir.ptr<!cir.vptr>
+// CIR-AFTER-NEXT: %[[VPTR:.*]] = cir.load %[[VPTR_PTR]] : !cir.ptr<!cir.vptr>, !cir.vptr
+// CIR-AFTER-NEXT: %[[ELEM_PTR:.*]] = cir.cast bitcast %[[VPTR]] : !cir.vptr -> !cir.ptr<!s64i>
+// CIR-AFTER-NEXT: %[[MINUS_TWO:.*]] = cir.const #cir.int<-2> : !s64i
+// CIR-AFTER-NEXT: %[[BASE_OFFSET_PTR:.*]] = cir.ptr_stride %[[ELEM_PTR]], %[[MINUS_TWO]] : (!cir.ptr<!s64i>, !s64i) -> !cir.ptr<!s64i>
+// CIR-AFTER-NEXT: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr<!s64i>, !s64i
+// CIR-AFTER-NEXT: %[[SRC_BYTES_PTR:.*]] = cir.cast bitcast %[[SRC]] : !cir.ptr<!rec_Base> -> !cir.ptr<!u8i>
+// CIR-AFTER-NEXT: %[[DST_BYTES_PTR:.*]] = cir.ptr_stride %[[SRC_BYTES_PTR]], %[[BASE_OFFSET]] : (!cir.ptr<!u8i>, !s64i) -> !cir.ptr<!u8i>
+// CIR-AFTER-NEXT: %[[CASTED_PTR:.*]] = cir.cast bitcast %[[DST_BYTES_PTR]] : !cir.ptr<!u8i> -> !cir.ptr<!void>
+// CIR-AFTER-NEXT: cir.yield %[[CASTED_PTR]] : !cir.ptr<!void>
+// CIR-AFTER-NEXT: }, false {
+// CIR-AFTER-NEXT: %[[NULL_PTR:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!void>
+// CIR-AFTER-NEXT: cir.yield %[[NULL_PTR]] : !cir.ptr<!void>
+// CIR-AFTER-NEXT: }) : (!cir.bool) -> !cir.ptr<!void>
+// CIR-AFTER: }
+
+// LLVM: define {{.*}} @_Z20ptr_cast_to_completeP4Base
+// LLVM: %[[IS_NOT_NULL:.*]] = icmp ne ptr %[[PTR:.*]], null
+// LLVM: br i1 %[[IS_NOT_NULL]], label %[[NOT_NULL:.*]], label %[[NULL:.*]]
+// LLVM: [[NOT_NULL]]:
+// LLVM: %[[VPTR:.*]] = load ptr, ptr %[[PTR]]
+// LLVM: %[[BASE_OFFSET_PTR:.*]] = getelementptr i64, ptr %[[VPTR]], i64 -2
+// LLVM: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_PTR]]
+// LLVM: %[[RESULT:.*]] = getelementptr i8, ptr %[[PTR]], i64 %[[BASE_OFFSET]]
+// LLVM: br label %[[DONE:.*]]
+// LLVM: [[NULL]]:
+// LLVM: br label %[[DONE]]
+// LLVM: [[DONE]]:
+// LLVM: %[[RET:.*]] = phi ptr [ null, %[[NULL]] ], [ %[[RESULT]], %[[NOT_NULL]] ]
+
+// OGCG: define {{.*}} @_Z20ptr_cast_to_completeP4Base
+// OGCG: %[[IS_NULL:.*]] = icmp eq ptr %[[PTR:.*]], null
+// OGCG: br i1 %[[IS_NULL]], label %[[NULL:.*]], label %[[NOT_NULL:.*]]
+// OGCG: [[NOT_NULL]]:
+// OGCG: %[[VPTR:.*]] = load ptr, ptr %[[PTR]]
+// OGCG: %[[BASE_OFFSET_PTR:.*]] = getelementptr inbounds i64, ptr %[[VPTR]], i64 -2
+// OGCG: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_PTR]]
+// OGCG: %[[RESULT:.*]] = getelementptr inbounds i8, ptr %[[PTR]], i64 %[[BASE_OFFSET]]
+// OGCG: br label %[[DONE:.*]]
+// OGCG: [[NULL]]:
+// OGCG: br label %[[DONE]]
+// OGCG: [[DONE]]:
+// OGCG: %[[RET:.*]] = phi ptr [ %[[RESULT]], %[[NOT_NULL]] ], [ null, %[[NULL]] ]
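All three pipelines encode the same Itanium ABI computation; a hedged C++ sketch of it (the helper name is illustrative, not the compiler's actual code):

    #include <cstddef>
    struct Base { virtual ~Base() {} };
    // The offset-to-top entry sits two pointer-sized words before the
    // address the vptr points at; adding it to the object pointer
    // recovers the most-derived (complete) object.
    void *cast_to_complete(Base *p) {
      if (!p)
        return nullptr;                                            // null maps to null
      auto vtable = *reinterpret_cast<std::ptrdiff_t *const *>(p); // load the vptr
      std::ptrdiff_t offset_to_top = vtable[-2];                   // slot at index -2
      return reinterpret_cast<char *>(p) + offset_to_top;          // adjust the pointer
    }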
diff --git a/clang/test/CIR/CodeGen/ptrdiff.c b/clang/test/CIR/CodeGen/ptrdiff.c
new file mode 100644
index 0000000..e6a754e
--- /dev/null
+++ b/clang/test/CIR/CodeGen/ptrdiff.c
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+int addrcmp(const void* a, const void* b) {
+ // CIR-LABEL: addrcmp
+ // CIR: %[[R:.*]] = cir.ptr_diff
+ // CIR: cir.cast integral %[[R]] : !s64i -> !s32i
+
+ // LLVM-LABEL: define dso_local i32 @addrcmp(
+ // LLVM: %[[PTR_A:.*]] = ptrtoint ptr {{.*}} to i64
+ // LLVM: %[[PTR_B:.*]] = ptrtoint ptr {{.*}} to i64
+ // LLVM: %[[SUB:.*]] = sub i64 %[[PTR_A]], %[[PTR_B]]
+ // LLVM-NOT: sdiv
+ // LLVM: trunc i64 %[[SUB]] to i32
+
+ // OGCG-LABEL: define dso_local i32 @addrcmp(
+ // OGCG: %[[PTR_A:.*]] = ptrtoint ptr {{.*}} to i64
+ // OGCG: %[[PTR_B:.*]] = ptrtoint ptr {{.*}} to i64
+ // OGCG: %[[SUB:.*]] = sub i64 %[[PTR_A]], %[[PTR_B]]
+ // OGCG-NOT: sdiv
+ // OGCG: trunc i64 %[[SUB]] to i32
+ return *(const void**)a - *(const void**)b;
+}
+
+unsigned long long test_ptr_diff(int *a, int *b) {
+ // CIR-LABEL: test_ptr_diff
+ // CIR: %[[D:.*]] = cir.ptr_diff {{.*}} : !cir.ptr<!s32i> -> !s64i
+ // CIR: %[[U:.*]] = cir.cast integral %[[D]] : !s64i -> !u64i
+ // CIR: cir.return {{.*}} : !u64i
+
+ // LLVM-LABEL: define dso_local i64 @test_ptr_diff(
+ // LLVM: %[[IA:.*]] = ptrtoint ptr %{{.*}} to i64
+ // LLVM: %[[IB:.*]] = ptrtoint ptr %{{.*}} to i64
+ // LLVM: %[[SUB:.*]] = sub i64 %[[IA]], %[[IB]]
+ // LLVM: %[[Q:.*]] = sdiv exact i64 %[[SUB]], 4
+ // LLVM: store i64 %[[Q]], ptr %[[RETADDR:.*]], align
+ // LLVM: %[[RETLOAD:.*]] = load i64, ptr %[[RETADDR]], align
+ // LLVM: ret i64 %[[RETLOAD]]
+
+ // OGCG-LABEL: define dso_local i64 @test_ptr_diff(
+ // OGCG: %[[IA:.*]] = ptrtoint ptr %{{.*}} to i64
+ // OGCG: %[[IB:.*]] = ptrtoint ptr %{{.*}} to i64
+ // OGCG: %[[SUB:.*]] = sub i64 %[[IA]], %[[IB]]
+ // OGCG: %[[Q:.*]] = sdiv exact i64 %[[SUB]], 4
+ // OGCG: ret i64 %[[Q]]
+ return a - b;
+} \ No newline at end of file
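The FileCheck lines pin down the standard lowering of pointer subtraction: a byte difference via ptrtoint/sub, then an `sdiv exact` by the element size (4 for `int`), where `exact` records that the difference is always a multiple of that size. A hedged scalar model:

    // For int *a, *b into the same array:
    //   a - b  ==  ((intptr_t)a - (intptr_t)b) / (ptrdiff_t)sizeof(int)
    // addrcmp above subtracts 'const void *' values (a GNU extension with
    // element size 1), which is why no division is emitted there.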
diff --git a/clang/test/CIR/CodeGen/ptrdiff.cpp b/clang/test/CIR/CodeGen/ptrdiff.cpp
new file mode 100644
index 0000000..34ba0ff
--- /dev/null
+++ b/clang/test/CIR/CodeGen/ptrdiff.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+typedef unsigned long size_type;
+
+size_type size(unsigned long *_start, unsigned long *_finish) {
+ // CIR-LABEL: cir.func dso_local @_Z4sizePmS_
+ // CIR: %[[D:.*]] = cir.ptr_diff {{.*}} : !cir.ptr<!u64i> -> !s64i
+ // CIR: %[[U:.*]] = cir.cast integral %[[D]] : !s64i -> !u64i
+ // CIR: cir.return {{.*}} : !u64i
+
+ // LLVM-LABEL: define dso_local {{.*}}i64 @_Z4sizePmS_(
+ // LLVM: %[[IA:.*]] = ptrtoint ptr %{{.*}} to i64
+ // LLVM: %[[IB:.*]] = ptrtoint ptr %{{.*}} to i64
+ // LLVM: %[[SUB:.*]] = sub i64 %[[IA]], %[[IB]]
+ // LLVM: %[[Q:.*]] = sdiv exact i64 %[[SUB]], 8
+ // LLVM: store i64 %[[Q]], ptr %[[RETADDR:.*]], align
+ // LLVM: %[[RET:.*]] = load i64, ptr %[[RETADDR]], align
+ // LLVM: ret i64 %[[RET]]
+
+ // OGCG-LABEL: define dso_local {{.*}}i64 @_Z4sizePmS_(
+ // OGCG: %[[IA:.*]] = ptrtoint ptr %{{.*}} to i64
+ // OGCG: %[[IB:.*]] = ptrtoint ptr %{{.*}} to i64
+ // OGCG: %[[SUB:.*]] = sub i64 %[[IA]], %[[IB]]
+ // OGCG: %[[Q:.*]] = sdiv exact i64 %[[SUB]], 8
+ // OGCG: ret i64 %[[Q]]
+
+ return static_cast<size_type>(_finish - _start);
+} \ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/throws.cpp b/clang/test/CIR/CodeGen/throws.cpp
index 4255d43..89cb072 100644
--- a/clang/test/CIR/CodeGen/throws.cpp
+++ b/clang/test/CIR/CodeGen/throws.cpp
@@ -144,3 +144,55 @@ void throw_complex_expr() {
// OGCG: store float 0x3FF19999A0000000, ptr %[[EXCEPTION_REAL]], align 16
// OGCG: store float 0x40019999A0000000, ptr %[[EXCEPTION_IMAG]], align 4
// OGCG: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTICf, ptr null)
+
+void throw_vector_type() {
+ typedef int vi4 __attribute__((vector_size(16)));
+ vi4 a;
+ throw a;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
+// CIR: %[[EXCEPTION_ADDR:.*]] = cir.alloc.exception 16 -> !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[TMP_A]], %[[EXCEPTION_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: cir.throw %[[EXCEPTION_ADDR]] : !cir.ptr<!cir.vector<4 x !s32i>>, @_ZTIDv4_i
+// CIR: cir.unreachable
+
+// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 16)
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// LLVM: store <4 x i32> %[[TMP_A]], ptr %[[EXCEPTION_ADDR]], align 16
+// LLVM: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIDv4_i, ptr null)
+
+// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 16)
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// OGCG: store <4 x i32> %[[TMP_A]], ptr %[[EXCEPTION_ADDR]], align 16
+// OGCG: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIDv4_i, ptr null)
+// OGCG: unreachable
+
+void throw_ext_vector_type() {
+ typedef int vi4 __attribute__((ext_vector_type(4)));
+ vi4 a;
+ throw a;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
+// CIR: %[[EXCEPTION_ADDR:.*]] = cir.alloc.exception 16 -> !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[TMP_A]], %[[EXCEPTION_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: cir.throw %[[EXCEPTION_ADDR]] : !cir.ptr<!cir.vector<4 x !s32i>>, @_ZTIDv4_i
+// CIR: cir.unreachable
+
+// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 16)
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// LLVM: store <4 x i32> %[[TMP_A]], ptr %[[EXCEPTION_ADDR]], align 16
+// LLVM: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIDv4_i, ptr null)
+
+// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 16)
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// OGCG: store <4 x i32> %[[TMP_A]], ptr %[[EXCEPTION_ADDR]], align 16
+// OGCG: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIDv4_i, ptr null)
+// OGCG: unreachable
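Both vector throws reduce to the same Itanium runtime sequence; a hedged sketch of what the checks pin down (vi4 as in the tests above):

    // void *mem = __cxa_allocate_exception(16);   // sizeof(vi4)
    // *(vi4 *)mem = a;                            // copy the thrown value in
    // __cxa_throw(mem, &typeinfo_for_vi4 /*_ZTIDv4_i*/, /*dtor=*/nullptr);
    // ...followed by unreachable, since __cxa_throw does not return.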
diff --git a/clang/test/CIR/IR/invalid-try-catch.cir b/clang/test/CIR/IR/invalid-try-catch.cir
new file mode 100644
index 0000000..04a4d25
--- /dev/null
+++ b/clang/test/CIR/IR/invalid-try-catch.cir
@@ -0,0 +1,156 @@
+// RUN: cir-opt %s -verify-diagnostics -split-input-file
+
+module {
+
+cir.func dso_local @invalid_catch_without_all_or_type() {
+ cir.scope {
+ cir.try {
+ cir.yield
+ // expected-error @below {{'cir.try' expected 'all' or 'type' keyword}}
+ } catch [invalid_keyword {
+ cir.yield
+ }
+ }
+ cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_rtti_type() {
+ cir.scope {
+ // expected-error @below {{'cir.try' op attribute 'handler_types' failed to satisfy constraint: catch all or unwind or global view array attribute}}
+ cir.try {
+ cir.yield
+ } catch [type #cir.undef] {
+ cir.yield
+ }
+ }
+ cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_empty_block() {
+ cir.scope {
+ // expected-error @below {{'cir.try' op region #1 ('handler_regions') failed to verify constraint: region with at least 1 blocks}}
+ cir.try {
+ cir.yield
+ } catch all {
+ }
+ }
+ cir.return
+}
+
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module {
+
+cir.func dso_local @invalid_catch_not_terminated() {
+ %a = cir.alloca !s32i, !cir.ptr<!s32i>, ["a", init]
+ cir.scope {
+ cir.try {
+ cir.yield
+ }
+ // expected-error @below {{'cir.try' blocks are expected to be explicitly terminated}}
+ catch all {
+ %tmp_a = cir.load %a : !cir.ptr<!s32i>, !s32i
+ }
+ }
+ cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_multiple_catch_all() {
+ cir.scope {
+ cir.try {
+ cir.yield
+ } catch all {
+ cir.yield
+ }
+ // expected-error @below {{op 'cir.try' can't have more than one catch all}}
+ catch all {
+ cir.yield
+ }
+ }
+ cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_without_type_info() {
+ cir.scope {
+ cir.try {
+ cir.yield
+ }
+ // expected-error @below {{expected attribute value}}
+ // expected-error @below {{op 'cir.try' expected valid RTTI info attribute}}
+ catch [type] {
+ cir.yield
+ }
+ }
+ cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_all_with_type_info() {
+ cir.scope {
+ cir.try {
+ cir.yield
+ }
+ // expected-error @below {{op 'cir.try' catch all doesn't need RTTI info attribute}}
+ catch [all] {
+ cir.yield
+ }
+ }
+ cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_unwind_with_catch_all() {
+ cir.scope {
+ cir.try {
+ cir.yield
+ }
+ catch all {
+ cir.yield
+ }
+ // expected-error @below {{op 'cir.try' unwind can't be used with catch all}}
+ unwind {
+
+ }
+ }
+ cir.return
+}
+
+}
diff --git a/clang/test/CIR/IR/try-catch.cir b/clang/test/CIR/IR/try-catch.cir
new file mode 100644
index 0000000..7becd0b
--- /dev/null
+++ b/clang/test/CIR/IR/try-catch.cir
@@ -0,0 +1,84 @@
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s
+
+!u8i = !cir.int<u, 8>
+
+module {
+
+cir.global "private" constant external @_ZTIi : !cir.ptr<!u8i>
+cir.global "private" constant external @_ZTIPKc : !cir.ptr<!u8i>
+
+cir.func dso_local @empty_try_block_with_catch_all() {
+ cir.scope {
+ cir.try {
+ cir.yield
+ } catch all {
+ cir.yield
+ }
+ }
+ cir.return
+}
+
+// CHECK: cir.func dso_local @empty_try_block_with_catch_all() {
+// CHECK: cir.scope {
+// CHECK: cir.try {
+// CHECK: cir.yield
+// CHECK: } catch all {
+// CHECK: cir.yield
+// CHECK: }
+// CHECK: }
+// CHECK: cir.return
+// CHECK: }
+
+cir.func dso_local @empty_try_block_with_catch_unwind() {
+ cir.scope {
+ cir.try {
+ cir.yield
+ } unwind {
+ cir.yield
+ }
+ }
+ cir.return
+}
+
+// CHECK: cir.func dso_local @empty_try_block_with_catch_unwind() {
+// CHECK: cir.scope {
+// CHECK: cir.try {
+// CHECK: cir.yield
+// CHECK: } unwind {
+// CHECK: cir.yield
+// CHECK: }
+// CHECK: }
+// CHECK: cir.return
+// CHECK: }
+
+cir.func dso_local @empty_try_block_with_catch_list() {
+ cir.scope {
+ cir.try {
+ cir.yield
+ } catch [type #cir.global_view<@_ZTIi> : !cir.ptr<!u8i>] {
+ cir.yield
+ } catch [type #cir.global_view<@_ZTIPKc> : !cir.ptr<!u8i>] {
+ cir.yield
+ } unwind {
+ cir.yield
+ }
+ }
+ cir.return
+}
+
+// CHECK: cir.func dso_local @empty_try_block_with_catch_list() {
+// CHECK: cir.scope {
+// CHECK: cir.try {
+// CHECK: cir.yield
+// CHECK: } catch [type #cir.global_view<@_ZTIi> : !cir.ptr<!u8i>] {
+// CHECK: cir.yield
+// CHECK: } catch [type #cir.global_view<@_ZTIPKc> : !cir.ptr<!u8i>] {
+// CHECK: cir.yield
+// CHECK: } unwind {
+// CHECK: cir.yield
+// CHECK: }
+// CHECK: }
+// CHECK: cir.return
+// CHECK: }
+
+}
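For orientation, the last round-trip case corresponds roughly to this C++ shape (@_ZTIi and @_ZTIPKc are the type infos for `int` and `const char *`; the empty bodies are illustrative):

    // try {
    // } catch (int) {
    // } catch (const char *) {
    // }
    // with the trailing 'unwind' region modeling the no-handler-matched path
    // that resumes propagation; a C++ 'catch (...)' would instead be spelled
    // 'catch all' in this dialect.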
diff --git a/clang/test/CXX/dcl.decl/dcl.decomp/p2.cpp b/clang/test/CXX/dcl.decl/dcl.decomp/p2.cpp
index cad7d8e..717ea3aa 100644
--- a/clang/test/CXX/dcl.decl/dcl.decomp/p2.cpp
+++ b/clang/test/CXX/dcl.decl/dcl.decomp/p2.cpp
@@ -16,8 +16,8 @@ int array() {
using X3 = X[3];
auto [a3, b3, c3] = X3{1, 2, 3};
- auto &[d, e] = arr; // expected-error {{type 'int[3]' decomposes into 3 elements, but only 2 names were provided}}
- auto &[f, g, h, i] = arr; // expected-error {{type 'int[3]' decomposes into 3 elements, but 4 names were provided}}
+ auto &[d, e] = arr; // expected-error {{type 'int[3]' binds to 3 elements, but only 2 names were provided}}
+ auto &[f, g, h, i] = arr; // expected-error {{type 'int[3]' binds to 3 elements, but 4 names were provided}}
auto &[r0, r1, r2] = arr;
const auto &[cr0, cr1, cr2] = arr;
diff --git a/clang/test/CXX/dcl.decl/dcl.decomp/p3.cpp b/clang/test/CXX/dcl.decl/dcl.decomp/p3.cpp
index ce5eefc..b7fef12 100644
--- a/clang/test/CXX/dcl.decl/dcl.decomp/p3.cpp
+++ b/clang/test/CXX/dcl.decl/dcl.decomp/p3.cpp
@@ -5,10 +5,10 @@ using size_t = decltype(sizeof(0));
struct A { int x, y; };
struct B { int x, y; };
-void no_tuple_size_1() { auto [x, y] = A(); } // ok, decompose elementwise
+void no_tuple_size_1() { auto [x, y] = A(); } // ok, bind elementwise
namespace std { template<typename T> struct tuple_size; }
-void no_tuple_size_2() { auto [x, y] = A(); } // ok, decompose elementwise
+void no_tuple_size_2() { auto [x, y] = A(); } // ok, bind elementwise
struct Bad1 { int a, b; };
template<> struct std::tuple_size<Bad1> {};
@@ -16,15 +16,15 @@ void no_tuple_size_3() { auto [x, y] = Bad1(); } // ok, omitting value is valid
struct Bad2 {};
template<> struct std::tuple_size<Bad2> { const int value = 5; };
-void no_tuple_size_4() { auto [x, y] = Bad2(); } // expected-error {{cannot decompose this type; 'std::tuple_size<Bad2>::value' is not a valid integral constant expression}}
+void no_tuple_size_4() { auto [x, y] = Bad2(); } // expected-error {{cannot bind this type; 'std::tuple_size<Bad2>::value' is not a valid integral constant expression}}
template<> struct std::tuple_size<A> { static const int value = 3; };
template<> struct std::tuple_size<B> { enum { value = 3 }; };
void no_get_1() {
{
- auto [a0, a1] = A(); // expected-error {{decomposes into 3 elements}}
- auto [b0, b1] = B(); // expected-error {{decomposes into 3 elements}}
+ auto [a0, a1] = A(); // expected-error {{binds to 3 elements}}
+ auto [b0, b1] = B(); // expected-error {{binds to 3 elements}}
}
auto [a0, a1, a2] = A(); // expected-error {{undeclared identifier 'get'}} expected-note {{in implicit initialization of binding declaration 'a0'}}
}
diff --git a/clang/test/CXX/dcl.decl/dcl.decomp/p4.cpp b/clang/test/CXX/dcl.decl/dcl.decomp/p4.cpp
index 7141124..532a967 100644
--- a/clang/test/CXX/dcl.decl/dcl.decomp/p4.cpp
+++ b/clang/test/CXX/dcl.decl/dcl.decomp/p4.cpp
@@ -25,10 +25,10 @@ namespace NonPublicMembers {
struct NonPublic4 : NonPublic2 {};
void test() {
- auto [a1] = NonPublic1(); // expected-error {{cannot decompose protected member 'a' of 'NonPublicMembers::NonPublic1'}}
- auto [a2] = NonPublic2(); // expected-error {{cannot decompose private member 'a' of 'NonPublicMembers::NonPublic2'}}
- auto [a3] = NonPublic3(); // expected-error {{cannot decompose members of inaccessible base class 'A' of 'NonPublicMembers::NonPublic3'}}
- auto [a4] = NonPublic4(); // expected-error {{cannot decompose private member 'a' of 'NonPublicMembers::NonPublic2'}}
+ auto [a1] = NonPublic1(); // expected-error {{cannot bind protected member 'a' of 'NonPublicMembers::NonPublic1'}}
+ auto [a2] = NonPublic2(); // expected-error {{cannot bind private member 'a' of 'NonPublicMembers::NonPublic2'}}
+ auto [a3] = NonPublic3(); // expected-error {{cannot bind members of inaccessible base class 'A' of 'NonPublicMembers::NonPublic3'}}
+ auto [a4] = NonPublic4(); // expected-error {{cannot bind private member 'a' of 'NonPublicMembers::NonPublic2'}}
}
}
@@ -46,8 +46,8 @@ namespace AnonymousMember {
};
void test() {
- auto [a1] = Struct(); // expected-error {{cannot decompose class type 'Struct' because it has an anonymous struct member}}
- auto [a2] = Union(); // expected-error {{cannot decompose class type 'Union' because it has an anonymous union member}}
+ auto [a1] = Struct(); // expected-error {{cannot bind class type 'Struct' because it has an anonymous struct member}}
+ auto [a2] = Union(); // expected-error {{cannot bind class type 'Union' because it has an anonymous union member}}
}
}
@@ -73,12 +73,12 @@ namespace MultipleClasses {
struct M : virtual J, L {};
void test() {
- auto [b] = B(); // expected-error {{cannot decompose class type 'B': both it and its base class 'A' have non-static data members}}
- auto [d] = D(); // expected-error {{cannot decompose class type 'D': its base classes 'A' and 'C' have non-static data members}}
+ auto [b] = B(); // expected-error {{cannot bind class type 'B': both it and its base class 'A' have non-static data members}}
+ auto [d] = D(); // expected-error {{cannot bind class type 'D': its base classes 'A' and 'C' have non-static data members}}
auto [e] = E();
- auto [f] = F(); // expected-error-re {{cannot decompose members of ambiguous base class 'A' of 'F':{{.*}}struct MultipleClasses::F -> A{{.*}}struct MultipleClasses::F -> E -> A}}
+ auto [f] = F(); // expected-error-re {{cannot bind members of ambiguous base class 'A' of 'F':{{.*}}struct MultipleClasses::F -> A{{.*}}struct MultipleClasses::F -> E -> A}}
auto [h] = H(); // ok, only one (virtual) base subobject even though there are two paths to it
- auto [k] = K(); // expected-error {{cannot decompose members of ambiguous base class 'I'}}
+ auto [k] = K(); // expected-error {{cannot bind members of ambiguous base class 'I'}}
auto [m] = M(); // ok, all paths to I are through the same virtual base subobject J
same<decltype(m), int>();
@@ -214,7 +214,7 @@ namespace p0969r0 {
auto &[x, y] = b;
}
void test_external(B b) {
- auto &[x, y] = b; // expected-error {{cannot decompose members of inaccessible base class 'A' of 'p0969r0::B'}}
+ auto &[x, y] = b; // expected-error {{cannot bind members of inaccessible base class 'A' of 'p0969r0::B'}}
}
struct C {
@@ -229,13 +229,13 @@ namespace p0969r0 {
struct D : C {
static void test_member(D d, C c) {
auto &[x1, y1] = d;
- auto &[x2, y2] = c; // expected-error {{cannot decompose protected member 'y' of 'p0969r0::C'}}
+ auto &[x2, y2] = c; // expected-error {{cannot bind protected member 'y' of 'p0969r0::C'}}
}
};
void test_friend(D d) {
auto &[x, y] = d;
}
void test_external(D d) {
- auto &[x, y] = d; // expected-error {{cannot decompose protected member 'y' of 'p0969r0::C'}}
+ auto &[x, y] = d; // expected-error {{cannot bind protected member 'y' of 'p0969r0::C'}}
}
}
diff --git a/clang/test/CXX/drs/cwg22xx.cpp b/clang/test/CXX/drs/cwg22xx.cpp
index 8c8ad9f..34119a1 100644
--- a/clang/test/CXX/drs/cwg22xx.cpp
+++ b/clang/test/CXX/drs/cwg22xx.cpp
@@ -202,7 +202,7 @@ namespace cwg2285 { // cwg2285: 4
void test() {
using T = int[1];
auto [a] = T{a};
- // since-cxx17-error@-1 {{binding 'a' cannot appear in the initializer of its own decomposition declaration}}
+ // since-cxx17-error@-1 {{binding 'a' cannot appear in the initializer of its own structured binding declaration}}
}
#endif
} // namespace cwg2285
diff --git a/clang/test/CXX/drs/cwg23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp
index 1285665..72cf249 100644
--- a/clang/test/CXX/drs/cwg23xx.cpp
+++ b/clang/test/CXX/drs/cwg23xx.cpp
@@ -440,7 +440,7 @@ template <> struct tuple_size<cwg2386::Bad2> {
namespace cwg2386 {
void no_value() { auto [x, y] = Bad1(); }
void wrong_value() { auto [x, y] = Bad2(); }
-// since-cxx17-error@-1 {{type 'Bad2' decomposes into 42 elements, but only 2 names were provided}}
+// since-cxx17-error@-1 {{type 'Bad2' binds to 42 elements, but only 2 names were provided}}
#endif
} // namespace cwg2386
diff --git a/clang/test/CXX/drs/cwg26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp
index bceef64..aa87f5a 100644
--- a/clang/test/CXX/drs/cwg26xx.cpp
+++ b/clang/test/CXX/drs/cwg26xx.cpp
@@ -183,17 +183,17 @@ T get_T();
void use() {
UnaryC auto [a, b] = get_S();
- // since-cxx20-error@-1 {{decomposition declaration cannot be declared with constrained 'auto'}}
+ // since-cxx20-error@-1 {{structured binding declaration cannot be declared with constrained 'auto'}}
BinaryC<int> auto [c, d] = get_S();
- // since-cxx20-error@-1 {{decomposition declaration cannot be declared with constrained 'auto'}}
+ // since-cxx20-error@-1 {{structured binding declaration cannot be declared with constrained 'auto'}}
}
template<typename T>
void TemplUse() {
UnaryC auto [a, b] = get_T<T>();
- // since-cxx20-error@-1 {{decomposition declaration cannot be declared with constrained 'auto'}}
+ // since-cxx20-error@-1 {{structured binding declaration cannot be declared with constrained 'auto'}}
BinaryC<T> auto [c, d] = get_T<T>();
- // since-cxx20-error@-1 {{decomposition declaration cannot be declared with constrained 'auto'}}
+ // since-cxx20-error@-1 {{structured binding declaration cannot be declared with constrained 'auto'}}
}
#endif
} // namespace cwg2635
diff --git a/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp b/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
index 93c2beb..938a3d0 100644
--- a/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
+++ b/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
@@ -75,7 +75,7 @@ namespace X {
void test_D() {
#if __cplusplus >= 201703L
for (extern auto [x, y] : D()) {
- } // expected-error@-1 {{decomposition declaration cannot be declared 'extern'}}
+ } // expected-error@-1 {{structured binding declaration cannot be declared 'extern'}}
// expected-error@-2 {{loop variable '[x, y]' may not be declared 'extern'}}
#endif
}
diff --git a/clang/test/CXX/temp/temp.res/temp.local/p6.cpp b/clang/test/CXX/temp/temp.res/temp.local/p6.cpp
index 205df5f..beed7bf 100644
--- a/clang/test/CXX/temp/temp.res/temp.local/p6.cpp
+++ b/clang/test/CXX/temp/temp.res/temp.local/p6.cpp
@@ -165,7 +165,7 @@ A<0>::B a;
template <typename T> int shadow() { // expected-note{{template parameter is declared here}}
using arr = int[1];
- // expected-warning@+1 {{decomposition declarations are a C++17 extension}}
+ // expected-warning@+1 {{structured binding declarations are a C++17 extension}}
auto [
T // expected-error {{declaration of 'T' shadows template parameter}}
] = arr{};
diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
index 4112561..1b09959 100644
--- a/clang/test/CodeGen/X86/avx512dq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
@@ -1305,6 +1305,7 @@ __m512 test_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, float const* _
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_broadcast_f32x8(__O, __M, _mm256_loadu_ps(__A));
}
+TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x8(_mm512_setzero_ps(), 0xAAAA, (__m256)(__v8sf){5.0f,5.0f,5.0f,5.0f,5.0f,5.0f,5.0f,5.0f}), 0,5,0,5,0,5,0,5,0,5,0,5,0,5,0,5));
__m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, float const* __A) {
// CHECK-LABEL: test_mm512_maskz_broadcast_f32x8
@@ -1312,6 +1313,7 @@ __m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, float const* __A) {
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_broadcast_f32x8(__M, _mm256_loadu_ps(__A));
}
+TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x8(0xAAAA, _mm256_set1_ps(7.0f)), 0,7,0,7,0,7,0,7,0,7,0,7,0,7,0,7));
__m512d test_mm512_broadcast_f64x2(double const* __A) {
// CHECK-LABEL: test_mm512_broadcast_f64x2
@@ -1327,6 +1329,8 @@ __m512d test_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, double const*
return _mm512_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A));
}
+TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xAA, (__m128d)(__v2df){1,2}), 0,2,0,2,0,2,0,2));
+
__m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) {
// CHECK-LABEL: test_mm512_maskz_broadcast_f64x2
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -1334,6 +1338,8 @@ __m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) {
return _mm512_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A));
}
+TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xAA, (__m128d)(__v2df){1,2}), 0,2,0,2,0,2,0,2));
+
__m512i test_mm512_broadcast_i32x2(__m128i __A) {
// CHECK-LABEL: test_mm512_broadcast_i32x2
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -1348,6 +1354,8 @@ __m512i test_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A)
return _mm512_mask_broadcast_i32x2(__O, __M, __A);
}
+TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x2(_mm512_setzero_si512(), 0xAAAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1));
+
__m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
// CHECK-LABEL: test_mm512_maskz_broadcast_i32x2
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -1355,6 +1363,8 @@ __m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
return _mm512_maskz_broadcast_i32x2(__M, __A);
}
+TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x2(0xAAAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1));
+
__m512i test_mm512_broadcast_i32x8(__m256i const* __A) {
// CHECK-LABEL: test_mm512_broadcast_i32x8
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1368,6 +1378,7 @@ __m512i test_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i cons
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_broadcast_i32x8(__O, __M, _mm256_loadu_si256(__A));
}
+TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x8(_mm512_setzero_si512(), 0xAAAA, _mm256_set1_epi32(8)), 0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8));
__m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i const* __A) {
// CHECK-LABEL: test_mm512_maskz_broadcast_i32x8
@@ -1376,6 +1387,8 @@ __m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i const* __A) {
return _mm512_maskz_broadcast_i32x8(__M, _mm256_loadu_si256(__A));
}
+TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xAAAA, _mm256_set1_epi32(9)), 0,9,0,9,0,9,0,9,0,9,0,9,0,9,0,9));
+
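// All of the TEST_CONSTEXPR checks here follow one lane rule: mask bit i
// selects the broadcast source lane, otherwise maskz variants yield zero and
// mask variants keep the passthrough lane. A hedged scalar model
// (illustrative helper, not part of the test harness):
//   constexpr int lane(unsigned mask, int i, int src, int passthru) {
//     return ((mask >> i) & 1) ? src : passthru;  // passthru is 0 for maskz
//   }
// Mask 0xAAAA keeps only the odd lanes, hence the alternating 0,x patterns.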
__m512i test_mm512_broadcast_i64x2(__m128i const* __A) {
// CHECK-LABEL: test_mm512_broadcast_i64x2
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 7756f0d..3deaf8e 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -6836,6 +6836,8 @@ __m512 test_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, float const* _
return _mm512_mask_broadcast_f32x4(__O, __M, _mm_loadu_ps(__A));
}
+TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x4(_mm512_setzero_ps(), 0xAAAA, (__m128)(__v4sf){1,2,3,4}), 0,2,0,4,0,2,0,4,0,2,0,4,0,2,0,4));
+
__m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) {
// CHECK-LABEL: test_mm512_maskz_broadcast_f32x4
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -6843,6 +6845,8 @@ __m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) {
return _mm512_maskz_broadcast_f32x4(__M, _mm_loadu_ps(__A));
}
+TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x4(0xAAAA, (__m128)(__v4sf){1,2,3,4}), 0,2,0,4,0,2,0,4,0,2,0,4,0,2,0,4));
+
__m512d test_mm512_broadcast_f64x4(double const* __A) {
// CHECK-LABEL: test_mm512_broadcast_f64x4
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -6885,6 +6889,8 @@ __m512i test_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i const* __A) {
return _mm512_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A));
}
+TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x4(0xAAAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,3,0,1,0,3,0,1,0,3,0,1,0,3));
+
__m512i test_mm512_broadcast_i64x4(__m256i const* __A) {
// CHECK-LABEL: test_mm512_broadcast_i64x4
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -10903,6 +10909,8 @@ __m512i test_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
return _mm512_mask_set1_epi32 ( __O, __M, __A);
}
+TEST_CONSTEXPR(match_v16si(_mm512_mask_set1_epi32(_mm512_setzero_si512(), 0xAAAA, 13), 0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13));
+
__m512i test_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
{
// CHECK-LABEL: test_mm512_maskz_set1_epi32
@@ -10926,6 +10934,8 @@ __m512i test_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
return _mm512_maskz_set1_epi32(__M, __A);
}
+TEST_CONSTEXPR(match_v16si(_mm512_maskz_set1_epi32(0xAAAA, 19), 0,19,0,19,0,19,0,19,0,19,0,19,0,19,0,19));
+
__m512i test_mm512_set_epi8(char e63, char e62, char e61, char e60, char e59,
char e58, char e57, char e56, char e55, char e54, char e53, char e52,
@@ -11145,6 +11155,8 @@ __m512i test_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
return _mm512_mask_set1_epi64 (__O, __M, __A);
}
+TEST_CONSTEXPR(match_v8di(_mm512_mask_set1_epi64(_mm512_setzero_si512(), 0xAA, 21), 0,21,0,21,0,21,0,21));
+
__m512i test_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A)
{
// CHECK-LABEL: test_mm512_maskz_set1_epi64
@@ -11160,6 +11172,8 @@ __m512i test_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A)
return _mm512_maskz_set1_epi64 (__M, __A);
}
+TEST_CONSTEXPR(match_v8di(_mm512_maskz_set1_epi64(0xAA, 23), 0, 23, 0, 23, 0, 23, 0, 23));
+
__m512i test_mm512_set_epi64 (long long __A, long long __B, long long __C,
long long __D, long long __E, long long __F,
diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
index 51385d5..9b6bfea9 100644
--- a/clang/test/CodeGen/X86/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -7201,6 +7201,8 @@ __m128i test_mm_mask_set1_epi32(__m128i __O, __mmask8 __M) {
return _mm_mask_set1_epi32(__O, __M, 5);
}
+TEST_CONSTEXPR(match_v4si(_mm_mask_set1_epi32(_mm_setzero_si128(), 0xF, 7), 7, 7, 7, 7));
+
__m128i test_mm_maskz_set1_epi32(__mmask8 __M) {
// CHECK-LABEL: test_mm_maskz_set1_epi32
// CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
@@ -7212,6 +7214,8 @@ __m128i test_mm_maskz_set1_epi32(__mmask8 __M) {
return _mm_maskz_set1_epi32(__M, 5);
}
+TEST_CONSTEXPR(match_v4si(_mm_maskz_set1_epi32(0xA, 11), 0, 11, 0, 11));
+
__m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) {
// CHECK-LABEL: test_mm256_mask_set1_epi32
// CHECK: insertelement <8 x i32> poison, i32 %{{.*}}, i32 0
@@ -7226,6 +7230,8 @@ __m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) {
return _mm256_mask_set1_epi32(__O, __M, 5);
}
+TEST_CONSTEXPR(match_v8si(_mm256_mask_set1_epi32(_mm256_setzero_si256(), 0xAA, 5), 0, 5, 0, 5, 0, 5, 0, 5));
+
__m256i test_mm256_maskz_set1_epi32(__mmask8 __M) {
// CHECK-LABEL: test_mm256_maskz_set1_epi32
// CHECK: insertelement <8 x i32> poison, i32 %{{.*}}, i32 0
@@ -7240,6 +7246,8 @@ __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) {
return _mm256_maskz_set1_epi32(__M, 5);
}
+TEST_CONSTEXPR(match_v8si(_mm256_maskz_set1_epi32(0xAA, 13), 0, 13, 0, 13, 0, 13, 0, 13));
+
__m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
// CHECK-LABEL: test_mm_mask_set1_epi64
// CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
@@ -7249,6 +7257,8 @@ __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
return _mm_mask_set1_epi64(__O, __M, __A);
}
+TEST_CONSTEXPR(match_v2di(_mm_mask_set1_epi64(_mm_setzero_si128(), 0x3, 9), 9, 9));
+
__m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
// CHECK-LABEL: test_mm_maskz_set1_epi64
// CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
@@ -7258,6 +7268,8 @@ __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
return _mm_maskz_set1_epi64(__M, __A);
}
+TEST_CONSTEXPR(match_v2di(_mm_maskz_set1_epi64(0x2, 15), 0, 15));
+
__m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) {
// CHECK-LABEL: test_mm256_mask_set1_epi64
// CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0
@@ -7269,6 +7281,8 @@ __m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) {
return _mm256_mask_set1_epi64(__O, __M, __A);
}
+TEST_CONSTEXPR(match_v4di(_mm256_mask_set1_epi64(_mm256_setzero_si256(), 0xF, 11), 11, 11, 11, 11));
+
__m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
// CHECK-LABEL: test_mm256_maskz_set1_epi64
// CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0
@@ -7280,6 +7294,8 @@ __m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
return _mm256_maskz_set1_epi64(__M, __A);
}
+TEST_CONSTEXPR(match_v4di(_mm256_maskz_set1_epi64(0xA, 17), 0, 17, 0, 17));
+
__m128d test_mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C) {
// CHECK-LABEL: test_mm_fixupimm_pd
// CHECK: @llvm.x86.avx512.mask.fixupimm.pd.128
@@ -7623,6 +7639,8 @@ __m128d test_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d
return _mm_mask_unpackhi_pd(__W, __U, __A, __B);
}
+TEST_CONSTEXPR(match_m128d(_mm_mask_unpackhi_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 2.0,4.0));
+
__m128d test_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: test_mm_maskz_unpackhi_pd
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
@@ -7637,6 +7655,8 @@ __m256d test_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m2
return _mm256_mask_unpackhi_pd(__W, __U, __A, __B);
}
+TEST_CONSTEXPR(match_m256d(_mm256_mask_unpackhi_pd(_mm256_setzero_pd(), 0xAA, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 0,6.0,0,8.0));
+
__m256d test_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) {
// CHECK-LABEL: test_mm256_maskz_unpackhi_pd
// CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -7679,6 +7699,8 @@ __m128d test_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d
return _mm_mask_unpacklo_pd(__W, __U, __A, __B);
}
+TEST_CONSTEXPR(match_m128d(_mm_mask_unpacklo_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 1.0,3.0));
+
__m128d test_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: test_mm_maskz_unpacklo_pd
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
@@ -7686,6 +7708,8 @@ __m128d test_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) {
return _mm_maskz_unpacklo_pd(__U, __A, __B);
}
+TEST_CONSTEXPR(match_m128d(_mm_maskz_unpacklo_pd(0x2, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 0.0,3.0));
+
__m256d test_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
// CHECK-LABEL: test_mm256_mask_unpacklo_pd
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -7693,6 +7717,8 @@ __m256d test_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m2
return _mm256_mask_unpacklo_pd(__W, __U, __A, __B);
}
+TEST_CONSTEXPR(match_m256d(_mm256_mask_unpacklo_pd(_mm256_setzero_pd(), 0xAA, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 0,5.0,0,7.0));
+
__m256d test_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) {
// CHECK-LABEL: test_mm256_maskz_unpacklo_pd
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -7700,6 +7726,8 @@ __m256d test_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) {
return _mm256_maskz_unpacklo_pd(__U, __A, __B);
}
+TEST_CONSTEXPR(match_m256d(_mm256_maskz_unpacklo_pd(0x0A, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 0.0,5.0,0.0,7.0));
+
__m128 test_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: test_mm_mask_unpacklo_ps
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -8039,6 +8067,8 @@ __m128i test_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m1
return _mm_mask_unpackhi_epi32(__W, __U, __A, __B);
}
+TEST_CONSTEXPR(match_v4si(_mm_mask_unpackhi_epi32(_mm_setzero_si128(), 0xA, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 0,6,0,7));
+
__m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_maskz_unpackhi_epi32
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -8046,6 +8076,8 @@ __m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
return _mm_maskz_unpackhi_epi32(__U, __A, __B);
}
+TEST_CONSTEXPR(match_v4si(_mm_maskz_unpackhi_epi32(0x5, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 2,0,3,0));
+
__m256i test_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_mask_unpackhi_epi32
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -8997,6 +9029,8 @@ __m256 test_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) {
return _mm256_maskz_broadcast_f32x4(__M, __A);
}
+TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x4(0xAA, (__m128)(__v4sf){0,1,2,3}), 0,1,0,3,0,1,0,3));
+
__m256i test_mm256_broadcast_i32x4(__m128i const* __A) {
// CHECK-LABEL: test_mm256_broadcast_i32x4
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -9018,6 +9052,8 @@ __m256i test_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i const* __A) {
return _mm256_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A));
}
+TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x4(0xAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,3,0,1,0,3));
+
__m256d test_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) {
// CHECK-LABEL: test_mm256_mask_broadcastsd_pd
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
@@ -10324,6 +10360,8 @@ __m128 test_mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
return _mm_mask_movehdup_ps(__W, __U, __A);
}
+TEST_CONSTEXPR(match_m128(_mm_mask_movehdup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 2.f,2.f,4.f,4.f));
+
__m128 test_mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) {
// CHECK-LABEL: test_mm_maskz_movehdup_ps
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -10338,6 +10376,8 @@ __m256 test_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
return _mm256_mask_movehdup_ps(__W, __U, __A);
}
+TEST_CONSTEXPR(match_m256(_mm256_mask_movehdup_ps(_mm256_setzero_ps(), 0xAA, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 0,2,0,4,0,6,0,8));
+
__m256 test_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: test_mm256_maskz_movehdup_ps
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -10352,6 +10392,8 @@ __m128 test_mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
return _mm_mask_moveldup_ps(__W, __U, __A);
}
+TEST_CONSTEXPR(match_m128(_mm_mask_moveldup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,1.f,3.f,3.f));
+
__m128 test_mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) {
// CHECK-LABEL: test_mm_maskz_moveldup_ps
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -10366,6 +10408,8 @@ __m256 test_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
return _mm256_mask_moveldup_ps(__W, __U, __A);
}
+TEST_CONSTEXPR(match_m256(_mm256_mask_moveldup_ps(_mm256_setzero_ps(), 0xAA, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 0,1,0,3,0,5,0,7));
+
__m256 test_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: test_mm256_maskz_moveldup_ps
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c
index 9388457..4773b60 100644
--- a/clang/test/CodeGen/X86/avx512vldq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c
@@ -987,6 +987,8 @@ __m256 test_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) {
return _mm256_mask_broadcast_f32x2(__O, __M, __A);
}
+TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xAA, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 0,2.f,0,2.f,0,2.f,0,2.f));
+
__m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) {
// CHECK-LABEL: test_mm256_maskz_broadcast_f32x2
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -994,6 +996,8 @@ __m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) {
return _mm256_maskz_broadcast_f32x2(__M, __A);
}
+TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xAA, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 0,2.f,0,2.f,0,2.f,0,2.f));
+
__m256d test_mm256_broadcast_f64x2(double const* __A) {
// CHECK-LABEL: test_mm256_broadcast_f64x2
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -1008,6 +1012,8 @@ __m256d test_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, double const*
return _mm256_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A));
}
+TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xA, (__m128d)(__v2df){1.0,2.0}), 0,2.0,0,2.0));
+
__m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) {
// CHECK-LABEL: test_mm256_maskz_broadcast_f64x2
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -1015,6 +1021,8 @@ __m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) {
return _mm256_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A));
}
+TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xA, (__m128d)(__v2df){1.0,2.0}), 0,2.0,0,2.0));
+
__m128i test_mm_broadcast_i32x2(__m128i __A) {
// CHECK-LABEL: test_mm_broadcast_i32x2
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -1029,6 +1037,8 @@ __m128i test_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) {
return _mm_mask_broadcast_i32x2(__O, __M, __A);
}
+TEST_CONSTEXPR(match_v4si(_mm_mask_broadcast_i32x2(_mm_setzero_si128(), 0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1));
+
__m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
// CHECK-LABEL: test_mm_maskz_broadcast_i32x2
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -1036,6 +1046,8 @@ __m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
return _mm_maskz_broadcast_i32x2(__M, __A);
}
+TEST_CONSTEXPR(match_v4si(_mm_maskz_broadcast_i32x2(0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1));
+
__m256i test_mm256_broadcast_i32x2(__m128i __A) {
// CHECK-LABEL: test_mm256_broadcast_i32x2
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -1050,6 +1062,8 @@ __m256i test_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A)
return _mm256_mask_broadcast_i32x2(__O, __M, __A);
}
+TEST_CONSTEXPR(match_v8si(_mm256_mask_broadcast_i32x2(_mm256_setzero_si256(), 0xAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1));
+
__m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
// CHECK-LABEL: test_mm256_maskz_broadcast_i32x2
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -1057,6 +1071,8 @@ __m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
return _mm256_maskz_broadcast_i32x2(__M, __A);
}
+TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x2(0xAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1));
+
__m256i test_mm256_broadcast_i64x2(__m128i const* __A) {
// CHECK-LABEL: test_mm256_broadcast_i64x2
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -1078,6 +1094,8 @@ __m256i test_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i const* __A) {
return _mm256_maskz_broadcast_i64x2(__M, _mm_loadu_si128(__A));
}
+TEST_CONSTEXPR(match_v4di(_mm256_maskz_broadcast_i64x2(0xF, (__m128i)(__v2di){1,2}), 1,2,1,2));
+
__m128d test_mm256_extractf64x2_pd(__m256d __A) {
// CHECK-LABEL: test_mm256_extractf64x2_pd
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/load-store.c b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c
index 2dde75f..50f7011 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/load-store.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c
@@ -8,8 +8,8 @@
// CHECK-LABEL: @test_vld1q_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[BASE:%.*]], align 2
-// CHECK-NEXT: ret <8 x half> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[BASE:%.*]], align 2
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vld1q_f16(const float16_t *base)
{
@@ -22,8 +22,8 @@ float16x8_t test_vld1q_f16(const float16_t *base)
// CHECK-LABEL: @test_vld1q_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[BASE:%.*]], align 4
-// CHECK-NEXT: ret <4 x float> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[BASE:%.*]], align 4
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vld1q_f32(const float32_t *base)
{
@@ -36,8 +36,8 @@ float32x4_t test_vld1q_f32(const float32_t *base)
// CHECK-LABEL: @test_vld1q_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1
-// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
int8x16_t test_vld1q_s8(const int8_t *base)
{
@@ -50,8 +50,8 @@ int8x16_t test_vld1q_s8(const int8_t *base)
// CHECK-LABEL: @test_vld1q_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2
-// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vld1q_s16(const int16_t *base)
{
@@ -64,8 +64,8 @@ int16x8_t test_vld1q_s16(const int16_t *base)
// CHECK-LABEL: @test_vld1q_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
int32x4_t test_vld1q_s32(const int32_t *base)
{
@@ -78,8 +78,8 @@ int32x4_t test_vld1q_s32(const int32_t *base)
// CHECK-LABEL: @test_vld1q_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1
-// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vld1q_u8(const uint8_t *base)
{
@@ -92,8 +92,8 @@ uint8x16_t test_vld1q_u8(const uint8_t *base)
// CHECK-LABEL: @test_vld1q_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2
-// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
uint16x8_t test_vld1q_u16(const uint16_t *base)
{
@@ -106,8 +106,8 @@ uint16x8_t test_vld1q_u16(const uint16_t *base)
// CHECK-LABEL: @test_vld1q_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vld1q_u32(const uint32_t *base)
{
@@ -120,10 +120,10 @@ uint32x4_t test_vld1q_u32(const uint32_t *base)
// CHECK-LABEL: @test_vld1q_z_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer)
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]], <8 x half> zeroinitializer)
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vld1q_z_f16(const float16_t *base, mve_pred16_t p)
{
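// The predicated (vld1q_z) forms below all share one shape: the 16-bit
// predicate is zero-extended, converted to a lane mask via
// llvm.arm.mve.pred.i2v, and fed to llvm.masked.load with a zeroinitializer
// passthrough, so inactive lanes read as zero. (Hedged summary of the CHECK
// lines; only the temporary numbering changes in this patch.)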
@@ -136,10 +136,10 @@ float16x8_t test_vld1q_z_f16(const float16_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vld1q_z_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer)
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]], <4 x float> zeroinitializer)
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
//
float32x4_t test_vld1q_z_f32(const float32_t *base, mve_pred16_t p)
{
@@ -152,10 +152,10 @@ float32x4_t test_vld1q_z_f32(const float32_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vld1q_z_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
-// CHECK-NEXT: ret <16 x i8> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> zeroinitializer)
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vld1q_z_s8(const int8_t *base, mve_pred16_t p)
{
@@ -168,10 +168,10 @@ int8x16_t test_vld1q_z_s8(const int8_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vld1q_z_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
-// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]], <8 x i16> zeroinitializer)
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vld1q_z_s16(const int16_t *base, mve_pred16_t p)
{
@@ -184,10 +184,10 @@ int16x8_t test_vld1q_z_s16(const int16_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vld1q_z_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
-// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vld1q_z_s32(const int32_t *base, mve_pred16_t p)
{
@@ -200,10 +200,10 @@ int32x4_t test_vld1q_z_s32(const int32_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vld1q_z_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
-// CHECK-NEXT: ret <16 x i8> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> zeroinitializer)
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
uint8x16_t test_vld1q_z_u8(const uint8_t *base, mve_pred16_t p)
{
@@ -216,10 +216,10 @@ uint8x16_t test_vld1q_z_u8(const uint8_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vld1q_z_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
-// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]], <8 x i16> zeroinitializer)
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vld1q_z_u16(const uint16_t *base, mve_pred16_t p)
{
@@ -232,10 +232,10 @@ uint16x8_t test_vld1q_z_u16(const uint16_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vld1q_z_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
-// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vld1q_z_u32(const uint32_t *base, mve_pred16_t p)
{
@@ -248,8 +248,8 @@ uint32x4_t test_vld1q_z_u32(const uint32_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrbq_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1
-// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
int8x16_t test_vldrbq_s8(const int8_t *base)
{
@@ -258,9 +258,9 @@ int8x16_t test_vldrbq_s8(const int8_t *base)
// CHECK-LABEL: @test_vldrbq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[BASE:%.*]], align 1
-// CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[BASE:%.*]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
//
int16x8_t test_vldrbq_s16(const int8_t *base)
{
@@ -269,9 +269,9 @@ int16x8_t test_vldrbq_s16(const int8_t *base)
// CHECK-LABEL: @test_vldrbq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[BASE:%.*]], align 1
-// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[BASE:%.*]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
int32x4_t test_vldrbq_s32(const int8_t *base)
{
@@ -280,8 +280,8 @@ int32x4_t test_vldrbq_s32(const int8_t *base)
// CHECK-LABEL: @test_vldrbq_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1
-// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vldrbq_u8(const uint8_t *base)
{
@@ -290,9 +290,9 @@ uint8x16_t test_vldrbq_u8(const uint8_t *base)
// CHECK-LABEL: @test_vldrbq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[BASE:%.*]], align 1
-// CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[BASE:%.*]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
//
uint16x8_t test_vldrbq_u16(const uint8_t *base)
{
@@ -301,9 +301,9 @@ uint16x8_t test_vldrbq_u16(const uint8_t *base)
// CHECK-LABEL: @test_vldrbq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[BASE:%.*]], align 1
-// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[BASE:%.*]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
uint32x4_t test_vldrbq_u32(const uint8_t *base)
{
@@ -312,10 +312,10 @@ uint32x4_t test_vldrbq_u32(const uint8_t *base)
// CHECK-LABEL: @test_vldrbq_z_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
-// CHECK-NEXT: ret <16 x i8> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> zeroinitializer)
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
int8x16_t test_vldrbq_z_s8(const int8_t *base, mve_pred16_t p)
{
@@ -324,11 +324,11 @@ int8x16_t test_vldrbq_z_s8(const int8_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrbq_z_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer)
-// CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
//
int16x8_t test_vldrbq_z_s16(const int8_t *base, mve_pred16_t p)
{
@@ -337,11 +337,11 @@ int16x8_t test_vldrbq_z_s16(const int8_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrbq_z_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer)
-// CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
int32x4_t test_vldrbq_z_s32(const int8_t *base, mve_pred16_t p)
{
@@ -350,10 +350,10 @@ int32x4_t test_vldrbq_z_s32(const int8_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrbq_z_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer)
-// CHECK-NEXT: ret <16 x i8> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP1]], <16 x i8> zeroinitializer)
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
//
uint8x16_t test_vldrbq_z_u8(const uint8_t *base, mve_pred16_t p)
{
@@ -362,11 +362,11 @@ uint8x16_t test_vldrbq_z_u8(const uint8_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrbq_z_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer)
-// CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP1]], <8 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
//
uint16x8_t test_vldrbq_z_u16(const uint8_t *base, mve_pred16_t p)
{
@@ -375,11 +375,11 @@ uint16x8_t test_vldrbq_z_u16(const uint8_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrbq_z_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer)
-// CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP1]], <4 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
uint32x4_t test_vldrbq_z_u32(const uint8_t *base, mve_pred16_t p)
{
@@ -388,8 +388,8 @@ uint32x4_t test_vldrbq_z_u32(const uint8_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrhq_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[BASE:%.*]], align 2
-// CHECK-NEXT: ret <8 x half> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[BASE:%.*]], align 2
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vldrhq_f16(const float16_t *base)
{
@@ -398,8 +398,8 @@ float16x8_t test_vldrhq_f16(const float16_t *base)
// CHECK-LABEL: @test_vldrhq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2
-// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vldrhq_s16(const int16_t *base)
{
@@ -408,9 +408,9 @@ int16x8_t test_vldrhq_s16(const int16_t *base)
// CHECK-LABEL: @test_vldrhq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[BASE:%.*]], align 2
-// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[BASE:%.*]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
int32x4_t test_vldrhq_s32(const int16_t *base)
{
@@ -419,8 +419,8 @@ int32x4_t test_vldrhq_s32(const int16_t *base)
// CHECK-LABEL: @test_vldrhq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2
-// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
uint16x8_t test_vldrhq_u16(const uint16_t *base)
{
@@ -429,9 +429,9 @@ uint16x8_t test_vldrhq_u16(const uint16_t *base)
// CHECK-LABEL: @test_vldrhq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[BASE:%.*]], align 2
-// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[BASE:%.*]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
uint32x4_t test_vldrhq_u32(const uint16_t *base)
{
@@ -440,10 +440,10 @@ uint32x4_t test_vldrhq_u32(const uint16_t *base)
// CHECK-LABEL: @test_vldrhq_z_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer)
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]], <8 x half> zeroinitializer)
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vldrhq_z_f16(const float16_t *base, mve_pred16_t p)
{
@@ -452,10 +452,10 @@ float16x8_t test_vldrhq_z_f16(const float16_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrhq_z_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
-// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]], <8 x i16> zeroinitializer)
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vldrhq_z_s16(const int16_t *base, mve_pred16_t p)
{
@@ -464,11 +464,11 @@ int16x8_t test_vldrhq_z_s16(const int16_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrhq_z_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer)
-// CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP1]], <4 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
int32x4_t test_vldrhq_z_s32(const int16_t *base, mve_pred16_t p)
{
@@ -477,10 +477,10 @@ int32x4_t test_vldrhq_z_s32(const int16_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrhq_z_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer)
-// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]], <8 x i16> zeroinitializer)
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
uint16x8_t test_vldrhq_z_u16(const uint16_t *base, mve_pred16_t p)
{
@@ -489,11 +489,11 @@ uint16x8_t test_vldrhq_z_u16(const uint16_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrhq_z_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer)
-// CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP1]], <4 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
uint32x4_t test_vldrhq_z_u32(const uint16_t *base, mve_pred16_t p)
{
@@ -502,8 +502,8 @@ uint32x4_t test_vldrhq_z_u32(const uint16_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrwq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[BASE:%.*]], align 4
-// CHECK-NEXT: ret <4 x float> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[BASE:%.*]], align 4
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vldrwq_f32(const float32_t *base)
{
@@ -512,8 +512,8 @@ float32x4_t test_vldrwq_f32(const float32_t *base)
// CHECK-LABEL: @test_vldrwq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
int32x4_t test_vldrwq_s32(const int32_t *base)
{
@@ -522,8 +522,8 @@ int32x4_t test_vldrwq_s32(const int32_t *base)
// CHECK-LABEL: @test_vldrwq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vldrwq_u32(const uint32_t *base)
{
@@ -532,10 +532,10 @@ uint32x4_t test_vldrwq_u32(const uint32_t *base)
// CHECK-LABEL: @test_vldrwq_z_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer)
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]], <4 x float> zeroinitializer)
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
//
float32x4_t test_vldrwq_z_f32(const float32_t *base, mve_pred16_t p)
{
@@ -544,10 +544,10 @@ float32x4_t test_vldrwq_z_f32(const float32_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrwq_z_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
-// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vldrwq_z_s32(const int32_t *base, mve_pred16_t p)
{
@@ -556,10 +556,10 @@ int32x4_t test_vldrwq_z_s32(const int32_t *base, mve_pred16_t p)
// CHECK-LABEL: @test_vldrwq_z_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer)
-// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vldrwq_z_u32(const uint32_t *base, mve_pred16_t p)
{
@@ -680,9 +680,9 @@ void test_vst1q_u32(uint32_t *base, uint32x4_t value)
// CHECK-LABEL: @test_vst1q_p_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0(<8 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0(<8 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vst1q_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p)
@@ -696,9 +696,9 @@ void test_vst1q_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vst1q_p_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vst1q_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p)
@@ -712,9 +712,9 @@ void test_vst1q_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vst1q_p_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vst1q_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p)
@@ -728,9 +728,9 @@ void test_vst1q_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vst1q_p_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vst1q_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p)
@@ -744,9 +744,9 @@ void test_vst1q_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vst1q_p_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vst1q_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p)
@@ -760,9 +760,9 @@ void test_vst1q_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vst1q_p_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vst1q_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p)
@@ -776,9 +776,9 @@ void test_vst1q_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vst1q_p_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vst1q_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p)
@@ -792,9 +792,9 @@ void test_vst1q_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vst1q_p_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vst1q_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p)
@@ -896,9 +896,9 @@ void test_vstrbq_u32(uint8_t *base, uint32x4_t value)
// CHECK-LABEL: @test_vstrbq_p_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vstrbq_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p)
@@ -913,9 +913,9 @@ void test_vstrbq_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrbq_p_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP2]])
// CHECK-NEXT: ret void
//
void test_vstrbq_p_s16(int8_t *base, int16x8_t value, mve_pred16_t p)
@@ -930,9 +930,9 @@ void test_vstrbq_p_s16(int8_t *base, int16x8_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrbq_p_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
-// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP2]])
// CHECK-NEXT: ret void
//
void test_vstrbq_p_s32(int8_t *base, int32x4_t value, mve_pred16_t p)
@@ -946,9 +946,9 @@ void test_vstrbq_p_s32(int8_t *base, int32x4_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrbq_p_u8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vstrbq_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p)
@@ -963,9 +963,9 @@ void test_vstrbq_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrbq_p_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP2]])
// CHECK-NEXT: ret void
//
void test_vstrbq_p_u16(uint8_t *base, uint16x8_t value, mve_pred16_t p)
@@ -980,9 +980,9 @@ void test_vstrbq_p_u16(uint8_t *base, uint16x8_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrbq_p_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
-// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP2]])
// CHECK-NEXT: ret void
//
void test_vstrbq_p_u32(uint8_t *base, uint32x4_t value, mve_pred16_t p)
@@ -1068,9 +1068,9 @@ void test_vstrhq_u32(uint16_t *base, uint32x4_t value)
// CHECK-LABEL: @test_vstrhq_p_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0(<8 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0(<8 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vstrhq_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p)
@@ -1084,9 +1084,9 @@ void test_vstrhq_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrhq_p_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vstrhq_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p)
@@ -1101,9 +1101,9 @@ void test_vstrhq_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrhq_p_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
-// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> [[TMP0]], ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> [[TMP0]], ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP2]])
// CHECK-NEXT: ret void
//
void test_vstrhq_p_s32(int16_t *base, int32x4_t value, mve_pred16_t p)
@@ -1117,9 +1117,9 @@ void test_vstrhq_p_s32(int16_t *base, int32x4_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrhq_p_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vstrhq_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p)
@@ -1134,9 +1134,9 @@ void test_vstrhq_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrhq_p_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
-// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> [[TMP0]], ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> [[TMP0]], ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP2]])
// CHECK-NEXT: ret void
//
void test_vstrhq_p_u32(uint16_t *base, uint32x4_t value, mve_pred16_t p)
@@ -1192,9 +1192,9 @@ void test_vstrwq_u32(uint32_t *base, uint32x4_t value)
// CHECK-LABEL: @test_vstrwq_p_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vstrwq_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p)
@@ -1208,9 +1208,9 @@ void test_vstrwq_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrwq_p_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vstrwq_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p)
@@ -1224,9 +1224,9 @@ void test_vstrwq_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p)
// CHECK-LABEL: @test_vstrwq_p_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
-// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]])
// CHECK-NEXT: ret void
//
void test_vstrwq_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p)
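(The hunks above are mechanical: the FileCheck temporaries are renumbered from %[[TMP1]]/%[[TMP2]]/... down to %[[TMP0]]/%[[TMP1]]/..., with no change to the IR being matched. For orientation, a minimal sketch of the kind of source these predicated load/store CHECK lines correspond to, assuming <arm_mve.h> and an MVE-enabled target; copy_first_n_u32 and its parameters are illustrative, not part of the test:)

#include <arm_mve.h>

// Predicated copy: vctp32q builds a lane predicate covering the first
// n of the 4 lanes, vld1q_z zero-fills the inactive lanes on load, and
// vst1q_p leaves the inactive lanes of the destination untouched.
void copy_first_n_u32(uint32_t *dst, const uint32_t *src, unsigned n)
{
    mve_pred16_t p = vctp32q(n);
    uint32x4_t v = vld1q_z(src, p);
    vst1q_p(dst, v, p);
}

(The load and store here lower to the @llvm.masked.load/@llvm.masked.store calls matched above; the predicate comes from vctp32q rather than from a mve_pred16_t parameter, so the zext + @llvm.arm.mve.pred.i2v prologue the test checks is replaced by a vctp intrinsic.)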
diff --git a/clang/test/CodeGen/attr-target-mv.c b/clang/test/CodeGen/attr-target-mv.c
index 607e3e4..b8807dd 100644
--- a/clang/test/CodeGen/attr-target-mv.c
+++ b/clang/test/CodeGen/attr-target-mv.c
@@ -31,6 +31,7 @@ int __attribute__((target("arch=pantherlake"))) foo(void) {return 25;}
int __attribute__((target("arch=clearwaterforest"))) foo(void) {return 26;}
int __attribute__((target("arch=diamondrapids"))) foo(void) {return 27;}
int __attribute__((target("arch=wildcatlake"))) foo(void) {return 28;}
+int __attribute__((target("arch=novalake"))) foo(void) {return 29;}
int __attribute__((target("default"))) foo(void) { return 2; }
int bar(void) {
@@ -206,6 +207,8 @@ void calls_pr50025c(void) { pr50025c(); }
// ITANIUM: ret i32 27
// ITANIUM: define{{.*}} i32 @foo.arch_wildcatlake()
// ITANIUM: ret i32 28
+// ITANIUM: define{{.*}} i32 @foo.arch_novalake()
+// ITANIUM: ret i32 29
// ITANIUM: define{{.*}} i32 @foo()
// ITANIUM: ret i32 2
// ITANIUM: define{{.*}} i32 @bar()
@@ -267,6 +270,8 @@ void calls_pr50025c(void) { pr50025c(); }
// WINDOWS: ret i32 27
// WINDOWS: define dso_local i32 @foo.arch_wildcatlake()
// WINDOWS: ret i32 28
+// WINDOWS: define dso_local i32 @foo.arch_novalake()
+// WINDOWS: ret i32 29
// WINDOWS: define dso_local i32 @foo()
// WINDOWS: ret i32 2
// WINDOWS: define dso_local i32 @bar()
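(For context on the hunk above: with target("arch=...") multiversioning, clang emits one definition per variant plus a resolver, exposed as an ifunc on Itanium targets, that picks the variant matching the host CPU at load time. A minimal standalone sketch of the pattern this test extends; only the novalake variant reflects the change, everything else is illustrative:)

#include <stdio.h>

int __attribute__((target("arch=novalake"))) pick(void) { return 29; }
int __attribute__((target("default"))) pick(void) { return 2; }

int main(void)
{
    // Resolved once through the generated resolver; prints 29 only on
    // a CPU the resolver identifies as Nova Lake, otherwise 2.
    printf("%d\n", pick());
    return 0;
}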
diff --git a/clang/test/CodeGen/distributed-thin-lto/supports-hot-cold-new.ll b/clang/test/CodeGen/distributed-thin-lto/supports-hot-cold-new.ll
index 08c1a29..90cda3e 100644
--- a/clang/test/CodeGen/distributed-thin-lto/supports-hot-cold-new.ll
+++ b/clang/test/CodeGen/distributed-thin-lto/supports-hot-cold-new.ll
@@ -22,7 +22,7 @@
; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t1.o -x ir %t.o -c -fthinlto-index=%t.o.thinlto.bc -save-temps=obj
-; RUN: llvm-dis %t.s.3.import.bc -o - | FileCheck %s --check-prefix=CHECK-IR
+; RUN: llvm-dis %t.s.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR
; CHECK-IR: !memprof {{.*}} !callsite
; CHECK-IR: "memprof"="cold"
@@ -42,10 +42,15 @@
; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t1.o -x ir %t.o -c -fthinlto-index=%t.o.thinlto.bc -save-temps=obj
-; RUN: llvm-dis %t.s.3.import.bc -o - | FileCheck %s \
+; RUN: llvm-dis %t.s.4.opt.bc -o - | FileCheck %s \
; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
; RUN: --implicit-check-not "memprof"="cold"
+;; Ensure the attributes and metadata are stripped when running a non-LTO pipeline.
+; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -x ir %t.o -S -emit-llvm -o - | FileCheck %s \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
+
source_filename = "thinlto-distributed-supports-hot-cold-new.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c
index 2c0d83c..47d5ae5 100644
--- a/clang/test/CodeGen/target-builtin-noerror.c
+++ b/clang/test/CodeGen/target-builtin-noerror.c
@@ -179,6 +179,7 @@ void verifycpustrings(void) {
(void)__builtin_cpu_is("clearwaterforest");
(void)__builtin_cpu_is("pantherlake");
(void)__builtin_cpu_is("wildcatlake");
+ (void)__builtin_cpu_is("novalake");
(void)__builtin_cpu_is("haswell");
(void)__builtin_cpu_is("icelake-client");
(void)__builtin_cpu_is("icelake-server");
diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl
index 43ddd2e..2cf6a10 100644
--- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl
+++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl
@@ -1,64 +1,112 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
-// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,DXIL
+// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,SPV
// NOTE: SPIRV codegen for resource methods is not yet implemented
StructuredBuffer<float> SB1 : register(t0);
RWStructuredBuffer<float> RWSB1 : register(u0);
-RWStructuredBuffer<float> RWSB2 : register(u1);
+RWStructuredBuffer<uint4> RWSB2 : register(u1);
AppendStructuredBuffer<float> ASB : register(u2);
-ConsumeStructuredBuffer<float> CSB : register(u3);
+ConsumeStructuredBuffer<double> CSB : register(u3);
-// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) }
-// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
-// CHECK: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
-// CHECK: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
+// DXIL: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) }
+// DXIL: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
+// DXIL: %"class.hlsl::RWStructuredBuffer.0" = type { target("dx.RawBuffer", <4 x i32>, 1, 0), target("dx.RawBuffer", <4 x i32>, 1, 0) }
+// DXIL: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
+// DXIL: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", double, 1, 0), target("dx.RawBuffer", double, 1, 0) }
export int TestIncrementCounter() {
return RWSB1.IncrementCounter();
}
-// CHECK: define noundef i32 @_Z20TestIncrementCounterv()
-// CHECK-DXIL: %[[INDEX:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 1)
-// CHECK-DXIL: ret i32 %[[INDEX]]
+// CHECK: define noundef i32 @TestIncrementCounter()()
+// CHECK: call noundef i32 @hlsl::RWStructuredBuffer<float>::IncrementCounter()(ptr {{.*}} @RWSB1)
+// CHECK: ret
+
+// CHECK: define {{.*}} noundef i32 @hlsl::RWStructuredBuffer<float>::IncrementCounter()(ptr {{.*}} %this)
+// CHECK: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %{{.*}}, i32 0, i32 1
+// DXIL-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 0), ptr %__counter_handle
+// DXIL-NEXT: %[[COUNTER:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %[[COUNTER_HANDLE]], i8 1)
+// CHECK-NEXT: ret i32 %[[COUNTER]]
+
export int TestDecrementCounter() {
return RWSB2.DecrementCounter();
}
+// CHECK: define {{.*}} i32 @TestDecrementCounter()()
+// CHECK: call noundef i32 @hlsl::RWStructuredBuffer<unsigned int vector[4]>::DecrementCounter()(ptr {{.*}} @RWSB2)
+// CHECK: ret
-// CHECK: define noundef i32 @_Z20TestDecrementCounterv()
-// CHECK-DXIL: %[[INDEX:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 -1)
-// CHECK-DXIL: ret i32 %[[INDEX]]
+// CHECK: define {{.*}} noundef i32 @hlsl::RWStructuredBuffer<unsigned int vector[4]>::DecrementCounter()(ptr {{.*}} %this)
+// CHECK: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer.0", ptr %{{.*}}, i32 0, i32 1
+// DXIL-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", <4 x i32>, 1, 0), ptr %__counter_handle
+// DXIL-NEXT: %[[COUNTER:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_v4i32_1_0t(target("dx.RawBuffer", <4 x i32>, 1, 0) %[[COUNTER_HANDLE]], i8 -1)
+// CHECK-NEXT: ret i32 %[[COUNTER]]
export void TestAppend(float value) {
ASB.Append(value);
}
-// CHECK: define void @_Z10TestAppendf(float noundef nofpclass(nan inf) %value)
-// CHECK-DXIL: %[[VALUE:.*]] = load float, ptr %value.addr, align 4
-// CHECK-DXIL: %[[INDEX:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 1)
-// CHECK-DXIL: %[[RESPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i32 %[[INDEX]])
-// CHECK-DXIL: store float %[[VALUE]], ptr %[[RESPTR]], align 4
+// CHECK: define void @TestAppend(float)(float {{.*}} %value)
+// CHECK: call void @hlsl::AppendStructuredBuffer<float>::Append(float)(ptr {{.*}} @ASB, float noundef nofpclass(nan inf) %0)
+// CHECK: ret void
+
+// CHECK: define {{.*}} void @hlsl::AppendStructuredBuffer<float>::Append(float)(ptr {{.*}} %this, float noundef nofpclass(nan inf) %value)
+// CHECK: %[[VALUE:.*]] = load float, ptr %value.addr
+// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %{{.*}}, i32 0, i32 0
+// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 0), ptr %__handle
+// CHECK-NEXT: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %{{.*}}, i32 0, i32 1
+// DXIL-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 0), ptr %__counter_handle
+// DXIL-NEXT: %[[COUNTER:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %[[COUNTER_HANDLE]], i8 1)
+// DXIL-NEXT: %[[PTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %[[HANDLE]], i32 %[[COUNTER]])
+// CHECK-NEXT: store float %[[VALUE]], ptr %[[PTR]]
+// CHECK-NEXT: ret void
-export float TestConsume() {
+export double TestConsume() {
return CSB.Consume();
}
-
-// CHECK: define noundef nofpclass(nan inf) float @_Z11TestConsumev()
-// CHECK-DXIL: %[[INDEX:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %1, i8 -1)
-// CHECK-DXIL: %[[RESPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %0, i32 %[[INDEX]])
-// CHECK-DXIL: %[[VALUE:.*]] = load float, ptr %[[RESPTR]], align 4
-// CHECK-DXIL: ret float %[[VALUE]]
+// CHECK: define {{.*}} double @TestConsume()()
+// CHECK: call {{.*}} double @hlsl::ConsumeStructuredBuffer<double>::Consume()(ptr {{.*}} @CSB)
+// CHECK: ret double
+
+// CHECK: define {{.*}} double @hlsl::ConsumeStructuredBuffer<double>::Consume()(ptr {{.*}} %this)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::ConsumeStructuredBuffer", ptr %{{.*}}, i32 0, i32 0
+// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", double, 1, 0), ptr %__handle
+// CHECK-NEXT: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::ConsumeStructuredBuffer", ptr %{{.*}}, i32 0, i32 1
+// DXIL-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", double, 1, 0), ptr %__counter_handle
+// DXIL-NEXT: %[[COUNTER:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0) %[[COUNTER_HANDLE]], i8 -1)
+// DXIL-NEXT: %[[PTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0) %[[HANDLE]], i32 %[[COUNTER]])
+// CHECK-NEXT: %[[VAL:.*]] = load double, ptr %[[PTR]], align 8
+// CHECK-NEXT: ret double %[[VAL]]
export float TestLoad() {
return RWSB1.Load(1) + SB1.Load(2);
}
-// CHECK: define noundef nofpclass(nan inf) float @_Z8TestLoadv()
-// CHECK: %[[PTR1:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i32 %{{[0-9]+}})
-// CHECK: %[[VALUE1:.*]] = load float, ptr %[[PTR1]]
-// CHECK: %[[PTR2:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0) %{{[0-9]+}}, i32 %{{[0-9]+}})
-// CHECK: %[[VALUE2:.*]] = load float, ptr %[[PTR2]]
+// CHECK: define noundef nofpclass(nan inf) float @TestLoad()()
+// CHECK: call {{.*}} float @hlsl::RWStructuredBuffer<float>::Load(unsigned int)(ptr {{.*}} @RWSB1, i32 noundef 1)
+// CHECK: call {{.*}} float @hlsl::StructuredBuffer<float>::Load(unsigned int)(ptr {{.*}} @SB1, i32 noundef 2)
+// CHECK: add
+// CHECK: ret float
+
+// CHECK: define {{.*}} float @hlsl::RWStructuredBuffer<float>::Load(unsigned int)(ptr {{.*}} %this, i32 noundef %Index)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %{{.*}}, i32 0, i32 0
+// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 0), ptr %__handle
+// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr
+// DXIL-NEXT: %[[PTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %[[HANDLE]], i32 %[[INDEX]])
+// CHECK-NEXT: %[[VAL:.*]] = load float, ptr %[[PTR]]
+// CHECK-NEXT: ret float %[[VAL]]
+
+// CHECK: define {{.*}} float @hlsl::StructuredBuffer<float>::Load(unsigned int)(ptr {{.*}} %this, i32 noundef %Index)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::StructuredBuffer", ptr %{{.*}}, i32 0, i32 0
+// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 0, 0), ptr %__handle
+// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr
+// DXIL-NEXT: %[[PTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0) %[[HANDLE]], i32 %[[INDEX]])
+// CHECK-NEXT: %[[VAL:.*]] = load float, ptr %[[PTR]]
+// CHECK-NEXT: ret float %[[VAL]]
-// CHECK: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i8)
-// CHECK: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i32)
-// CHECK: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0), i32)
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i8)
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_v4i32_1_0t(target("dx.RawBuffer", <4 x i32>, 1, 0), i8)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i32)
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0), i8)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0), i32)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0), i32)
diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl
index 9e08a6d..47c1d0d 100644
--- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl
+++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl
@@ -1,37 +1,71 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-pixel -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
-// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-pixel -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-pixel -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,DXIL
+// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-pixel -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,SPV
// NOTE: SPIRV codegen for resource methods is not yet implemented
-RWStructuredBuffer<float> RWSB1, RWSB2;
-RasterizerOrderedStructuredBuffer<float> ROSB1, ROSB2;
+RasterizerOrderedStructuredBuffer<float> ROSB1;
+RasterizerOrderedStructuredBuffer<int2> ROSB2;
-// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
+// %"class.hlsl::RasterizerOrderedStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 1), target("dx.RawBuffer", float, 1, 1) }
+// %"class.hlsl::RasterizerOrderedStructuredBuffer.0" = type { target("dx.RawBuffer", <2 x i32>, 1, 1), target("dx.RawBuffer", <2 x i32>, 1, 1) }
+
+// CHECK: @ROSB1 = internal global %"class.hlsl::RasterizerOrderedStructuredBuffer" poison
+// CHECK: @ROSB2 = internal global %"class.hlsl::RasterizerOrderedStructuredBuffer.0" poison
export void TestIncrementCounter() {
-// CHECK: define void @_Z20TestIncrementCounterv()
-// CHECK-DXIL: call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 1)
-// CHECK-DXIL: call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %{{[0-9]+}}, i8 1)
- RWSB1.IncrementCounter();
ROSB1.IncrementCounter();
}
+// CHECK: define void @TestIncrementCounter()()
+// CHECK: call noundef i32 @hlsl::RasterizerOrderedStructuredBuffer<float>::IncrementCounter()(ptr {{.*}} @ROSB1)
+// CHECK-NEXT: ret void
+
+// CHECK: define {{.*}} i32 @hlsl::RasterizerOrderedStructuredBuffer<float>::IncrementCounter()(ptr {{.*}} %this)
+// CHECK: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer", ptr %{{.*}}, i32 0, i32 1
+// CHECK-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 1), ptr %__counter_handle
+// DXIL-NEXT: %[[VAL:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %[[COUNTER_HANDLE]], i8 1)
+// CHECK-NEXT: ret i32 %[[VAL]]
+
export void TestDecrementCounter() {
-// CHECK: define void @_Z20TestDecrementCounterv()
-// CHECK-DXIL: call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 -1)
-// CHECK-DXIL: call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %{{[0-9]+}}, i8 -1)
- RWSB2.DecrementCounter();
ROSB2.DecrementCounter();
}
+// CHECK: define void @TestDecrementCounter()()
+// CHECK: call noundef i32 @hlsl::RasterizerOrderedStructuredBuffer<int vector[2]>::DecrementCounter()(ptr {{.*}} @ROSB2)
+// CHECK-NEXT: ret void
+
+// CHECK: define {{.*}} i32 @hlsl::RasterizerOrderedStructuredBuffer<int vector[2]>::DecrementCounter()(ptr {{.*}} %this)
+// CHECK: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer.0", ptr %{{.*}}, i32 0, i32 1
+// CHECK-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", <2 x i32>, 1, 1), ptr %__counter_handle
+// DXIL-NEXT: %[[VAL:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1) %[[COUNTER_HANDLE]], i8 -1)
+// CHECK-NEXT: ret i32 %[[VAL]]
+
export float TestLoad() {
- return ROSB1.Load(10);
+ return ROSB1.Load(10).x + ROSB2.Load(20).x;
}
-// CHECK: define noundef nofpclass(nan inf) float @_Z8TestLoadv()
-// CHECK: %[[PTR1:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %{{[0-9]+}}, i32 %{{[0-9]+}})
-// CHECK: %[[VALUE1:.*]] = load float, ptr %[[PTR1]]
+// CHECK: define {{.*}} float @TestLoad()()
+// CHECK: call {{.*}} float @hlsl::RasterizerOrderedStructuredBuffer<float>::Load(unsigned int)(ptr {{.*}} @ROSB1, i32 noundef 10)
+// CHECK: call {{.*}} <2 x i32> @hlsl::RasterizerOrderedStructuredBuffer<int vector[2]>::Load(unsigned int)(ptr {{.*}} @ROSB2, i32 noundef 20)
+// CHECK: ret
+
+// CHECK: define {{.*}} float @hlsl::RasterizerOrderedStructuredBuffer<float>::Load(unsigned int)(ptr {{.*}} %Index)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer", ptr {{.*}}, i32 0, i32 0
+// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 1), ptr %__handle
+// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr
+// DXIL-NEXT: %[[BUFPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %[[HANDLE]], i32 %[[INDEX]])
+// CHECK-NEXT: %[[VAL:.*]] = load float, ptr %[[BUFPTR]]
+// CHECK-NEXT: ret float %[[VAL]]
+
+// CHECK: define {{.*}} <2 x i32> @hlsl::RasterizerOrderedStructuredBuffer<int vector[2]>::Load(unsigned int)(ptr {{.*}} %Index)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer.0", ptr {{.*}}, i32 0, i32 0
+// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", <2 x i32>, 1, 1), ptr %__handle
+// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr
+// DXIL-NEXT: %[[BUFPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1) %[[HANDLE]], i32 %[[INDEX]])
+// CHECK-NEXT: %[[VAL:.*]] = load <2 x i32>, ptr %[[BUFPTR]]
+// CHECK-NEXT: ret <2 x i32> %[[VAL]]
-// CHECK: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i8)
-// CHECK: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1), i8)
-// CHECK: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1), i32)
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1), i8)
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1), i8)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1), i32)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1), i32)
diff --git a/clang/test/Driver/x86-march.c b/clang/test/Driver/x86-march.c
index 24404ff..15f8254 100644
--- a/clang/test/Driver/x86-march.c
+++ b/clang/test/Driver/x86-march.c
@@ -120,6 +120,10 @@
// RUN: | FileCheck %s -check-prefix=wildcatlake
// wildcatlake: "-target-cpu" "wildcatlake"
//
+// RUN: %clang --target=x86_64 -c -### %s -march=novalake 2>&1 \
+// RUN: | FileCheck %s -check-prefix=novalake
+// novalake: "-target-cpu" "novalake"
+//
// RUN: %clang --target=x86_64 -c -### %s -march=clearwaterforest 2>&1 \
// RUN: | FileCheck %s -check-prefix=clearwaterforest
// clearwaterforest: "-target-cpu" "clearwaterforest"
diff --git a/clang/test/FixIt/fixit-constrained-structured-binding.cpp b/clang/test/FixIt/fixit-constrained-structured-binding.cpp
index 3f21c1d..bfb004f 100644
--- a/clang/test/FixIt/fixit-constrained-structured-binding.cpp
+++ b/clang/test/FixIt/fixit-constrained-structured-binding.cpp
@@ -14,20 +14,20 @@ T get_T();
void use() {
UnaryC auto [a, b] = get_S();
- // CHECK: error: decomposition declaration cannot be declared with constrained 'auto'
+ // CHECK: error: structured binding declaration cannot be declared with constrained 'auto'
// CHECK: fix-it:{{.*}}:{16:3-16:10}:""
BinaryC<int> auto [c, d] = get_S();
- // CHECK: error: decomposition declaration cannot be declared with constrained 'auto'
+ // CHECK: error: structured binding declaration cannot be declared with constrained 'auto'
// CHECK: fix-it:{{.*}}:{19:3-19:16}:""
}
template<typename T>
void TemplUse() {
UnaryC auto [a, b] = get_T<T>();
- // CHECK: error: decomposition declaration cannot be declared with constrained 'auto'
+ // CHECK: error: structured binding declaration cannot be declared with constrained 'auto'
// XCHECK: fix-it:{{.*}}:{26:3-26:10}:""
BinaryC<T> auto [c, d] = get_T<T>();
- // CHECK: error: decomposition declaration cannot be declared with constrained 'auto'
+ // CHECK: error: structured binding declaration cannot be declared with constrained 'auto'
// XCHECK: fix-it:{{.*}}:{29:3-29:14}:""
}
diff --git a/clang/test/Misc/target-invalid-cpu-note/x86.c b/clang/test/Misc/target-invalid-cpu-note/x86.c
index 3906318..4a70e9b 100644
--- a/clang/test/Misc/target-invalid-cpu-note/x86.c
+++ b/clang/test/Misc/target-invalid-cpu-note/x86.c
@@ -64,6 +64,7 @@
// X86-SAME: {{^}}, gracemont
// X86-SAME: {{^}}, pantherlake
// X86-SAME: {{^}}, wildcatlake
+// X86-SAME: {{^}}, novalake
// X86-SAME: {{^}}, sierraforest
// X86-SAME: {{^}}, grandridge
// X86-SAME: {{^}}, graniterapids
@@ -152,6 +153,7 @@
// X86_64-SAME: {{^}}, gracemont
// X86_64-SAME: {{^}}, pantherlake
// X86_64-SAME: {{^}}, wildcatlake
+// X86_64-SAME: {{^}}, novalake
// X86_64-SAME: {{^}}, sierraforest
// X86_64-SAME: {{^}}, grandridge
// X86_64-SAME: {{^}}, graniterapids
@@ -249,6 +251,7 @@
// TUNE_X86-SAME: {{^}}, gracemont
// TUNE_X86-SAME: {{^}}, pantherlake
// TUNE_X86-SAME: {{^}}, wildcatlake
+// TUNE_X86-SAME: {{^}}, novalake
// TUNE_X86-SAME: {{^}}, sierraforest
// TUNE_X86-SAME: {{^}}, grandridge
// TUNE_X86-SAME: {{^}}, graniterapids
@@ -353,6 +356,7 @@
// TUNE_X86_64-SAME: {{^}}, gracemont
// TUNE_X86_64-SAME: {{^}}, pantherlake
// TUNE_X86_64-SAME: {{^}}, wildcatlake
+// TUNE_X86_64-SAME: {{^}}, novalake
// TUNE_X86_64-SAME: {{^}}, sierraforest
// TUNE_X86_64-SAME: {{^}}, grandridge
// TUNE_X86_64-SAME: {{^}}, graniterapids
diff --git a/clang/test/PCH/cxx1z-decomposition.cpp b/clang/test/PCH/cxx1z-decomposition.cpp
index 914ce80..340d5ea 100644
--- a/clang/test/PCH/cxx1z-decomposition.cpp
+++ b/clang/test/PCH/cxx1z-decomposition.cpp
@@ -22,7 +22,7 @@ constexpr int foo(Q &&q) {
return a * 10 + b;
}
-auto [noinit]; // expected-error{{decomposition declaration '[noinit]' requires an initializer}}
+auto [noinit]; // expected-error{{structured binding declaration '[noinit]' requires an initializer}}
#else
@@ -31,7 +31,7 @@ int k = decomp(arr);
static_assert(foo({1, 2}) == 12);
-// expected-error@15 {{cannot decompose non-class, non-array type 'const int'}}
+// expected-error@15 {{cannot bind non-class, non-array type 'const int'}}
int z = decomp(10); // expected-note {{instantiation of}}
#endif
diff --git a/clang/test/Parser/cxx1z-class-template-argument-deduction.cpp b/clang/test/Parser/cxx1z-class-template-argument-deduction.cpp
index 9d27f83..ece00a0 100644
--- a/clang/test/Parser/cxx1z-class-template-argument-deduction.cpp
+++ b/clang/test/Parser/cxx1z-class-template-argument-deduction.cpp
@@ -158,7 +158,7 @@ namespace decl {
A arr[3] = 0; // expected-error {{cannot form array of deduced class template specialization type}}
A F::*pm = 0; // expected-error {{cannot form pointer to deduced class template specialization type}}
A (*fp)() = 0; // expected-error {{cannot form function returning deduced class template specialization type}}
- A [x, y] = 0; // expected-error {{cannot be declared with type 'A'}} expected-error {{type 'A<int>' decomposes into 0 elements, but 2 names were provided}}
+ A [x, y] = 0; // expected-error {{cannot be declared with type 'A'}} expected-error {{type 'A<int>' binds to 0 elements, but 2 names were provided}}
}
namespace typename_specifier {
@@ -185,7 +185,7 @@ namespace typename_specifier {
typename ::A arr[3] = 0; // expected-error {{cannot form array of deduced class template specialization type}}
typename ::A F::*pm = 0; // expected-error {{cannot form pointer to deduced class template specialization type}}
typename ::A (*fp)() = 0; // expected-error {{cannot form function returning deduced class template specialization type}}
- typename ::A [x, y] = 0; // expected-error {{cannot be declared with type 'typename ::A'}} expected-error {{type 'typename ::A<int>' (aka 'A<int>') decomposes into 0}}
+ typename ::A [x, y] = 0; // expected-error {{cannot be declared with type 'typename ::A'}} expected-error {{type 'typename ::A<int>' (aka 'A<int>') binds to 0}}
struct X { template<typename T> struct A { A(T); }; }; // expected-note 8{{declared here}}
@@ -208,7 +208,7 @@ namespace typename_specifier {
{typename T::A arr[3] = 0;} // expected-error {{refers to class template member}}
{typename T::A F::*pm = 0;} // expected-error {{refers to class template member}}
{typename T::A (*fp)() = 0;} // expected-error {{refers to class template member}}
- {typename T::A [x, y] = 0;} // expected-error {{cannot be declared with type 'typename T::A'}} expected-error {{type 'typename typename_specifier::X::A<int>' (aka 'typename_specifier::X::A<int>') decomposes into 0}}
+ {typename T::A [x, y] = 0;} // expected-error {{cannot be declared with type 'typename T::A'}} expected-error {{type 'typename typename_specifier::X::A<int>' (aka 'typename_specifier::X::A<int>') binds to 0}}
}
template void f<X>(); // expected-note {{instantiation of}}
diff --git a/clang/test/Parser/cxx1z-decomposition.cpp b/clang/test/Parser/cxx1z-decomposition.cpp
index 274e24e..21c9419 100644
--- a/clang/test/Parser/cxx1z-decomposition.cpp
+++ b/clang/test/Parser/cxx1z-decomposition.cpp
@@ -5,7 +5,7 @@
struct S { int a, b, c; }; // expected-note 2 {{'S::a' declared here}}
-// A simple-declaration can be a decompsition declaration.
+// A simple-declaration can be a structured binding declaration.
namespace SimpleDecl {
auto [a_x, b_x, c_x] = S();
@@ -19,7 +19,7 @@ namespace SimpleDecl {
}
}
-// A for-range-declaration can be a decomposition declaration.
+// A for-range-declaration can be a structured binding declaration.
namespace ForRangeDecl {
extern S arr[10];
void h() {
@@ -100,12 +100,12 @@ namespace BadSpecifiers {
inline auto &[k] = n; // expected-error {{cannot be declared 'inline'}}
const int K = 5;
- auto ([c]) = s; // expected-error {{decomposition declaration cannot be declared with parentheses}}
+ auto ([c]) = s; // expected-error {{structured binding declaration cannot be declared with parentheses}}
void g() {
// defining-type-specifiers other than cv-qualifiers and 'auto'
S [a] = s; // expected-error {{cannot be declared with type 'S'}}
decltype(auto) [b] = s; // expected-error {{cannot be declared with type 'decltype(auto)'}}
- auto ([c2]) = s; // cxx17-error {{decomposition declaration cannot be declared with parenthese}} \
+ auto ([c2]) = s; // cxx17-error {{structured binding declaration cannot be declared with parentheses}} \
// post2b-error {{use of undeclared identifier 'c2'}} \
// post2b-error {{expected body of lambda expression}} \
@@ -114,7 +114,7 @@ namespace BadSpecifiers {
auto [e][1] = s; // expected-error {{expected ';'}} expected-error {{requires an initializer}}
// FIXME: This should fire the 'misplaced array declarator' diagnostic.
- int [K] arr = {0}; // expected-error {{expected ';'}} expected-error {{cannot be declared with type 'int'}} expected-error {{decomposition declaration '[K]' requires an initializer}}
+ int [K] arr = {0}; // expected-error {{expected ';'}} expected-error {{cannot be declared with type 'int'}} expected-error {{structured binding declaration '[K]' requires an initializer}}
int [5] arr = {0}; // expected-error {{place the brackets after the name}}
auto *[f] = s; // expected-error {{cannot be declared with type 'auto *'}} expected-error {{incompatible initializer}}
@@ -145,14 +145,14 @@ namespace MultiDeclarator {
namespace Template {
int n[3];
// Structured binding template is not allowed.
- template<typename T> auto [a, b, c] = n; // expected-error {{decomposition declaration cannot be a template}}
+ template<typename T> auto [a, b, c] = n; // expected-error {{structured binding declaration cannot be a template}}
}
namespace Init {
void f() {
int arr[1];
struct S { int n; };
- auto &[bad1]; // expected-error {{decomposition declaration '[bad1]' requires an initializer}}
+ auto &[bad1]; // expected-error {{structured binding declaration '[bad1]' requires an initializer}}
const auto &[bad2](S{}, S{}); // expected-error {{initializer for variable '[bad2]' with type 'const auto &' contains multiple expressions}}
const auto &[bad3](); // expected-error {{expected expression}}
auto &[good1] = arr;
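Editor's note: as a quick reference for the terminology change running through these tests, here is a minimal C++17 sketch of the declaration forms they exercise; all names below are illustrative, not taken from the test file.

struct S2 { int a, b; };
int arr2[2] = {1, 2};

void sketch() {
  auto [x, y] = S2{};   // OK: binds to the two members of S2
  auto &[p, q] = arr2;  // OK: binds to the two array elements
  // auto [z];          // error: structured binding declaration
  //                    //        '[z]' requires an initializer
  // auto ([w]) = S2{}; // error: cannot be declared with parentheses
}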
diff --git a/clang/test/Parser/cxx2c-binding-pack.cpp b/clang/test/Parser/cxx2c-binding-pack.cpp
index 0daaad3..40d843e 100644
--- a/clang/test/Parser/cxx2c-binding-pack.cpp
+++ b/clang/test/Parser/cxx2c-binding-pack.cpp
@@ -12,7 +12,7 @@ void decompose_array() {
auto [...] = arr; // #2
// expected-error@#2{{expected identifier}}
// expected-error@#2{{{no names were provided}}}
- // expected-warning@#2{{{does not allow a decomposition group to be empty}}}
+ // expected-warning@#2{{{does not allow a structured binding group to be empty}}}
auto [a, ..., b] = arr; // #3
// expected-error@#3{{expected identifier}}
// expected-error@#3{{{only 1 name was provided}}}
diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index e2f4bcb..dac3649 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -2529,9 +2529,12 @@
// RUN: %clang -march=wildcatlake -m32 -E -dM %s -o - 2>&1 \
// RUN: -target i386-unknown-linux \
// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_NKL_M32
+// RUN: %clang -march=novalake -m32 -E -dM %s -o - 2>&1 \
+// RUN: -target i386-unknown-linux \
+// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_NVL_M32,CHECK_NKL_M32
// RUN: %clang -march=clearwaterforest -m32 -E -dM %s -o - 2>&1 \
// RUN: -target i386-unknown-linux \
-// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_CWF_M32,CHECK_NKL_M32
+// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_NVL_M32,CHECK_UMSR_M32,CHECK_NKL_M32
// CHECK_ARL_M32: #define __ADX__ 1
// CHECK_ARL_M32: #define __AES__ 1
// CHECK_ARL_M32: #define __AVX2__ 1
@@ -2571,7 +2574,7 @@
// CHECK_ARL_M32: #define __POPCNT__ 1
// CHECK_ARL_M32-NOT: #define __PREFETCHI__ 1
// CHECK_ARLS_M32-NOT: #define __PREFETCHI__ 1
-// CHECK_CWF_M32: #define __PREFETCHI__ 1
+// CHECK_NVL_M32: #define __PREFETCHI__ 1
// CHECK_ARL_M32: #define __PRFCHW__ 1
// CHECK_ARL_M32: #define __PTWRITE__ 1
// CHECK_ARL_M32-NOT: #define __RAOINT__ 1
@@ -2598,7 +2601,7 @@
// CHECK_ARL_M32: #define __UINTR__ 1
// CHECK_ARL_M32-NOT: #define __USERMSR__ 1
// CHECK_ARLS_M32-NOT: #define __USERMSR__ 1
-// CHECK_CWF_M32: #define __USERMSR__ 1
+// CHECK_UMSR_M32: #define __USERMSR__ 1
// CHECK_ARL_M32: #define __VAES__ 1
// CHECK_ARL_M32: #define __VPCLMULQDQ__ 1
// CHECK_ARL_M32: #define __WAITPKG__ 1
@@ -2636,9 +2639,12 @@
// RUN: %clang -march=wildcatlake -m64 -E -dM %s -o - 2>&1 \
// RUN: -target i386-unknown-linux \
// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_NKL_M64
+// RUN: %clang -march=novalake -m64 -E -dM %s -o - 2>&1 \
+// RUN: -target i386-unknown-linux \
+// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_NVL_M64,CHECK_NKL_M64
// RUN: %clang -march=clearwaterforest -m64 -E -dM %s -o - 2>&1 \
// RUN: -target i386-unknown-linux \
-// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_CWF_M64,CHECK_NKL_M64
+// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_NVL_M64,CHECK_UMSR_M64,CHECK_NKL_M64
// CHECK_ARL_M64: #define __ADX__ 1
// CHECK_ARL_M64: #define __AES__ 1
// CHECK_ARL_M64: #define __AVX2__ 1
@@ -2678,7 +2684,7 @@
// CHECK_ARL_M64: #define __POPCNT__ 1
// CHECK_ARL_M64-NOT: #define __PREFETCHI__ 1
// CHECK_ARLS_M64-NOT: #define __PREFETCHI__ 1
-// CHECK_CWF_M64: #define __PREFETCHI__ 1
+// CHECK_NVL_M64: #define __PREFETCHI__ 1
// CHECK_ARL_M64: #define __PRFCHW__ 1
// CHECK_ARL_M64: #define __PTWRITE__ 1
// CHECK_ARL_M64-NOT: #define __RAOINT__ 1
@@ -2706,7 +2712,7 @@
// CHECK_ARL_M64: #define __UINTR__ 1
// CHECK_ARL_M64-NOT: #define __USERMSR__ 1
// CHECK_ARLS_M64-NOT: #define __USERMSR__ 1
-// CHECK_CWF_M64: #define __USERMSR__ 1
+// CHECK_UMSR_M64: #define __USERMSR__ 1
// CHECK_ARL_M64: #define __VAES__ 1
// CHECK_ARL_M64: #define __VPCLMULQDQ__ 1
// CHECK_ARL_M64: #define __WAITPKG__ 1
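Editor's note: the RUN lines above assert that novalake (like clearwaterforest) defines __PREFETCHI__ and __USERMSR__ while the arrowlake-class prefixes do not. A small sketch of gating code on those macros, with the feature names taken directly from the CHECK lines:

// Compile with -march=novalake (or clearwaterforest) and both branches are
// enabled; with -march=arrowlake neither is.
#if defined(__PREFETCHI__)
void prefetchi_path(void);  // hypothetical prefetchi-based code path
#endif
#if defined(__USERMSR__)
void usermsr_path(void);    // hypothetical user-mode MSR code path
#endif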
diff --git a/clang/test/Sema/attr-cpuspecific-cpus.c b/clang/test/Sema/attr-cpuspecific-cpus.c
index 0874d0c..2360d86 100644
--- a/clang/test/Sema/attr-cpuspecific-cpus.c
+++ b/clang/test/Sema/attr-cpuspecific-cpus.c
@@ -88,3 +88,4 @@ ATTR(cpu_specific(gracemont)) void CPU38(void){}
ATTR(cpu_specific(pantherlake)) void CPU39(void){}
ATTR(cpu_specific(clearwaterforest)) void CPU40(void){}
ATTR(cpu_specific(wildcatlake)) void CPU41(void){}
+ATTR(cpu_specific(novalake)) void CPU42(void){}
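Editor's note: with novalake accepted by cpu_specific, the usual function-multiversioning pattern applies. A minimal sketch, assuming the standard clang cpu_specific/cpu_dispatch syntax shown in the documentation; the function name is hypothetical:

__attribute__((cpu_specific(novalake))) void work(void) { /* NVL-tuned */ }
__attribute__((cpu_specific(generic)))  void work(void) { /* fallback  */ }
// Dispatcher: a declaration carrying the list of implemented versions.
__attribute__((cpu_dispatch(novalake, generic))) void work(void);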
diff --git a/clang/test/SemaCXX/builtin-structured-binding-size.cpp b/clang/test/SemaCXX/builtin-structured-binding-size.cpp
index bcd13a6..a8f6705 100644
--- a/clang/test/SemaCXX/builtin-structured-binding-size.cpp
+++ b/clang/test/SemaCXX/builtin-structured-binding-size.cpp
@@ -40,8 +40,8 @@ static_assert(__builtin_structured_binding_size(S5) == 2);
// expected-error@-1 {{static assertion failed due to requirement '__builtin_structured_binding_size(S5) == 2'}} \
// expected-note@-1 {{expression evaluates to '1 == 2'}}
static_assert(__builtin_structured_binding_size(S6) == 2);
-// expected-error@-1 {{cannot decompose class type 'S6' because it has an anonymous union member}} \
-// expected-error@-1 {{type 'S6' cannot be decomposed}} \
+// expected-error@-1 {{cannot bind class type 'S6' because it has an anonymous union member}} \
+// expected-error@-1 {{type 'S6' cannot be bound}} \
// expected-error@-1 {{static assertion expression is not an integral constant expression}} \
// expected-note@#note-anon-union {{declared here}}
static_assert(__builtin_structured_binding_size(S7) == 1);
@@ -49,15 +49,15 @@ static_assert(__builtin_structured_binding_size(S7) == 1);
static_assert(__builtin_structured_binding_size(SD) == 1);
static_assert(__builtin_structured_binding_size(SE1) == 1);
-// expected-error@-1 {{cannot decompose class type 'SE1': both it and its base class 'S1' have non-static data members}} \
-// expected-error@-1 {{type 'SE1' cannot be decomposed}} \
+// expected-error@-1 {{cannot bind class type 'SE1': both it and its base class 'S1' have non-static data members}} \
+// expected-error@-1 {{type 'SE1' cannot be bound}} \
// expected-error@-1 {{static assertion expression is not an integral constant expression}}
static_assert(__builtin_structured_binding_size(U1) == 0);
-// expected-error@-1 {{type 'U1' cannot be decomposed}} \
+// expected-error@-1 {{type 'U1' cannot be bound}} \
// expected-error@-1 {{static assertion expression is not an integral constant expression}}
static_assert(__builtin_structured_binding_size(U2) == 0);
-// expected-error@-1 {{type 'U2' cannot be decomposed}} \
+// expected-error@-1 {{type 'U2' cannot be bound}} \
// expected-error@-1 {{static assertion expression is not an integral constant expression}}
@@ -75,7 +75,7 @@ static_assert(__builtin_structured_binding_size(decltype(__builtin_complex(0., 0
int VLASize; // expected-note {{declared here}}
static_assert(__builtin_structured_binding_size(int[VLASize]) == 42);
-// expected-error@-1 {{type 'int[VLASize]' cannot be decomposed}} \
+// expected-error@-1 {{type 'int[VLASize]' cannot be bound}} \
// expected-warning@-1 {{variable length arrays in C++ are a Clang extension}} \
// expected-note@-1 {{read of non-const variable 'VLASize' is not allowed in a constant expression}} \
// expected-error@-1 {{static assertion expression is not an integral constant expression}}
@@ -84,10 +84,10 @@ static_assert(__builtin_structured_binding_size(int[VLASize]) == 42);
struct Incomplete; // expected-note {{forward declaration of 'Incomplete'}}
static_assert(__builtin_structured_binding_size(Incomplete) == 1);
// expected-error@-1 {{incomplete type 'Incomplete' where a complete type is required}} \
-// expected-error@-1 {{type 'Incomplete' cannot be decomposed}} \
+// expected-error@-1 {{type 'Incomplete' cannot be bound}} \
// expected-error@-1 {{static assertion expression is not an integral constant expression}}
static_assert(__builtin_structured_binding_size(Incomplete[]) == 1);
-// expected-error@-1 {{type 'Incomplete[]' cannot be decomposed}} \
+// expected-error@-1 {{type 'Incomplete[]' cannot be bound}} \
// expected-error@-1 {{static assertion expression is not an integral constant expression}}
static_assert(__builtin_structured_binding_size(Incomplete[0]) == 0);
static_assert(__builtin_structured_binding_size(Incomplete[1]) == 1);
@@ -97,12 +97,12 @@ static_assert(__builtin_structured_binding_size(Incomplete[42]) == 42);
static_assert(__builtin_structured_binding_size(P1) == 0);
// expected-error@-1 {{static assertion failed due to requirement '__builtin_structured_binding_size(P1) == 0'}} \
// expected-note@-1 {{expression evaluates to '1 == 0'}} \
-// expected-error@-1 {{cannot decompose private member 'a' of 'P1}} \
+// expected-error@-1 {{cannot bind private member 'a' of 'P1}} \
// expected-note@#note-private {{implicitly declared private here}}
void func(int array[14], int x = __builtin_structured_binding_size(decltype(array)));
-//expected-error@-1 {{type 'decltype(array)' (aka 'int *') cannot be decomposed}}
+//expected-error@-1 {{type 'decltype(array)' (aka 'int *') cannot be bound}}
struct SM {
static int array[14];
@@ -115,7 +115,7 @@ struct T {
};
T<int> t1;
-// expected-error@#tpl-1 {{type 'int' cannot be decomposed}} \
+// expected-error@#tpl-1 {{type 'int' cannot be bound}} \
// expected-error@#tpl-1 {{non-type template argument is not a constant expression}} \
// expected-note@-1 {{in instantiation of default argument for 'T<int>' required here}} \
// expected-note@-1 {{while checking a default template argument used here}} \
@@ -183,8 +183,8 @@ static_assert(!is_destructurable<T0&>);
static_assert(__builtin_structured_binding_size(T1) == 1);
static_assert(__builtin_structured_binding_size(T42) == 42);
static_assert(__builtin_structured_binding_size(TSizeError) == 42);
-// expected-error@-1 {{cannot decompose this type; 'std::tuple_size<TSizeError>::value' is not a valid integral constant expression}} \
-// expected-error@-1 {{type 'TSizeError' cannot be decomposed}} \
+// expected-error@-1 {{cannot bind this type; 'std::tuple_size<TSizeError>::value' is not a valid integral constant expression}} \
+// expected-error@-1 {{type 'TSizeError' cannot be bound}} \
// expected-error@-1 {{static assertion expression is not an integral constant expression}}
static_assert(!is_destructurable<TSizeError>);
}
@@ -195,7 +195,7 @@ struct S {
int y;
static_assert(__builtin_structured_binding_size(S) == 2);
//expected-error@-1 {{incomplete type 'S' where a complete type is required}} \
- // expected-error@-1 {{type 'S' cannot be decomposed}} \
+ // expected-error@-1 {{type 'S' cannot be bound}} \
// expected-error@-1 {{static assertion expression is not an integral constant expression}} \
// expected-note@-4 {{definition of 'S' is not complete until the closing '}'}}
};
@@ -228,20 +228,20 @@ static_assert(__is_same_as(tag_of_t<S1>, int));
static_assert(__is_same_as(tag_of_t<int>, int)); // error
// expected-error@-1 {{constraints not satisfied for alias template 'tag_of_t' [with T = int]}}
-// expected-note@#tag-of-constr {{because substituted constraint expression is ill-formed: type 'int' cannot be decomposed}}
+// expected-note@#tag-of-constr {{because substituted constraint expression is ill-formed: type 'int' cannot be bound}}
struct MinusOne;
template <> struct ::std::tuple_size<MinusOne> {
static constexpr int value = -1;
};
int minus_one = __builtin_structured_binding_size(MinusOne);
-// expected-error@-1 {{cannot decompose this type; 'std::tuple_size<MinusOne>::value' is not a valid size: -1}}
-// expected-error@-2 {{type 'MinusOne' cannot be decomposed}}
+// expected-error@-1 {{cannot bind this type; 'std::tuple_size<MinusOne>::value' is not a valid size: -1}}
+// expected-error@-2 {{type 'MinusOne' cannot be bound}}
struct UintMax;
template <> struct ::std::tuple_size<UintMax> {
static constexpr unsigned value = -1;
};
int uint_max = __builtin_structured_binding_size(UintMax);
-// expected-error@-1 {{cannot decompose this type; 'std::tuple_size<UintMax>::value' is not a valid size: 4294967295}}
-// expected-error@-2 {{type 'UintMax' cannot be decomposed}}
+// expected-error@-1 {{cannot bind this type; 'std::tuple_size<UintMax>::value' is not a valid size: 4294967295}}
+// expected-error@-2 {{type 'UintMax' cannot be bound}}
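Editor's note: the builtin returns the number of elements a structured binding of the type would introduce, and the errors above are now phrased in "cannot be bound" terms. A hedged C++20 sketch of typical use, including one plausible way to write the is_destructurable-style constraint the test relies on (the exact definition in the test is not shown here):

struct Point { int x, y; };
static_assert(__builtin_structured_binding_size(Point) == 2);
static_assert(__builtin_structured_binding_size(int[3]) == 3);

template <class T>
concept destructurable =
    requires { __builtin_structured_binding_size(T); };
static_assert(destructurable<Point>);
static_assert(!destructurable<int>);   // 'int' cannot be bound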
diff --git a/clang/test/SemaCXX/cxx17-compat.cpp b/clang/test/SemaCXX/cxx17-compat.cpp
index 99e41d8..1c9060d 100644
--- a/clang/test/SemaCXX/cxx17-compat.cpp
+++ b/clang/test/SemaCXX/cxx17-compat.cpp
@@ -76,18 +76,18 @@ struct ConstexprVirtual {
struct C { int x, y, z; };
static auto [cx, cy, cz] = C();
#if __cplusplus <= 201703L
- // expected-warning@-2 {{decomposition declaration declared 'static' is a C++20 extension}}
+ // expected-warning@-2 {{structured binding declaration declared 'static' is a C++20 extension}}
#else
- // expected-warning@-4 {{decomposition declaration declared 'static' is incompatible with C++ standards before C++20}}
+ // expected-warning@-4 {{structured binding declaration declared 'static' is incompatible with C++ standards before C++20}}
#endif
void f() {
static thread_local auto [cx, cy, cz] = C();
#if __cplusplus <= 201703L
- // expected-warning@-2 {{decomposition declaration declared 'static' is a C++20 extension}}
- // expected-warning@-3 {{decomposition declaration declared 'thread_local' is a C++20 extension}}
+ // expected-warning@-2 {{structured binding declaration declared 'static' is a C++20 extension}}
+ // expected-warning@-3 {{structured binding declaration declared 'thread_local' is a C++20 extension}}
#else
- // expected-warning@-5 {{decomposition declaration declared 'static' is incompatible with C++ standards before C++20}}
- // expected-warning@-6 {{decomposition declaration declared 'thread_local' is incompatible with C++ standards before C++20}}
+ // expected-warning@-5 {{structured binding declaration declared 'static' is incompatible with C++ standards before C++20}}
+ // expected-warning@-6 {{structured binding declaration declared 'thread_local' is incompatible with C++ standards before C++20}}
#endif
}
diff --git a/clang/test/SemaCXX/cxx1z-decomposition.cpp b/clang/test/SemaCXX/cxx1z-decomposition.cpp
index 6ee1249..158a3a6 100644
--- a/clang/test/SemaCXX/cxx1z-decomposition.cpp
+++ b/clang/test/SemaCXX/cxx1z-decomposition.cpp
@@ -3,22 +3,22 @@
// RUN: %clang_cc1 -std=c++20 -Wpre-c++20-compat -fexperimental-new-constant-interpreter -verify=expected %s
void use_from_own_init() {
- auto [a] = a; // expected-error {{binding 'a' cannot appear in the initializer of its own decomposition declaration}}
+ auto [a] = a; // expected-error {{binding 'a' cannot appear in the initializer of its own structured binding declaration}}
}
void num_elems() {
struct A0 {} a0;
int a1[1], a2[2];
- auto [] = a0; // expected-warning {{does not allow a decomposition group to be empty}}
- auto [v1] = a0; // expected-error {{type 'struct A0' decomposes into 0 elements, but 1 name was provided}}
- auto [] = a1; // expected-error {{type 'int[1]' decomposes into 1 element, but no names were provided}} expected-warning {{empty}}
+ auto [] = a0; // expected-warning {{does not allow a structured binding group to be empty}}
+ auto [v1] = a0; // expected-error {{type 'struct A0' binds to 0 elements, but 1 name was provided}}
+ auto [] = a1; // expected-error {{type 'int[1]' binds to 1 element, but no names were provided}} expected-warning {{empty}}
auto [v2] = a1;
- auto [v3, v4] = a1; // expected-error {{type 'int[1]' decomposes into 1 element, but 2 names were provided}}
- auto [] = a2; // expected-error {{type 'int[2]' decomposes into 2 elements, but no names were provided}} expected-warning {{empty}}
- auto [v5] = a2; // expected-error {{type 'int[2]' decomposes into 2 elements, but only 1 name was provided}}
+ auto [v3, v4] = a1; // expected-error {{type 'int[1]' binds to 1 element, but 2 names were provided}}
+ auto [] = a2; // expected-error {{type 'int[2]' binds to 2 elements, but no names were provided}} expected-warning {{empty}}
+ auto [v5] = a2; // expected-error {{type 'int[2]' binds to 2 elements, but only 1 name was provided}}
auto [v6, v7] = a2;
- auto [v8, v9, v10] = a2; // expected-error {{type 'int[2]' decomposes into 2 elements, but 3 names were provided}}
+ auto [v8, v9, v10] = a2; // expected-error {{type 'int[2]' binds to 2 elements, but 3 names were provided}}
}
// As a Clang extension, _Complex can be decomposed.
@@ -105,7 +105,7 @@ void enclosing() {
void bitfield() {
struct { int a : 3, : 4, b : 5; } a;
auto &[x, y] = a;
- auto &[p, q, r] = a; // expected-error-re {{type 'struct (unnamed struct at {{.*}})' decomposes into 2 elements, but 3 names were provided}}
+ auto &[p, q, r] = a; // expected-error-re {{type 'struct (unnamed struct at {{.*}})' binds to 2 elements, but 3 names were provided}}
}
void for_range() {
@@ -115,7 +115,7 @@ void for_range() {
}
int y[5];
- for (auto[c] : y) { // expected-error {{cannot decompose non-class, non-array type 'int'}}
+ for (auto[c] : y) { // expected-error {{cannot bind non-class, non-array type 'int'}}
c++;
}
}
@@ -157,16 +157,16 @@ int f2() {
namespace lambdas {
void f() {
int n;
- auto [a] = // expected-error {{cannot decompose lambda closure type}}
+ auto [a] = // expected-error {{cannot bind lambda closure type}}
[n] {}; // expected-note {{lambda expression}}
}
- auto [] = []{}; // expected-warning {{ISO C++17 does not allow a decomposition group to be empty}}
+ auto [] = []{}; // expected-warning {{ISO C++17 does not allow a structured binding group to be empty}}
int g() {
int n = 0;
auto a = [=](auto &self) { // expected-note {{lambda expression}}
- auto &[capture] = self; // expected-error {{cannot decompose lambda closure type}}
+ auto &[capture] = self; // expected-error {{cannot bind lambda closure type}}
++capture;
return n;
};
@@ -188,14 +188,14 @@ namespace lambdas {
struct A : decltype(x) {
int n;
};
- auto &&[r] = A{x, 0}; // expected-error-re {{cannot decompose class type 'A': both it and its base class 'decltype(x)' (aka '(lambda {{.*}})') have non-static data members}}
+ auto &&[r] = A{x, 0}; // expected-error-re {{cannot bind class type 'A': both it and its base class 'decltype(x)' (aka '(lambda {{.*}})') have non-static data members}}
return r;
}
void j() {
auto x = [] {};
struct A : decltype(x) {};
- auto &&[] = A{x}; // expected-warning {{ISO C++17 does not allow a decomposition group to be empty}}
+ auto &&[] = A{x}; // expected-warning {{ISO C++17 does not allow a structured binding group to be empty}}
}
}
diff --git a/clang/test/SemaCXX/cxx2c-binding-pack-nontemplate.cpp b/clang/test/SemaCXX/cxx2c-binding-pack-nontemplate.cpp
index 638a2d8..0dfb52b 100644
--- a/clang/test/SemaCXX/cxx2c-binding-pack-nontemplate.cpp
+++ b/clang/test/SemaCXX/cxx2c-binding-pack-nontemplate.cpp
@@ -10,7 +10,7 @@ void decompose_array() {
auto [x, ...rest, y] = arr;
// cxx26-warning@+4 {{structured binding packs are incompatible with C++ standards before C++2c}}
- // cxx23-error@+3 {{decomposition declaration cannot be declared 'constexpr'}}
+ // cxx23-error@+3 {{structured binding declaration cannot be declared 'constexpr'}}
// cxx23-warning@+2 {{structured binding packs are a C++2c extension}}
// nontemplate-error@+1 {{pack declaration outside of template}}
constexpr auto [x_c, ...rest_c, y_c] = arr;
diff --git a/clang/test/SemaCXX/cxx2c-binding-pack.cpp b/clang/test/SemaCXX/cxx2c-binding-pack.cpp
index a8c1386..0b0eb88 100644
--- a/clang/test/SemaCXX/cxx2c-binding-pack.cpp
+++ b/clang/test/SemaCXX/cxx2c-binding-pack.cpp
@@ -82,7 +82,7 @@ void decompose_array() {
static_assert(sizeof...(b) == 0);
auto [...c] = arr1;
static_assert(sizeof...(c) == 1);
- auto [a1, ...b1, c1] = arr1; // expected-error{{decomposes into 1 element, but 3 names were provided}}
+ auto [a1, ...b1, c1] = arr1; // expected-error{{binds to 1 element, but 3 names were provided}}
}
// Test case by Younan Zhang.
@@ -160,7 +160,7 @@ void now_i_know_my() {
static_assert(sizeof...(e) == 2);
auto [h, i, j, ...k] = C(); // OK, the pack k is empty
static_assert(sizeof...(e) == 0);
- auto [l, m, n, o, ...p] = C(); // expected-error{{{decomposes into 3 elements, but 5 names were provided}}}
+ auto [l, m, n, o, ...p] = C(); // expected-error{{{binds to 3 elements, but 5 names were provided}}}
}
} // namespace
@@ -225,7 +225,7 @@ namespace GH125165 {
template <typename = void>
auto f(auto t) {
const auto& [...pack] = t;
- // expected-error@-1 {{cannot decompose non-class, non-array type 'char const'}}
+ // expected-error@-1 {{cannot bind non-class, non-array type 'char const'}}
(pack, ...);
};
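Editor's note: structured binding packs (C++2c) let one pack absorb the remaining elements, and the count diagnostics above are now phrased as "binds to N elements". A minimal sketch under -std=c++2c, names illustrative:

void sketch() {
  int arr[3] = {1, 2, 3};
  auto [first, ...rest] = arr;       // rest binds the remaining 2 elements
  static_assert(sizeof...(rest) == 2);
  // auto [l, m, n, o, ...p] = arr;  // error: binds to 3 elements, but 5
  //                                 //        names were provided
}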
diff --git a/clang/test/SemaCXX/matrix-type.cpp b/clang/test/SemaCXX/matrix-type.cpp
index 186d3b6..0f9bff8 100644
--- a/clang/test/SemaCXX/matrix-type.cpp
+++ b/clang/test/SemaCXX/matrix-type.cpp
@@ -14,6 +14,7 @@ void matrix_var_dimensions(int Rows, unsigned Columns, char C) {
using matrix7_t = int __attribute__((matrix_type(1, 0))); // expected-error{{zero matrix size}}
using matrix7_t = int __attribute__((matrix_type(char, 0))); // expected-error{{expected '(' for function-style cast or type construction}}
using matrix8_t = int __attribute__((matrix_type(1048576, 1))); // expected-error{{matrix row size too large}}
+ using matrix8_t = int __attribute__((matrix_type(1048576, 1048576))); // expected-error{{matrix row and column size too large}}
}
struct S1 {};
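Editor's note: matrix_type is Clang's matrix extension (enabled with -fenable-matrix in C/C++), and the new diagnostic distinguishes row, column, and combined overflow; HLSL instead caps both dimensions at 4, per the MaxMatrixDimension checks in the CompilerInvocationTest change further below. A sketch:

// clang++ -fenable-matrix ...
using m3x2 = float __attribute__((matrix_type(3, 2)));  // OK
// using bad = float __attribute__((matrix_type(1048576, 1048576)));
//   -> error: matrix row and column size too large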
diff --git a/clang/test/SemaCXX/sizeless-1.cpp b/clang/test/SemaCXX/sizeless-1.cpp
index 688bbf0..ef3eec9 100644
--- a/clang/test/SemaCXX/sizeless-1.cpp
+++ b/clang/test/SemaCXX/sizeless-1.cpp
@@ -532,7 +532,7 @@ void cxx_only(int sel) {
auto auto_int8 = local_int8;
auto auto_int16 = local_int16;
#if __cplusplus >= 201703L
- auto [auto_int8_a] = local_int8; // expected-error {{cannot decompose non-class, non-array type 'svint8_t' (aka '__SVInt8_t')}}
+ auto [auto_int8_a] = local_int8; // expected-error {{cannot bind non-class, non-array type 'svint8_t' (aka '__SVInt8_t')}}
#endif
#endif
diff --git a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp
index 2388c5f..0042ef0 100644
--- a/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp
+++ b/clang/test/SemaCXX/warn-shadow-in-lambdas.cpp
@@ -259,11 +259,11 @@ struct S {
int foo() {
#ifdef AVOID
- auto [a] = S{0}; // cxx14-warning {{decomposition declarations are a C++17 extension}}
+ auto [a] = S{0}; // cxx14-warning {{structured binding declarations are a C++17 extension}}
[a = a] () { // No warning with basic -Wshadow due to uncaptured-local classification
}();
#else
- auto [a] = S{0}; // cxx14-warning {{decomposition declarations are a C++17 extension}} expected-note {{previous declaration is here}}
+ auto [a] = S{0}; // cxx14-warning {{structured binding declarations are a C++17 extension}} expected-note {{previous declaration is here}}
[a = a] () { // expected-warning {{declaration shadows a structured binding}}
}();
#endif
diff --git a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
index 6f6b01b..5ad1d6a 100644
--- a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
@@ -10,3 +10,26 @@ uint16_t4x4 mat2;
matrix<int, 5, 5> mat3;
// expected-error@-1 {{constraints not satisfied for alias template 'matrix' [with element = int, rows_count = 5, cols_count = 5]}}
// expected-note@* {{because '5 <= 4' (5 <= 4) evaluated to false}}
+
+using float8x4 = __attribute__((matrix_type(8,4))) float;
+// expected-error@-1 {{matrix row size too large}}
+
+using float4x8 = __attribute__((matrix_type(4,8))) float;
+// expected-error@-1 {{matrix column size too large}}
+
+using float8x8 = __attribute__((matrix_type(8,8))) float;
+// expected-error@-1 {{matrix row and column size too large}}
+
+using floatNeg1x4 = __attribute__((matrix_type(-1,4))) float;
+// expected-error@-1 {{matrix row size too large}}
+using float4xNeg1 = __attribute__((matrix_type(4,-1))) float;
+// expected-error@-1 {{matrix column size too large}}
+using floatNeg1xNeg1 = __attribute__((matrix_type(-1,-1))) float;
+// expected-error@-1 {{matrix row and column size too large}}
+
+using float0x4 = __attribute__((matrix_type(0,4))) float;
+// expected-error@-1 {{zero matrix size}}
+using float4x0 = __attribute__((matrix_type(4,0))) float;
+// expected-error@-1 {{zero matrix size}}
+using float0x0 = __attribute__((matrix_type(0,0))) float;
+// expected-error@-1 {{zero matrix size}}
diff --git a/clang/test/SemaHLSL/Language/AggregateSplatCasts.hlsl b/clang/test/SemaHLSL/Language/AggregateSplatCasts.hlsl
index e5a851c..623fca0 100644
--- a/clang/test/SemaHLSL/Language/AggregateSplatCasts.hlsl
+++ b/clang/test/SemaHLSL/Language/AggregateSplatCasts.hlsl
@@ -3,8 +3,9 @@
// splat from vec1 to vec
// CHECK-LABEL: call1
// CHECK: CStyleCastExpr {{.*}} 'int3':'vector<int, 3>' <HLSLAggregateSplatCast>
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' lvalue <FloatingToIntegral> part_of_explicit_cast
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' lvalue <HLSLVectorTruncation> part_of_explicit_cast
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <FloatingToIntegral> part_of_explicit_cast
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <HLSLVectorTruncation> part_of_explicit_cast
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float1':'vector<float, 1>' <LValueToRValue> part_of_explicit_cast
// CHECK-NEXT: DeclRefExpr {{.*}} 'float1':'vector<float, 1>' lvalue Var {{.*}} 'A' 'float1':'vector<float, 1>'
export void call1() {
float1 A = {1.0};
@@ -21,7 +22,7 @@ struct S {
// splat from scalar to aggregate
// CHECK-LABEL: call2
// CHECK: CStyleCastExpr {{.*}} 'S' <HLSLAggregateSplatCast>
-// CHECK-NEXt: IntegerLiteral {{.*}} 'int' 5
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 5
export void call2() {
S s = (S)5;
}
diff --git a/clang/test/SemaHLSL/Language/ElementwiseCasts.hlsl b/clang/test/SemaHLSL/Language/ElementwiseCasts.hlsl
index 563d3f0..8481cfc 100644
--- a/clang/test/SemaHLSL/Language/ElementwiseCasts.hlsl
+++ b/clang/test/SemaHLSL/Language/ElementwiseCasts.hlsl
@@ -21,3 +21,27 @@ export void call2() {
float B[1] = {1.0};
B = (float[1])A;
}
+
+struct S {
+ int A;
+ float F;
+};
+
+// cast from a struct
+// CHECK-LABEL: call3
+// CHECK: CStyleCastExpr {{.*}} 'int[2]' <HLSLElementwiseCast>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'S' lvalue Var {{.*}} 'SS' 'S'
+export void call3() {
+ S SS = {1,1.0};
+ int A[2] = (int[2])SS;
+}
+
+// cast from a vector
+// CHECK-LABEL: call4
+// CHECK: CStyleCastExpr {{.*}} 'float[3]' <HLSLElementwiseCast>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int3':'vector<int, 3>' <LValueToRValue> part_of_explicit_cast
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int3':'vector<int, 3>' lvalue Var {{.*}} 'A' 'int3'
+export void call4() {
+ int3 A = {1,2,3};
+ float B[3] = (float[3])A;
+}
diff --git a/clang/test/SemaHLSL/Language/TemplateOutArg.hlsl b/clang/test/SemaHLSL/Language/TemplateOutArg.hlsl
index 2d6252c..3365dbe 100644
--- a/clang/test/SemaHLSL/Language/TemplateOutArg.hlsl
+++ b/clang/test/SemaHLSL/Language/TemplateOutArg.hlsl
@@ -195,6 +195,81 @@ T buzz(int X, T Y) {
return X + Y;
}
+// Case 4: Verify that the parameter modifier attributes are instantiated
+// for both templated and non-templated arguments, and that the non-templated
+// out argument type is not modified by the template instantiation.
+
+// CHECK-LABEL: FunctionTemplateDecl {{.*}} fizz_two
+
+// Check the pattern decl.
+// CHECK: FunctionDecl {{.*}} fizz_two 'void (inout T, out int)'
+// CHECK-NEXT: ParmVarDecl {{.*}} referenced V 'T'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} inout
+// CHECK-NEXT: ParmVarDecl {{.*}} referenced I 'int &__restrict'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} out
+
+// Check the 3 instantiations (int, float, & double).
+
+// CHECK-LABEL: FunctionDecl {{.*}} used fizz_two 'void (inout int, out int)' implicit_instantiation
+// CHECK: ParmVarDecl {{.*}} used V 'int &__restrict'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} inout
+// CHECK: ParmVarDecl {{.*}} used I 'int &__restrict'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} out
+
+// CHECK-LABEL: FunctionDecl {{.*}} used fizz_two 'void (inout float, out int)' implicit_instantiation
+// CHECK: ParmVarDecl {{.*}} used V 'float &__restrict'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} inout
+// CHECK: ParmVarDecl {{.*}} used I 'int &__restrict'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} out
+
+// CHECK-LABEL: FunctionDecl {{.*}} used fizz_two 'void (inout double, out int)' implicit_instantiation
+// CHECK: ParmVarDecl {{.*}} used V 'double &__restrict'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} inout
+// CHECK: ParmVarDecl {{.*}} used I 'int &__restrict'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} out
+template <typename T>
+void fizz_two(inout T V, out int I) {
+ V += 2;
+ I = V;
+}
+
+// Case 5: Verify that `in` parameter modifier attributes are instantiated
+// for both templated and non-templated arguments, and that argument types
+// are not modified.
+
+// CHECK-LABEL: FunctionTemplateDecl {{.*}} buzz_two
+
+// Check the pattern decl.
+// CHECK: FunctionDecl {{.*}} buzz_two 'int (T, int)'
+// CHECK-NEXT: ParmVarDecl {{.*}} referenced A 'T'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} in
+// CHECK-NEXT: ParmVarDecl {{.*}} referenced B 'int'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} in
+
+// Check the 3 instantiations (int, float, & double).
+
+// CHECK-LABEL: FunctionDecl {{.*}} used buzz_two 'int (int, int)' implicit_instantiation
+// CHECK: ParmVarDecl {{.*}} used A 'int'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} in
+// CHECK: ParmVarDecl {{.*}} used B 'int'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} in
+
+// CHECK-LABEL: FunctionDecl {{.*}} used buzz_two 'int (float, int)' implicit_instantiation
+// CHECK: ParmVarDecl {{.*}} used A 'float'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} in
+// CHECK: ParmVarDecl {{.*}} used B 'int'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} in
+
+// CHECK-LABEL: FunctionDecl {{.*}} used buzz_two 'int (double, int)' implicit_instantiation
+// CHECK: ParmVarDecl {{.*}} used A 'double'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} in
+// CHECK: ParmVarDecl {{.*}} used B 'int'
+// CHECK-NEXT: HLSLParamModifierAttr {{.*}} in
+template <typename T>
+int buzz_two(in T A, in int B) {
+ return A + B;
+}
+
export void caller() {
int X = 2;
float Y = 3.3;
@@ -211,4 +286,12 @@ export void caller() {
X = buzz(X, X);
Y = buzz(X, Y);
Z = buzz(X, Z);
+
+ fizz_two(X, X);
+ fizz_two(Y, X);
+ fizz_two(Z, X);
+
+ X = buzz_two(X, X);
+ X = buzz_two(Y, X);
+ X = buzz_two(Z, X);
}
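Editor's note: the AST checks above show HLSL out/inout parameters instantiating to `T &__restrict`. A rough C++ analog of the instantiated signature, for orientation only; HLSL's actual out/inout semantics (copy-in/copy-out) differ from plain references:

template <typename T>
void fizz_two_cxx(T &__restrict V, int &__restrict I) { // ~ (inout T, out int)
  V += 2;
  I = V;
}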
diff --git a/clang/test/SemaTemplate/cxx1z-decomposition.cpp b/clang/test/SemaTemplate/cxx1z-decomposition.cpp
index 779c4cf..b0cc4e6 100644
--- a/clang/test/SemaTemplate/cxx1z-decomposition.cpp
+++ b/clang/test/SemaTemplate/cxx1z-decomposition.cpp
@@ -15,7 +15,7 @@ namespace std {
template<> struct std::tuple_size<C> { enum { value = 2 }; };
template<typename T> int decomp(T &t) {
- auto &[a, b] = t; // expected-error {{type 'D' decomposes into 3 elements, but only 2 names were provided}}
+ auto &[a, b] = t; // expected-error {{type 'D' binds to 3 elements, but only 2 names were provided}}
return a + b; // expected-error {{cannot initialize return object of type 'int' with an rvalue of type 'int *'}}
}
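Editor's note: this test decomposes via the tuple-like protocol (std::tuple_size is specialized for the test's types). A self-contained sketch of the protocol for a user type; Pair and its get are hypothetical:

#include <cstddef>
#include <utility>  // std::tuple_size / std::tuple_element

struct Pair { int first; float second; };

template <> struct std::tuple_size<Pair> { static constexpr std::size_t value = 2; };
template <> struct std::tuple_element<0, Pair> { using type = int; };
template <> struct std::tuple_element<1, Pair> { using type = float; };

template <std::size_t I> auto get(const Pair &p) {
  if constexpr (I == 0) return p.first;
  else return p.second;
}

void sketch() {
  auto [a, b] = Pair{1, 2.0f};  // tuple-like: tuple_size + get<I>
}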
diff --git a/clang/unittests/Frontend/CompilerInvocationTest.cpp b/clang/unittests/Frontend/CompilerInvocationTest.cpp
index 75390aa..1332422 100644
--- a/clang/unittests/Frontend/CompilerInvocationTest.cpp
+++ b/clang/unittests/Frontend/CompilerInvocationTest.cpp
@@ -732,6 +732,26 @@ TEST_F(CommandLineTest, ConditionalParsingIfTrueFlagPresent) {
ASSERT_THAT(GeneratedArgs, Contains(StrEq("-sycl-std=2017")));
}
+TEST_F(CommandLineTest, ConditionalParsingIfHLSLFlagPresent) {
+ const char *Args[] = {"-xhlsl"};
+
+ CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags);
+
+ ASSERT_EQ(Invocation.getLangOpts().MaxMatrixDimension, 4u);
+
+ Invocation.generateCC1CommandLine(GeneratedArgs, *this);
+}
+
+TEST_F(CommandLineTest, ConditionalParsingIfHLSLFlagNotPresent) {
+ const char *Args[] = {""};
+
+ CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags);
+
+ ASSERT_EQ(Invocation.getLangOpts().MaxMatrixDimension, 1048575u);
+
+ Invocation.generateCC1CommandLine(GeneratedArgs, *this);
+}
+
// Wide integer option.
TEST_F(CommandLineTest, WideIntegerHighValue) {
diff --git a/compiler-rt/cmake/Modules/CompilerRTAIXUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTAIXUtils.cmake
index 53aa750..2040001 100644
--- a/compiler-rt/cmake/Modules/CompilerRTAIXUtils.cmake
+++ b/compiler-rt/cmake/Modules/CompilerRTAIXUtils.cmake
@@ -34,12 +34,10 @@ macro(archive_aix_libatomic name libname)
if(TARGET ${target})
file(MAKE_DIRECTORY ${output_dir})
add_custom_command(OUTPUT "${output_dir}/libatomic.so.1"
- POST_BUILD
COMMAND ${CMAKE_COMMAND} -E
copy "$<TARGET_FILE:${target}>"
"${output_dir}/libatomic.so.1"
# If built with MODULE, F_LOADONLY is set.
- # We have to remove this flag at POST_BUILD.
COMMAND ${CMAKE_STRIP} -X32_64 -E
"${output_dir}/libatomic.so.1"
DEPENDS ${target})
diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c
index d91e13c..7ddfaa3 100644
--- a/compiler-rt/lib/builtins/cpu_model/x86.c
+++ b/compiler-rt/lib/builtins/cpu_model/x86.c
@@ -104,6 +104,7 @@ enum ProcessorSubtypes {
INTEL_COREI7_PANTHERLAKE,
AMDFAM1AH_ZNVER5,
INTEL_COREI7_DIAMONDRAPIDS,
+ INTEL_COREI7_NOVALAKE,
CPU_SUBTYPE_MAX
};
@@ -646,6 +647,19 @@ static const char *getIntelProcessorTypeAndSubtype(unsigned Family,
break;
}
break;
+ case 0x12:
+ switch (Model) {
+ case 0x1:
+ case 0x3:
+ CPU = "novalake";
+ *Type = INTEL_COREI7;
+ *Subtype = INTEL_COREI7_NOVALAKE;
+ break;
+ default: // Unknown family 0x12 CPU.
+ break;
+ }
+ break;
+
default:
break; // Unknown.
}
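Editor's note: with family 0x12 models 0x1/0x3 mapped to novalake, runtime dispatch should recognize the new name. A hedged sketch, assuming "novalake" is plumbed through __builtin_cpu_is the same way as the other CPUs in this enum:

#include <stdio.h>

int main(void) {
  __builtin_cpu_init();
  if (__builtin_cpu_is("novalake"))
    printf("running on Nova Lake\n");
  return 0;
}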
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
index 3e82df4..ba85a0e 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
@@ -390,6 +390,9 @@ void ReportDeadlySignal(const SignalContext &sig, u32 tid,
void SetAlternateSignalStack();
void UnsetAlternateSignalStack();
+bool IsSignalHandlerFromSanitizer(int signum);
+bool SetSignalHandlerFromSanitizer(int signum, bool new_state);
+
// Construct a one-line string:
// SUMMARY: SanitizerToolName: error_message
// and pass it to __sanitizer_report_error_summary.
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_flags.inc b/compiler-rt/lib/sanitizer_common/sanitizer_flags.inc
index 650a458..5f44990 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_flags.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_flags.inc
@@ -113,6 +113,11 @@ COMMON_FLAG(HandleSignalMode, handle_sigfpe, kHandleSignalYes,
COMMON_FLAG(bool, allow_user_segv_handler, true,
"Deprecated. True has no effect, use handle_sigbus=1. If false, "
"handle_*=1 will be upgraded to handle_*=2.")
+COMMON_FLAG(bool, cloak_sanitizer_signal_handlers, false,
+ "If set, signal/sigaction will pretend that sanitizers did not "
+ "preinstall any signal handlers. If the user subsequently installs "
+ "a signal handler, this will disable cloaking for the respective "
+ "signal.")
COMMON_FLAG(bool, use_sigaltstack, true,
"If set, uses alternate stack for signal handling.")
COMMON_FLAG(bool, detect_deadlocks, true,
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
index b1eb200..8e5e879 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
@@ -47,6 +47,8 @@ typedef void (*sa_sigaction_t)(int, siginfo_t *, void *);
namespace __sanitizer {
+[[maybe_unused]] static atomic_uint8_t signal_handler_is_from_sanitizer[64];
+
u32 GetUid() {
return getuid();
}
@@ -210,6 +212,20 @@ void UnsetAlternateSignalStack() {
UnmapOrDie(oldstack.ss_sp, oldstack.ss_size);
}
+bool IsSignalHandlerFromSanitizer(int signum) {
+ return atomic_load(&signal_handler_is_from_sanitizer[signum],
+ memory_order_relaxed);
+}
+
+bool SetSignalHandlerFromSanitizer(int signum, bool new_state) {
+ if (signum < 0 || static_cast<unsigned>(signum) >=
+ ARRAY_SIZE(signal_handler_is_from_sanitizer))
+ return false;
+
+ return atomic_exchange(&signal_handler_is_from_sanitizer[signum], new_state,
+ memory_order_relaxed);
+}
+
static void MaybeInstallSigaction(int signum,
SignalHandlerType handler) {
if (GetHandleSignalMode(signum) == kHandleSignalNo) return;
@@ -223,6 +239,9 @@ static void MaybeInstallSigaction(int signum,
if (common_flags()->use_sigaltstack) sigact.sa_flags |= SA_ONSTACK;
CHECK_EQ(0, internal_sigaction(signum, &sigact, nullptr));
VReport(1, "Installed the sigaction for signal %d\n", signum);
+
+ if (common_flags()->cloak_sanitizer_signal_handlers)
+ SetSignalHandlerFromSanitizer(signum, true);
}
void InstallDeadlySignalHandlers(SignalHandlerType handler) {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_signal_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_signal_interceptors.inc
index 94e4e29..8511e4d 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_signal_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_signal_interceptors.inc
@@ -45,6 +45,8 @@ using namespace __sanitizer;
INTERCEPTOR(uptr, bsd_signal, int signum, uptr handler) {
SIGNAL_INTERCEPTOR_ENTER();
if (GetHandleSignalMode(signum) == kHandleSignalExclusive) return 0;
+
+ // TODO: support cloak_sanitizer_signal_handlers
SIGNAL_INTERCEPTOR_SIGNAL_IMPL(bsd_signal, signum, handler);
}
#define INIT_BSD_SIGNAL COMMON_INTERCEPT_FUNCTION(bsd_signal)
@@ -56,19 +58,55 @@ INTERCEPTOR(uptr, bsd_signal, int signum, uptr handler) {
INTERCEPTOR(uptr, signal, int signum, uptr handler) {
SIGNAL_INTERCEPTOR_ENTER();
if (GetHandleSignalMode(signum) == kHandleSignalExclusive)
+ // The user can neither view nor change the signal handler, regardless of
+ // the cloak_sanitizer_signal_handlers setting. This differs from
+ // sigaction().
return (uptr) nullptr;
- SIGNAL_INTERCEPTOR_SIGNAL_IMPL(signal, signum, handler);
+
+ uptr ret = +[](auto signal, int signum, uptr handler) {
+ SIGNAL_INTERCEPTOR_SIGNAL_IMPL(signal, signum, handler);
+ }(signal, signum, handler);
+
+ if (ret != sig_err && SetSignalHandlerFromSanitizer(signum, false))
+ // If the user sets a signal handler, it becomes uncloaked, even if they
+ // reuse a sanitizer's signal handler.
+ ret = sig_dfl;
+
+ return ret;
}
#define INIT_SIGNAL COMMON_INTERCEPT_FUNCTION(signal)
INTERCEPTOR(int, sigaction_symname, int signum,
const __sanitizer_sigaction *act, __sanitizer_sigaction *oldact) {
SIGNAL_INTERCEPTOR_ENTER();
+
if (GetHandleSignalMode(signum) == kHandleSignalExclusive) {
if (!oldact) return 0;
act = nullptr;
+ // If cloak_sanitizer_signal_handlers=true, the user can neither view nor
+ // change the signal handler.
+ // If false, the user can view but not change the signal handler. This
+ // differs from signal().
}
- SIGNAL_INTERCEPTOR_SIGACTION_IMPL(signum, act, oldact);
+
+ int ret = +[](int signum, const __sanitizer_sigaction* act,
+ __sanitizer_sigaction* oldact) {
+ SIGNAL_INTERCEPTOR_SIGACTION_IMPL(signum, act, oldact);
+ }(signum, act, oldact);
+
+ if (act) {
+ if (ret == 0 && SetSignalHandlerFromSanitizer(signum, false)) {
+ // If the user sets a signal handler, it becomes uncloaked, even if they
+ // reuse a sanitizer's signal handler.
+
+ if (oldact)
+ oldact->handler = reinterpret_cast<__sanitizer_sighandler_ptr>(sig_dfl);
+ }
+ } else if (ret == 0 && oldact && IsSignalHandlerFromSanitizer(signum)) {
+ oldact->handler = reinterpret_cast<__sanitizer_sighandler_ptr>(sig_dfl);
+ }
+
+ return ret;
}
#define INIT_SIGACTION COMMON_INTERCEPT_FUNCTION(sigaction_symname)
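Editor's note: from the user program's side, the cloaking flag makes a sanitizer-installed handler look like SIG_DFL until the program installs its own. A sketch of the observable behavior, assuming the process runs with ASAN_OPTIONS=handle_segv=1:cloak_sanitizer_signal_handlers=true and following the interceptor logic above:

#include <assert.h>
#include <signal.h>
#include <stddef.h>

static void user_handler(int sig) { (void)sig; }

int main(void) {
  struct sigaction old, sa = {};
  sigaction(SIGSEGV, NULL, &old);
  assert(old.sa_handler == SIG_DFL);  // ASan's handler is cloaked

  sa.sa_handler = user_handler;
  sigaction(SIGSEGV, &sa, &old);      // installing a handler uncloaks it,
  assert(old.sa_handler == SIG_DFL);  // but the reported old handler is DFL
  return 0;
}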
diff --git a/compiler-rt/test/asan/TestCases/wcscat.cpp b/compiler-rt/test/asan/TestCases/wcscat.cpp
index fd0b5a4..beab1dc 100644
--- a/compiler-rt/test/asan/TestCases/wcscat.cpp
+++ b/compiler-rt/test/asan/TestCases/wcscat.cpp
@@ -9,11 +9,13 @@
int main() {
const wchar_t *start = L"X means ";
const wchar_t *append = L"dog";
- wchar_t goodDst[12];
+ wchar_t goodArray[12];
+ wchar_t *volatile goodDst = goodArray;
wcscpy(goodDst, start);
wcscat(goodDst, append);
- wchar_t badDst[9];
+ wchar_t badArray[9];
+ wchar_t *volatile badDst = badArray;
wcscpy(badDst, start);
fprintf(stderr, "Good so far.\n");
// CHECK-DAG: Good so far.
diff --git a/compiler-rt/test/asan/TestCases/wcscpy.cpp b/compiler-rt/test/asan/TestCases/wcscpy.cpp
index 8133a58..2b82803 100644
--- a/compiler-rt/test/asan/TestCases/wcscpy.cpp
+++ b/compiler-rt/test/asan/TestCases/wcscpy.cpp
@@ -8,10 +8,12 @@
int main() {
const wchar_t *src = L"X means dog";
- wchar_t goodDst[12];
+ wchar_t goodArray[12];
+ wchar_t *volatile goodDst = goodArray;
wcscpy(goodDst, src);
- wchar_t badDst[7];
+ wchar_t badArray[7];
+ wchar_t *volatile badDst = badArray;
fprintf(stderr, "Good so far.\n");
// CHECK-DAG: Good so far.
fflush(stderr);
diff --git a/compiler-rt/test/asan/TestCases/wcsncat.cpp b/compiler-rt/test/asan/TestCases/wcsncat.cpp
index 365e732..04cdcf2 100644
--- a/compiler-rt/test/asan/TestCases/wcsncat.cpp
+++ b/compiler-rt/test/asan/TestCases/wcsncat.cpp
@@ -9,11 +9,13 @@
int main() {
const wchar_t *start = L"X means ";
const wchar_t *append = L"dog";
- wchar_t goodDst[15];
+ wchar_t goodArray[15];
+ wchar_t *volatile goodDst = goodArray;
wcscpy(goodDst, start);
wcsncat(goodDst, append, 5);
- wchar_t badDst[11];
+ wchar_t badArray[11];
+ wchar_t *volatile badDst = badArray;
wcscpy(badDst, start);
wcsncat(badDst, append, 1);
fprintf(stderr, "Good so far.\n");
diff --git a/compiler-rt/test/asan/TestCases/wcsncpy.cpp b/compiler-rt/test/asan/TestCases/wcsncpy.cpp
index 485ddc4..9e11b55 100644
--- a/compiler-rt/test/asan/TestCases/wcsncpy.cpp
+++ b/compiler-rt/test/asan/TestCases/wcsncpy.cpp
@@ -8,10 +8,12 @@
int main() {
const wchar_t *src = L"X means dog";
- wchar_t goodDst[12];
+ wchar_t goodArray[12];
+ wchar_t *volatile goodDst = goodArray;
wcsncpy(goodDst, src, 12);
- wchar_t badDst[7];
+ wchar_t badArray[7];
+ wchar_t *volatile badDst = badArray;
wcsncpy(badDst, src, 7); // This should still work.
fprintf(stderr, "Good so far.\n");
// CHECK-DAG: Good so far.
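
All four wide-character tests make the same change: the destination array is now reached through a wchar_t *volatile pointer. The idiom, sketched below in isolation, keeps the compiler from proving which object the destination points to, so the overflowing call can be neither folded away nor rejected at compile time and instead reaches ASan's interceptor at run time:

#include <wchar.h>

void overflow() {
  wchar_t buf[7];
  wchar_t *volatile dst = buf; // hides the array's identity from the optimizer
  wcscpy(dst, L"X means dog"); // 12 elements into 7, reported at run time
}
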
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/allow_user_segv.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/allow_user_segv.cpp
index 1c74015..0c5a922 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/allow_user_segv.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/allow_user_segv.cpp
@@ -23,6 +23,10 @@
// Flaky errors in debuggerd with "waitpid returned unexpected pid (0)" in logcat.
// UNSUPPORTED: android && i386-target-arch
+// Note: this test case is unusual because it retrieves the original
+// (ASan-installed) signal handler; thus, it is incompatible with the
+// cloak_sanitizer_signal_handlers runtime option.
+
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/cloak_sigaction.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/cloak_sigaction.cpp
new file mode 100644
index 0000000..422e4ab
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/cloak_sigaction.cpp
@@ -0,0 +1,53 @@
+// UNSUPPORTED: android
+// UNSUPPORTED: hwasan
+
+// RUN: %clangxx -O0 %s -o %t
+
+// Sanitizer signal handler not installed; custom signal handler installed
+// RUN: %env_tool_opts=handle_segv=0:cloak_sanitizer_signal_handlers=false not %run %t 2>&1 | FileCheck %s --check-prefixes=DEFAULT,CUSTOM
+// RUN: %env_tool_opts=handle_segv=0:cloak_sanitizer_signal_handlers=true not %run %t 2>&1 | FileCheck %s --check-prefixes=DEFAULT,CUSTOM
+
+// Sanitizer signal handler installed but overridden by custom signal handler
+// RUN: %env_tool_opts=handle_segv=1:cloak_sanitizer_signal_handlers=false not %run %t 2>&1 | FileCheck %s --check-prefixes=NONDEFAULT,CUSTOM
+// RUN: %env_tool_opts=handle_segv=1:cloak_sanitizer_signal_handlers=true not %run %t 2>&1 | FileCheck %s --check-prefixes=DEFAULT,CUSTOM
+
+// Sanitizer signal handler installed immutably
+// N.B. for handle_segv=2 with cloaking off, there is a pre-existing difference
+// in signal vs. sigaction: signal effectively cloaks the handler.
+// RUN: %env_tool_opts=handle_segv=2:cloak_sanitizer_signal_handlers=false not %run %t 2>&1 | FileCheck %s --check-prefixes=NONDEFAULT,SANITIZER
+// RUN: %env_tool_opts=handle_segv=2:cloak_sanitizer_signal_handlers=true not %run %t 2>&1 | FileCheck %s --check-prefixes=DEFAULT,SANITIZER
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+void handler(int signum, siginfo_t *info, void *context) {
+ printf("Custom signal handler\n");
+ exit(1);
+}
+
+int main(int argc, char *argv[]) {
+ struct sigaction sa = {0};
+ struct sigaction old = {0};
+ sa.sa_flags = SA_SIGINFO;
+ sa.sa_sigaction = &handler;
+ sigaction(SIGSEGV, &sa, &old);
+
+ if (reinterpret_cast<void *>(old.sa_sigaction) == SIG_DFL)
+ printf("Old handler: default\n");
+ // DEFAULT: Old handler: default
+ else
+ printf("Old handler: non-default\n");
+ // NONDEFAULT: Old handler: non-default
+
+ fflush(stdout);
+
+ // Trying to organically segfault by dereferencing a pointer can be tricky
+ // in builds with assertions. Additionally, some older platforms may SIGBUS
+ // instead.
+ raise(SIGSEGV);
+ // CUSTOM: Custom signal handler
+ // SANITIZER: Sanitizer:DEADLYSIGNAL
+
+ return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/cloak_signal.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/cloak_signal.cpp
new file mode 100644
index 0000000..48e5475
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/cloak_signal.cpp
@@ -0,0 +1,48 @@
+// UNSUPPORTED: android
+// UNSUPPORTED: hwasan
+
+// RUN: %clangxx -O0 %s -o %t
+
+// Sanitizer signal handler not installed; custom signal handler installed
+// RUN: %env_tool_opts=handle_segv=0:cloak_sanitizer_signal_handlers=false not %run %t 2>&1 | FileCheck %s --check-prefixes=DEFAULT,CUSTOM
+// RUN: %env_tool_opts=handle_segv=0:cloak_sanitizer_signal_handlers=true not %run %t 2>&1 | FileCheck %s --check-prefixes=DEFAULT,CUSTOM
+
+// Sanitizer signal handler installed but overridden by custom signal handler
+// RUN: %env_tool_opts=handle_segv=1:cloak_sanitizer_signal_handlers=false not %run %t 2>&1 | FileCheck %s --check-prefixes=NONDEFAULT,CUSTOM
+// RUN: %env_tool_opts=handle_segv=1:cloak_sanitizer_signal_handlers=true not %run %t 2>&1 | FileCheck %s --check-prefixes=DEFAULT,CUSTOM
+
+// Sanitizer signal handler installed immutably
+// N.B. for handle_segv=2 with cloaking off, there is a pre-existing difference
+// in signal vs. sigaction: signal effectively cloaks the handler.
+// RUN: %env_tool_opts=handle_segv=2:cloak_sanitizer_signal_handlers=false not %run %t 2>&1 | FileCheck %s --check-prefixes=DEFAULT,SANITIZER
+// RUN: %env_tool_opts=handle_segv=2:cloak_sanitizer_signal_handlers=true not %run %t 2>&1 | FileCheck %s --check-prefixes=DEFAULT,SANITIZER
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+void my_signal_sighandler(int signum) {
+ printf("Custom signal handler\n");
+ exit(1);
+}
+
+int main(int argc, char *argv[]) {
+ __sighandler_t old = signal(SIGSEGV, &my_signal_sighandler);
+ if (old == SIG_DFL)
+ printf("Old handler: default\n");
+ // DEFAULT: Old handler: default
+ else
+ printf("Old handler: non-default\n");
+ // NONDEFAULT: Old handler: non-default
+
+ fflush(stdout);
+
+ // Trying to organically segfault by dereferencing a pointer can be tricky
+ // in builds with assertions. Additionally, some older platforms may SIGBUS
+ // instead.
+ raise(SIGSEGV);
+ // CUSTOM: Custom signal handler
+ // SANITIZER: Sanitizer:DEADLYSIGNAL
+
+ return 0;
+}
diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake
index ab383bc..92350776 100644
--- a/flang-rt/cmake/modules/AddFlangRT.cmake
+++ b/flang-rt/cmake/modules/AddFlangRT.cmake
@@ -231,6 +231,27 @@ function (add_flangrt_library name)
target_compile_options(${tgtname} PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:-fno-exceptions -fno-rtti -funwind-tables -fno-asynchronous-unwind-tables>
)
+
+ # We define our own _GLIBCXX_THROW_OR_ABORT here because, as of
+ # GCC 15.1, the libstdc++ header file <bits/c++config> uses
+ # (void)_EXC in its definition of _GLIBCXX_THROW_OR_ABORT to
+ # silence a warning.
+ #
+ # This is a problem for us because some compilers, specifically
+ # clang, do not always optimize away that (void)_EXC even though
+ # it is unreachable since it occurs after a call to
+    # __builtin_abort(). Because _EXC is typically an object derived
+ # from std::exception, (void)_EXC, when not optimized away,
+ # calls std::exception methods defined in the libstdc++ shared
+ # library. We shouldn't link against that library since our
+ # build version may conflict with the version used by a hybrid
+ # Fortran/C++ application.
+ #
+ # Redefining _GLIBCXX_THROW_OR_ABORT in this manner is not
+ # supported by the maintainers of libstdc++, so future changes
+ # to libstdc++ may require future changes to this build script
+ # and/or future changes to the Fortran runtime source code.
+ target_compile_options(${tgtname} PUBLIC "-D_GLIBCXX_THROW_OR_ABORT(_EXC)=(__builtin_abort())")
elseif (MSVC)
target_compile_options(${tgtname} PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:/EHs-c- /GR->
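
The effect of the override is easiest to see as a macro comparison (a sketch; the "stock" definition below paraphrases the comment above rather than quoting libstdc++, and both macro names are stand-ins):

// Stock shape per the comment above: _EXC is still named after the
// abort, so its std::exception-derived type can be odr-used and pull
// in libstdc++ symbols.
#define THROW_OR_ABORT_STOCK(_EXC) (__builtin_abort(), (void)(_EXC))

// flang-rt's override: the operand is discarded at expansion time, so
// no exception object is ever constructed or referenced.
#define THROW_OR_ABORT_FLANGRT(_EXC) (__builtin_abort())
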
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 20a0919..7f64d23 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1342,10 +1342,12 @@ inline bool IsCUDADataTransfer(const A &lhs, const B &rhs) {
int rhsNbManagedSymbols = {GetNbOfCUDAManagedOrUnifiedSymbols(rhs)};
int rhsNbSymbols{GetNbOfCUDADeviceSymbols(rhs)};
- // Special case where only managed or unifed symbols are involved. This is
- // performed on the host.
- if (lhsNbManagedSymbols == 1 && rhsNbManagedSymbols == 1 &&
- rhsNbSymbols == 1) {
+  // Special cases performed on the host:
+  // - Only managed or unified symbols are involved on the LHS and RHS.
+ // - LHS is managed or unified and the RHS is host only.
+ if ((lhsNbManagedSymbols == 1 && rhsNbManagedSymbols == 1 &&
+ rhsNbSymbols == 1) ||
+ (lhsNbManagedSymbols == 1 && rhsNbSymbols == 0)) {
return false;
}
return HasCUDADeviceAttrs(lhs) || rhsNbSymbols > 0;
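
Condensed, the predicate now reads as follows (a sketch; the parameter names are descriptive stand-ins for the symbol counts computed above, not flang's API):

bool isCudaDataTransfer(int lhsManaged, int rhsManaged, int rhsDevice,
                        bool lhsHasDeviceAttrs) {
  // Host-executed special cases: both sides involve only managed or
  // unified symbols, or a managed/unified LHS is assigned a host-only RHS.
  if ((lhsManaged == 1 && rhsManaged == 1 && rhsDevice == 1) ||
      (lhsManaged == 1 && rhsDevice == 0))
    return false;
  return lhsHasDeviceAttrs || rhsDevice > 0;
}
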
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index ab9dde8..ef7cdc42 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -27,6 +27,10 @@ class Location;
class MLIRContext;
} // namespace mlir
+namespace hlfir {
+class ElementalOp;
+} // namespace hlfir
+
namespace Fortran::lower {
class AbstractConverter;
@@ -58,7 +62,9 @@ cuf::DataAttributeAttr
translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
const Fortran::semantics::Symbol &sym);
-bool isTransferWithConversion(mlir::Value rhs);
+/// Check if the rhs has an implicit conversion. Return the elemental op if
+/// there is a conversion. Return null otherwise.
+hlfir::ElementalOp isTransferWithConversion(mlir::Value rhs);
} // end namespace Fortran::lower
diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h
index 5cd196a..273e611 100644
--- a/flang/include/flang/Lower/OpenMP/Clauses.h
+++ b/flang/include/flang/Lower/OpenMP/Clauses.h
@@ -282,6 +282,7 @@ using Replayable = tomp::clause::ReplayableT<TypeTy, IdTy, ExprTy>;
using ReverseOffload = tomp::clause::ReverseOffloadT<TypeTy, IdTy, ExprTy>;
using Safelen = tomp::clause::SafelenT<TypeTy, IdTy, ExprTy>;
using Schedule = tomp::clause::ScheduleT<TypeTy, IdTy, ExprTy>;
+using SelfMaps = tomp::clause::SelfMapsT<TypeTy, IdTy, ExprTy>;
using SeqCst = tomp::clause::SeqCstT<TypeTy, IdTy, ExprTy>;
using Severity = tomp::clause::SeverityT<TypeTy, IdTy, ExprTy>;
using Shared = tomp::clause::SharedT<TypeTy, IdTy, ExprTy>;
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 91af92c0..3501790 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -568,6 +568,7 @@ public:
NODE(OmpDoacross, Sink)
NODE(OmpDoacross, Source)
NODE(parser, OmpDoacrossClause)
+ NODE(parser, OmpDynamicAllocatorsClause)
NODE(parser, OmpDynGroupprivateClause)
NODE(OmpDynGroupprivateClause, Modifier)
NODE(parser, OmpEndDirective)
@@ -661,9 +662,11 @@ public:
NODE(parser, OmpRefModifier)
NODE_ENUM(OmpRefModifier, Value)
NODE(parser, OmpReplayableClause)
+ NODE(parser, OmpReverseOffloadClause)
NODE(parser, OmpScheduleClause)
NODE(OmpScheduleClause, Modifier)
NODE_ENUM(OmpScheduleClause, Kind)
+ NODE(parser, OmpSelfMapsClause)
NODE(parser, OmpSelfModifier)
NODE_ENUM(OmpSelfModifier, Value)
NODE(parser, OmpSeverityClause)
@@ -691,6 +694,8 @@ public:
NODE(parser, OmpTransparentClause)
NODE(parser, OmpTypeNameList)
NODE(parser, OmpTypeSpecifier)
+ NODE(parser, OmpUnifiedAddressClause)
+ NODE(parser, OmpUnifiedSharedMemoryClause)
NODE(parser, OmpUpdateClause)
NODE(parser, OmpUseClause)
NODE(parser, OmpVariableCategory)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index f52323c..bb3af9e 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -337,6 +337,7 @@ using IntConstantExpr = Integer<ConstantExpr>; // R1031
using ScalarLogicalExpr = Scalar<LogicalExpr>;
using ScalarIntExpr = Scalar<IntExpr>;
using ScalarIntConstantExpr = Scalar<IntConstantExpr>;
+using ScalarLogicalConstantExpr = Scalar<Logical<ConstantExpr>>;
using ScalarDefaultCharExpr = Scalar<DefaultCharExpr>;
// R1030 default-char-constant-expr is used in the Standard only as part of
// scalar-default-char-constant-expr.
@@ -4201,7 +4202,7 @@ struct OmpAffinityClause {
// Ref: 5.2: [174]
struct OmpAlignClause {
- WRAPPER_CLASS_BOILERPLATE(OmpAlignClause, ScalarIntExpr);
+ WRAPPER_CLASS_BOILERPLATE(OmpAlignClause, ScalarIntConstantExpr);
};
// Ref: [4.5:72-81], [5.0:110-119], [5.1:134-143], [5.2:169-170]
@@ -4426,6 +4427,16 @@ struct OmpDeviceTypeClause {
WRAPPER_CLASS_BOILERPLATE(OmpDeviceTypeClause, DeviceTypeDescription);
};
+// Ref: [5.0:60-63], [5.1:83-86], [5.2:212-213], [6.0:356-362]
+//
+// dynamic-allocators-clause ->
+// DYNAMIC_ALLOCATORS // since 5.0
+// [(scalar-logical-const-expr)] // since 6.0
+struct OmpDynamicAllocatorsClause {
+ WRAPPER_CLASS_BOILERPLATE(
+ OmpDynamicAllocatorsClause, ScalarLogicalConstantExpr);
+};
+
struct OmpDynGroupprivateClause {
TUPLE_CLASS_BOILERPLATE(OmpDynGroupprivateClause);
MODIFIER_BOILERPLATE(OmpAccessGroup, OmpPrescriptiveness);
@@ -4703,7 +4714,16 @@ struct OmpReductionClause {
// replayable-clause ->
// REPLAYABLE[(replayable-expression)] // since 6.0
struct OmpReplayableClause {
- WRAPPER_CLASS_BOILERPLATE(OmpReplayableClause, Scalar<Logical<ConstantExpr>>);
+ WRAPPER_CLASS_BOILERPLATE(OmpReplayableClause, ScalarLogicalConstantExpr);
+};
+
+// Ref: [5.0:60-63], [5.1:83-86], [5.2:212-213], [6.0:356-362]
+//
+// reverse-offload-clause ->
+// REVERSE_OFFLOAD // since 5.0
+// [(scalar-logical-const-expr)] // since 6.0
+struct OmpReverseOffloadClause {
+ WRAPPER_CLASS_BOILERPLATE(OmpReverseOffloadClause, ScalarLogicalConstantExpr);
};
// Ref: [4.5:56-63], [5.0:101-109], [5.1:126-133], [5.2:252-254]
@@ -4721,6 +4741,14 @@ struct OmpScheduleClause {
std::tuple<MODIFIERS(), Kind, std::optional<ScalarIntExpr>> t;
};
+// Ref: [6.0:361-362]
+//
+// self-maps-clause ->
+// SELF_MAPS [(scalar-logical-const-expr)] // since 6.0
+struct OmpSelfMapsClause {
+ WRAPPER_CLASS_BOILERPLATE(OmpSelfMapsClause, ScalarLogicalConstantExpr);
+};
+
// REF: [5.2:217]
// severity-clause ->
// SEVERITY(warning|fatal)
@@ -4763,6 +4791,25 @@ struct OmpTransparentClause {
WRAPPER_CLASS_BOILERPLATE(OmpTransparentClause, ScalarIntExpr);
};
+// Ref: [5.0:60-63], [5.1:83-86], [5.2:212-213], [6.0:356-362]
+//
+// unified-address-clause ->
+// UNIFIED_ADDRESS // since 5.0
+// [(scalar-logical-const-expr)] // since 6.0
+struct OmpUnifiedAddressClause {
+ WRAPPER_CLASS_BOILERPLATE(OmpUnifiedAddressClause, ScalarLogicalConstantExpr);
+};
+
+// Ref: [5.0:60-63], [5.1:83-86], [5.2:212-213], [6.0:356-362]
+//
+// unified-shared-memory-clause ->
+// UNIFIED_SHARED_MEMORY // since 5.0
+// [(scalar-logical-const-expr)] // since 6.0
+struct OmpUnifiedSharedMemoryClause {
+ WRAPPER_CLASS_BOILERPLATE(
+ OmpUnifiedSharedMemoryClause, ScalarLogicalConstantExpr);
+};
+
// Ref: [5.0:254-255], [5.1:287-288], [5.2:321-322]
//
// In ATOMIC construct
diff --git a/flang/include/flang/Parser/tools.h b/flang/include/flang/Parser/tools.h
index 447bccd..a90c856 100644
--- a/flang/include/flang/Parser/tools.h
+++ b/flang/include/flang/Parser/tools.h
@@ -117,6 +117,12 @@ template <typename A, typename B> const A *Unwrap(const B &x) {
template <typename A, typename B> A *Unwrap(B &x) {
return const_cast<A *>(Unwrap<A, B>(const_cast<const B &>(x)));
}
+template <typename A, typename B> const A &UnwrapRef(const B &x) {
+ return DEREF(Unwrap<A>(x));
+}
+template <typename A, typename B> A &UnwrapRef(B &x) {
+ return DEREF(Unwrap<A>(x));
+}
// Get the CoindexedNamedObject if the entity is a coindexed object.
const CoindexedNamedObject *GetCoindexedNamedObject(const AllocateObject &);
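
UnwrapRef is a null-checked convenience over the existing Unwrap: call sites below replace spelled-out chains like x.thing.thing.value() with a single call that fails hard via DEREF when the lookup comes back null. A self-contained sketch of the shape, with stand-in wrapper types rather than the real parser classes:

#include <cassert>

// Stand-ins for parser wrappers such as Scalar and common::Indirection.
struct Expr { int source; };
template <typename T> struct Indirection {
  T *p;
  const T &value() const { return *p; }
};
template <typename T> struct Scalar { Indirection<T> thing; };

// Pointer-returning lookup, as in parser::Unwrap.
template <typename A> const A *Unwrap(const Scalar<A> &x) {
  return &x.thing.value();
}

// UnwrapRef: assert non-null and yield a reference, so callers write
// UnwrapRef<Expr>(wrapped).source instead of wrapped.thing.value().source.
template <typename A, typename B> const A &UnwrapRef(const B &x) {
  const A *p{Unwrap<A>(x)};
  assert(p && "unwrap failed"); // stands in for DEREF in the real code
  return *p;
}

int main() {
  Expr e{42};
  Scalar<Expr> wrapped{{&e}};
  return UnwrapRef<Expr>(wrapped).source == 42 ? 0 : 1;
}
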
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 68adf34..525fb0e 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -4987,11 +4987,8 @@ private:
// host = device
if (!lhsIsDevice && rhsIsDevice) {
- if (Fortran::lower::isTransferWithConversion(rhs)) {
+ if (auto elementalOp = Fortran::lower::isTransferWithConversion(rhs)) {
mlir::OpBuilder::InsertionGuard insertionGuard(builder);
- auto elementalOp =
- mlir::dyn_cast<hlfir::ElementalOp>(rhs.getDefiningOp());
- assert(elementalOp && "expect elemental op");
auto designateOp =
*elementalOp.getBody()->getOps<hlfir::DesignateOp>().begin();
builder.setInsertionPoint(elementalOp);
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index bb4bdee..9501b0e 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -68,11 +68,26 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
return cuf::getDataAttribute(mlirContext, cudaAttr);
}
-bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
+hlfir::ElementalOp Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
+ auto isConversionElementalOp = [](hlfir::ElementalOp elOp) {
+    return llvm::hasSingleElement(
+               elOp.getBody()->getOps<hlfir::DesignateOp>()) &&
+           llvm::hasSingleElement(elOp.getBody()->getOps<fir::LoadOp>()) &&
+           llvm::hasSingleElement(elOp.getBody()->getOps<fir::ConvertOp>());
+ };
+ if (auto declOp = mlir::dyn_cast<hlfir::DeclareOp>(rhs.getDefiningOp())) {
+ if (!declOp.getMemref().getDefiningOp())
+ return {};
+ if (auto associateOp = mlir::dyn_cast<hlfir::AssociateOp>(
+ declOp.getMemref().getDefiningOp()))
+ if (auto elOp = mlir::dyn_cast<hlfir::ElementalOp>(
+ associateOp.getSource().getDefiningOp()))
+ if (isConversionElementalOp(elOp))
+ return elOp;
+ }
if (auto elOp = mlir::dyn_cast<hlfir::ElementalOp>(rhs.getDefiningOp()))
- if (llvm::hasSingleElement(elOp.getBody()->getOps<hlfir::DesignateOp>()) &&
- llvm::hasSingleElement(elOp.getBody()->getOps<fir::LoadOp>()) == 1 &&
- llvm::hasSingleElement(elOp.getBody()->getOps<fir::ConvertOp>()) == 1)
- return true;
- return false;
+ if (isConversionElementalOp(elOp))
+ return elOp;
+ return {};
}
diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp
index 604b137..cd53dc9 100644
--- a/flang/lib/Lower/IO.cpp
+++ b/flang/lib/Lower/IO.cpp
@@ -950,7 +950,8 @@ static void genIoLoop(Fortran::lower::AbstractConverter &converter,
makeNextConditionalOn(builder, loc, checkResult, ok, inLoop);
const auto &itemList = std::get<0>(ioImpliedDo.t);
const auto &control = std::get<1>(ioImpliedDo.t);
- const auto &loopSym = *control.name.thing.thing.symbol;
+ const auto &loopSym =
+ *Fortran::parser::UnwrapRef<Fortran::parser::Name>(control.name).symbol;
mlir::Value loopVar = fir::getBase(converter.genExprAddr(
Fortran::evaluate::AsGenericExpr(loopSym).value(), stmtCtx));
auto genControlValue = [&](const Fortran::parser::ScalarIntExpr &expr) {
diff --git a/flang/lib/Lower/OpenMP/Atomic.cpp b/flang/lib/Lower/OpenMP/Atomic.cpp
index ff82a36..3ab8a58 100644
--- a/flang/lib/Lower/OpenMP/Atomic.cpp
+++ b/flang/lib/Lower/OpenMP/Atomic.cpp
@@ -20,6 +20,7 @@
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/openmp-utils.h"
#include "flang/Semantics/semantics.h"
#include "flang/Semantics/type.h"
#include "flang/Support/Fortran.h"
@@ -183,12 +184,8 @@ getMemoryOrderFromRequires(const semantics::Scope &scope) {
// scope.
// For safety, traverse all enclosing scopes and check if their symbol
// contains REQUIRES.
- for (const auto *sc{&scope}; sc->kind() != semantics::Scope::Kind::Global;
- sc = &sc->parent()) {
- const semantics::Symbol *sym = sc->symbol();
- if (!sym)
- continue;
-
+ const semantics::Scope &unitScope = semantics::omp::GetProgramUnit(scope);
+ if (auto *symbol = unitScope.symbol()) {
const common::OmpMemoryOrderType *admo = common::visit(
[](auto &&s) {
using WithOmpDeclarative = semantics::WithOmpDeclarative;
@@ -198,7 +195,8 @@ getMemoryOrderFromRequires(const semantics::Scope &scope) {
}
return static_cast<const common::OmpMemoryOrderType *>(nullptr);
},
- sym->details());
+ symbol->details());
+
if (admo)
return getMemoryOrderKind(*admo);
}
@@ -214,19 +212,83 @@ getDefaultAtomicMemOrder(semantics::SemanticsContext &semaCtx) {
return std::nullopt;
}
-static std::optional<mlir::omp::ClauseMemoryOrderKind>
+static std::pair<std::optional<mlir::omp::ClauseMemoryOrderKind>, bool>
getAtomicMemoryOrder(semantics::SemanticsContext &semaCtx,
const omp::List<omp::Clause> &clauses,
const semantics::Scope &scope) {
for (const omp::Clause &clause : clauses) {
if (auto maybeKind = getMemoryOrderKind(clause.id))
- return *maybeKind;
+ return std::make_pair(*maybeKind, /*canOverride=*/false);
}
if (auto maybeKind = getMemoryOrderFromRequires(scope))
- return *maybeKind;
+ return std::make_pair(*maybeKind, /*canOverride=*/true);
- return getDefaultAtomicMemOrder(semaCtx);
+ return std::make_pair(getDefaultAtomicMemOrder(semaCtx),
+ /*canOverride=*/false);
+}
+
+static std::optional<mlir::omp::ClauseMemoryOrderKind>
+makeValidForAction(std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
+ int action0, int action1, unsigned version) {
+ // When the atomic default memory order specified on a REQUIRES directive is
+ // disallowed on a given ATOMIC operation, and it's not ACQ_REL, the order
+ // reverts to RELAXED. ACQ_REL decays to either ACQUIRE or RELEASE, depending
+ // on the operation.
+
+ if (!memOrder) {
+ return memOrder;
+ }
+
+ using Analysis = parser::OpenMPAtomicConstruct::Analysis;
+ // Figure out the main action (i.e. disregard a potential capture operation)
+ int action = action0;
+ if (action1 != Analysis::None)
+ action = action0 == Analysis::Read ? action1 : action0;
+
+  // Available orderings: acquire, acq_rel, relaxed, release, seq_cst
+
+ if (action == Analysis::Read) {
+ // "acq_rel" decays to "acquire"
+ if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acq_rel)
+ return mlir::omp::ClauseMemoryOrderKind::Acquire;
+ } else if (action == Analysis::Write) {
+ // "acq_rel" decays to "release"
+ if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acq_rel)
+ return mlir::omp::ClauseMemoryOrderKind::Release;
+ }
+
+ if (version > 50) {
+ if (action == Analysis::Read) {
+ // "release" prohibited
+ if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Release)
+ return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+ }
+ if (action == Analysis::Write) {
+ // "acquire" prohibited
+ if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acquire)
+ return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+ }
+ } else {
+ if (action == Analysis::Read) {
+ // "release" prohibited
+ if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Release)
+ return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+ } else {
+      if (action & Analysis::Write) { // includes "update"
+ // "acquire" prohibited
+ if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acquire)
+ return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+ if (action == Analysis::Update) {
+ // "acq_rel" prohibited
+ if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acq_rel)
+ return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+ }
+ }
+ }
+ }
+
+ return memOrder;
}
static mlir::omp::ClauseMemoryOrderKindAttr
@@ -449,16 +511,19 @@ void Fortran::lower::omp::lowerAtomic(
mlir::Value atomAddr =
fir::getBase(converter.genExprAddr(atom, stmtCtx, &loc));
mlir::IntegerAttr hint = getAtomicHint(converter, clauses);
- std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder =
- getAtomicMemoryOrder(semaCtx, clauses,
- semaCtx.FindScope(construct.source));
+ auto [memOrder, canOverride] = getAtomicMemoryOrder(
+ semaCtx, clauses, semaCtx.FindScope(construct.source));
+
+ unsigned version = semaCtx.langOptions().OpenMPVersion;
+ int action0 = analysis.op0.what & analysis.Action;
+ int action1 = analysis.op1.what & analysis.Action;
+ if (canOverride)
+ memOrder = makeValidForAction(memOrder, action0, action1, version);
if (auto *cond = get(analysis.cond)) {
(void)cond;
TODO(loc, "OpenMP ATOMIC COMPARE");
} else {
- int action0 = analysis.op0.what & analysis.Action;
- int action1 = analysis.op1.what & analysis.Action;
mlir::Operation *captureOp = nullptr;
fir::FirOpBuilder::InsertPoint preAt = builder.saveInsertionPoint();
fir::FirOpBuilder::InsertPoint atomicAt, postAt;
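
The decay rules implemented by makeValidForAction can be summarized in isolation (a sketch with plain enums standing in for mlir::omp::ClauseMemoryOrderKind and the analysis actions; not the actual flang API, and it only applies when the order came from REQUIRES and canOverride is set):

enum class Order { Acquire, AcqRel, Relaxed, Release, SeqCst };
enum class Action { Read, Write, Update };

Order decayForAction(Order o, Action a, unsigned version) {
  // ACQ_REL decays by operation for plain reads and writes.
  if (o == Order::AcqRel && a == Action::Read)
    return Order::Acquire;
  if (o == Order::AcqRel && a == Action::Write)
    return Order::Release;
  // RELEASE is disallowed on reads: revert to RELAXED.
  if (a == Action::Read && o == Order::Release)
    return Order::Relaxed;
  if (version > 50) {
    // 5.1 and later: ACQUIRE is disallowed on writes.
    if (a == Action::Write && o == Order::Acquire)
      return Order::Relaxed;
  } else {
    // Pre-5.1: ACQUIRE is disallowed on writes and updates...
    if (a != Action::Read && o == Order::Acquire)
      return Order::Relaxed;
    // ...and ACQ_REL is disallowed on updates.
    if (a == Action::Update && o == Order::AcqRel)
      return Order::Relaxed;
  }
  return o;
}
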
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 0842c62..ba34212 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -219,7 +219,6 @@ MAKE_EMPTY_CLASS(AcqRel, AcqRel);
MAKE_EMPTY_CLASS(Acquire, Acquire);
MAKE_EMPTY_CLASS(Capture, Capture);
MAKE_EMPTY_CLASS(Compare, Compare);
-MAKE_EMPTY_CLASS(DynamicAllocators, DynamicAllocators);
MAKE_EMPTY_CLASS(Full, Full);
MAKE_EMPTY_CLASS(Inbranch, Inbranch);
MAKE_EMPTY_CLASS(Mergeable, Mergeable);
@@ -235,13 +234,9 @@ MAKE_EMPTY_CLASS(OmpxBare, OmpxBare);
MAKE_EMPTY_CLASS(Read, Read);
MAKE_EMPTY_CLASS(Relaxed, Relaxed);
MAKE_EMPTY_CLASS(Release, Release);
-MAKE_EMPTY_CLASS(ReverseOffload, ReverseOffload);
MAKE_EMPTY_CLASS(SeqCst, SeqCst);
-MAKE_EMPTY_CLASS(SelfMaps, SelfMaps);
MAKE_EMPTY_CLASS(Simd, Simd);
MAKE_EMPTY_CLASS(Threads, Threads);
-MAKE_EMPTY_CLASS(UnifiedAddress, UnifiedAddress);
-MAKE_EMPTY_CLASS(UnifiedSharedMemory, UnifiedSharedMemory);
MAKE_EMPTY_CLASS(Unknown, Unknown);
MAKE_EMPTY_CLASS(Untied, Untied);
MAKE_EMPTY_CLASS(Weak, Weak);
@@ -775,7 +770,18 @@ Doacross make(const parser::OmpClause::Doacross &inp,
return makeDoacross(inp.v.v, semaCtx);
}
-// DynamicAllocators: empty
+DynamicAllocators make(const parser::OmpClause::DynamicAllocators &inp,
+ semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::optional<parser::OmpDynamicAllocatorsClause>
+ auto &&maybeRequired = maybeApply(
+ [&](const parser::OmpDynamicAllocatorsClause &c) {
+ return makeExpr(c.v, semaCtx);
+ },
+ inp.v);
+
+ return DynamicAllocators{/*Required=*/std::move(maybeRequired)};
+}
+
DynGroupprivate make(const parser::OmpClause::DynGroupprivate &inp,
semantics::SemanticsContext &semaCtx) {
@@ -1338,7 +1344,18 @@ Reduction make(const parser::OmpClause::Reduction &inp,
// Relaxed: empty
// Release: empty
-// ReverseOffload: empty
+
+ReverseOffload make(const parser::OmpClause::ReverseOffload &inp,
+ semantics::SemanticsContext &semaCtx) {
+ // inp.v -> std::optional<parser::OmpReverseOffloadClause>
+ auto &&maybeRequired = maybeApply(
+ [&](const parser::OmpReverseOffloadClause &c) {
+ return makeExpr(c.v, semaCtx);
+ },
+ inp.v);
+
+ return ReverseOffload{/*Required=*/std::move(maybeRequired)};
+}
Safelen make(const parser::OmpClause::Safelen &inp,
semantics::SemanticsContext &semaCtx) {
@@ -1391,6 +1408,18 @@ Schedule make(const parser::OmpClause::Schedule &inp,
// SeqCst: empty
+SelfMaps make(const parser::OmpClause::SelfMaps &inp,
+ semantics::SemanticsContext &semaCtx) {
+ // inp.v -> std::optional<parser::OmpSelfMapsClause>
+ auto &&maybeRequired = maybeApply(
+ [&](const parser::OmpSelfMapsClause &c) {
+ return makeExpr(c.v, semaCtx);
+ },
+ inp.v);
+
+ return SelfMaps{/*Required=*/std::move(maybeRequired)};
+}
+
Severity make(const parser::OmpClause::Severity &inp,
semantics::SemanticsContext &semaCtx) {
// inp -> empty
@@ -1480,8 +1509,29 @@ To make(const parser::OmpClause::To &inp,
/*LocatorList=*/makeObjects(t3, semaCtx)}};
}
-// UnifiedAddress: empty
-// UnifiedSharedMemory: empty
+UnifiedAddress make(const parser::OmpClause::UnifiedAddress &inp,
+ semantics::SemanticsContext &semaCtx) {
+ // inp.v -> std::optional<parser::OmpUnifiedAddressClause>
+ auto &&maybeRequired = maybeApply(
+ [&](const parser::OmpUnifiedAddressClause &c) {
+ return makeExpr(c.v, semaCtx);
+ },
+ inp.v);
+
+ return UnifiedAddress{/*Required=*/std::move(maybeRequired)};
+}
+
+UnifiedSharedMemory make(const parser::OmpClause::UnifiedSharedMemory &inp,
+ semantics::SemanticsContext &semaCtx) {
+ // inp.v -> std::optional<parser::OmpUnifiedSharedMemoryClause>
+ auto &&maybeRequired = maybeApply(
+ [&](const parser::OmpUnifiedSharedMemoryClause &c) {
+ return makeExpr(c.v, semaCtx);
+ },
+ inp.v);
+
+ return UnifiedSharedMemory{/*Required=*/std::move(maybeRequired)};
+}
Uniform make(const parser::OmpClause::Uniform &inp,
semantics::SemanticsContext &semaCtx) {
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index b5771eb..d677e14 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -1094,7 +1094,7 @@ TYPE_PARSER(construct<OmpBindClause>(
"TEAMS" >> pure(OmpBindClause::Binding::Teams) ||
"THREAD" >> pure(OmpBindClause::Binding::Thread)))
-TYPE_PARSER(construct<OmpAlignClause>(scalarIntExpr))
+TYPE_PARSER(construct<OmpAlignClause>(scalarIntConstantExpr))
TYPE_PARSER(construct<OmpAtClause>(
"EXECUTION" >> pure(OmpAtClause::ActionTime::Execution) ||
@@ -1167,7 +1167,8 @@ TYPE_PARSER( //
"DOACROSS" >>
construct<OmpClause>(parenthesized(Parser<OmpDoacrossClause>{})) ||
"DYNAMIC_ALLOCATORS" >>
- construct<OmpClause>(construct<OmpClause::DynamicAllocators>()) ||
+ construct<OmpClause>(construct<OmpClause::DynamicAllocators>(
+ maybe(parenthesized(scalarLogicalConstantExpr)))) ||
"DYN_GROUPPRIVATE" >>
construct<OmpClause>(construct<OmpClause::DynGroupprivate>(
parenthesized(Parser<OmpDynGroupprivateClause>{}))) ||
@@ -1279,12 +1280,15 @@ TYPE_PARSER( //
"REPLAYABLE" >> construct<OmpClause>(construct<OmpClause::Replayable>(
maybe(parenthesized(Parser<OmpReplayableClause>{})))) ||
"REVERSE_OFFLOAD" >>
- construct<OmpClause>(construct<OmpClause::ReverseOffload>()) ||
+ construct<OmpClause>(construct<OmpClause::ReverseOffload>(
+ maybe(parenthesized(scalarLogicalConstantExpr)))) ||
"SAFELEN" >> construct<OmpClause>(construct<OmpClause::Safelen>(
parenthesized(scalarIntConstantExpr))) ||
"SCHEDULE" >> construct<OmpClause>(construct<OmpClause::Schedule>(
parenthesized(Parser<OmpScheduleClause>{}))) ||
"SEQ_CST" >> construct<OmpClause>(construct<OmpClause::SeqCst>()) ||
+ "SELF_MAPS" >> construct<OmpClause>(construct<OmpClause::SelfMaps>(
+ maybe(parenthesized(scalarLogicalConstantExpr)))) ||
"SEVERITY" >> construct<OmpClause>(construct<OmpClause::Severity>(
parenthesized(Parser<OmpSeverityClause>{}))) ||
"SHARED" >> construct<OmpClause>(construct<OmpClause::Shared>(
@@ -1312,9 +1316,11 @@ TYPE_PARSER( //
construct<OmpClause>(construct<OmpClause::UseDeviceAddr>(
parenthesized(Parser<OmpObjectList>{}))) ||
"UNIFIED_ADDRESS" >>
- construct<OmpClause>(construct<OmpClause::UnifiedAddress>()) ||
+ construct<OmpClause>(construct<OmpClause::UnifiedAddress>(
+ maybe(parenthesized(scalarLogicalConstantExpr)))) ||
"UNIFIED_SHARED_MEMORY" >>
- construct<OmpClause>(construct<OmpClause::UnifiedSharedMemory>()) ||
+ construct<OmpClause>(construct<OmpClause::UnifiedSharedMemory>(
+ maybe(parenthesized(scalarLogicalConstantExpr)))) ||
"UNIFORM" >> construct<OmpClause>(construct<OmpClause::Uniform>(
parenthesized(nonemptyList(name)))) ||
"UNTIED" >> construct<OmpClause>(construct<OmpClause::Untied>()) ||
diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp
index cb30939..8cbaa39 100644
--- a/flang/lib/Parser/parse-tree.cpp
+++ b/flang/lib/Parser/parse-tree.cpp
@@ -185,7 +185,7 @@ StructureConstructor ArrayElement::ConvertToStructureConstructor(
std::list<ComponentSpec> components;
for (auto &subscript : subscripts) {
components.emplace_back(std::optional<Keyword>{},
- ComponentDataSource{std::move(*Unwrap<Expr>(subscript))});
+ ComponentDataSource{std::move(UnwrapRef<Expr>(subscript))});
}
DerivedTypeSpec spec{std::move(name), std::list<TypeParamSpec>{}};
spec.derivedTypeSpec = &derived;
diff --git a/flang/lib/Semantics/assignment.cpp b/flang/lib/Semantics/assignment.cpp
index f4aa496..1824a7d 100644
--- a/flang/lib/Semantics/assignment.cpp
+++ b/flang/lib/Semantics/assignment.cpp
@@ -194,7 +194,8 @@ void AssignmentContext::CheckShape(parser::CharBlock at, const SomeExpr *expr) {
template <typename A> void AssignmentContext::PushWhereContext(const A &x) {
const auto &expr{std::get<parser::LogicalExpr>(x.t)};
- CheckShape(expr.thing.value().source, GetExpr(context_, expr));
+ CheckShape(
+ parser::UnwrapRef<parser::Expr>(expr).source, GetExpr(context_, expr));
++whereDepth_;
}
diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp
index 823aa4e..e019bbd 100644
--- a/flang/lib/Semantics/check-allocate.cpp
+++ b/flang/lib/Semantics/check-allocate.cpp
@@ -151,7 +151,9 @@ static std::optional<AllocateCheckerInfo> CheckAllocateOptions(
[&](const parser::MsgVariable &var) {
WarnOnDeferredLengthCharacterScalar(context,
GetExpr(context, var),
- var.v.thing.thing.GetSource(), "ERRMSG=");
+ parser::UnwrapRef<parser::Variable>(var)
+ .GetSource(),
+ "ERRMSG=");
if (info.gotMsg) { // C943
context.Say(
"ERRMSG may not be duplicated in a ALLOCATE statement"_err_en_US);
@@ -439,7 +441,7 @@ static bool HaveCompatibleLengths(
evaluate::ToInt64(type1.characterTypeSpec().length().GetExplicit())};
auto v2{
evaluate::ToInt64(type2.characterTypeSpec().length().GetExplicit())};
- return !v1 || !v2 || *v1 == *v2;
+ return !v1 || !v2 || (*v1 >= 0 ? *v1 : 0) == (*v2 >= 0 ? *v2 : 0);
} else {
return true;
}
@@ -452,7 +454,7 @@ static bool HaveCompatibleLengths(
auto v1{
evaluate::ToInt64(type1.characterTypeSpec().length().GetExplicit())};
auto v2{type2.knownLength()};
- return !v1 || !v2 || *v1 == *v2;
+ return !v1 || !v2 || (*v1 >= 0 ? *v1 : 0) == (*v2 >= 0 ? *v2 : 0);
} else {
return true;
}
@@ -598,7 +600,7 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) {
std::optional<evaluate::ConstantSubscript> lbound;
if (const auto &lb{std::get<0>(shapeSpec.t)}) {
lbound.reset();
- const auto &lbExpr{lb->thing.thing.value()};
+ const auto &lbExpr{parser::UnwrapRef<parser::Expr>(lb)};
if (const auto *expr{GetExpr(context, lbExpr)}) {
auto folded{
evaluate::Fold(context.foldingContext(), SomeExpr(*expr))};
@@ -609,7 +611,8 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) {
lbound = 1;
}
if (lbound) {
- const auto &ubExpr{std::get<1>(shapeSpec.t).thing.thing.value()};
+ const auto &ubExpr{
+ parser::UnwrapRef<parser::Expr>(std::get<1>(shapeSpec.t))};
if (const auto *expr{GetExpr(context, ubExpr)}) {
auto folded{
evaluate::Fold(context.foldingContext(), SomeExpr(*expr))};
diff --git a/flang/lib/Semantics/check-case.cpp b/flang/lib/Semantics/check-case.cpp
index 5ce143c..7593154 100644
--- a/flang/lib/Semantics/check-case.cpp
+++ b/flang/lib/Semantics/check-case.cpp
@@ -72,7 +72,7 @@ private:
}
std::optional<Value> GetValue(const parser::CaseValue &caseValue) {
- const parser::Expr &expr{caseValue.thing.thing.value()};
+ const auto &expr{parser::UnwrapRef<parser::Expr>(caseValue)};
auto *x{expr.typedExpr.get()};
if (x && x->v) { // C1147
auto type{x->v->GetType()};
diff --git a/flang/lib/Semantics/check-coarray.cpp b/flang/lib/Semantics/check-coarray.cpp
index 0e444f1..9113369 100644
--- a/flang/lib/Semantics/check-coarray.cpp
+++ b/flang/lib/Semantics/check-coarray.cpp
@@ -112,7 +112,7 @@ static void CheckTeamType(
static void CheckTeamStat(
SemanticsContext &context, const parser::ImageSelectorSpec::Stat &stat) {
- const parser::Variable &var{stat.v.thing.thing.value()};
+ const auto &var{parser::UnwrapRef<parser::Variable>(stat)};
if (parser::GetCoindexedNamedObject(var)) {
context.Say(parser::FindSourceLocation(var), // C931
"Image selector STAT variable must not be a coindexed "
@@ -147,7 +147,8 @@ static void CheckSyncStat(SemanticsContext &context,
},
[&](const parser::MsgVariable &var) {
WarnOnDeferredLengthCharacterScalar(context, GetExpr(context, var),
- var.v.thing.thing.GetSource(), "ERRMSG=");
+ parser::UnwrapRef<parser::Variable>(var).GetSource(),
+ "ERRMSG=");
if (gotMsg) {
context.Say( // C1172
"The errmsg-variable in a sync-stat-list may not be repeated"_err_en_US);
@@ -260,7 +261,9 @@ static void CheckEventWaitSpecList(SemanticsContext &context,
[&](const parser::MsgVariable &var) {
WarnOnDeferredLengthCharacterScalar(context,
GetExpr(context, var),
- var.v.thing.thing.GetSource(), "ERRMSG=");
+ parser::UnwrapRef<parser::Variable>(var)
+ .GetSource(),
+ "ERRMSG=");
if (gotMsg) {
context.Say( // C1178
"A errmsg-variable in a event-wait-spec-list may not be repeated"_err_en_US);
diff --git a/flang/lib/Semantics/check-data.cpp b/flang/lib/Semantics/check-data.cpp
index 5459290..3bcf711 100644
--- a/flang/lib/Semantics/check-data.cpp
+++ b/flang/lib/Semantics/check-data.cpp
@@ -25,9 +25,10 @@ namespace Fortran::semantics {
// Ensures that references to an implied DO loop control variable are
// represented as such in the "body" of the implied DO loop.
void DataChecker::Enter(const parser::DataImpliedDo &x) {
- auto name{std::get<parser::DataImpliedDo::Bounds>(x.t).name.thing.thing};
+ const auto &name{parser::UnwrapRef<parser::Name>(
+ std::get<parser::DataImpliedDo::Bounds>(x.t).name)};
int kind{evaluate::ResultType<evaluate::ImpliedDoIndex>::kind};
- if (const auto dynamicType{evaluate::DynamicType::From(*name.symbol)}) {
+ if (const auto dynamicType{evaluate::DynamicType::From(DEREF(name.symbol))}) {
if (dynamicType->category() == TypeCategory::Integer) {
kind = dynamicType->kind();
}
@@ -36,7 +37,8 @@ void DataChecker::Enter(const parser::DataImpliedDo &x) {
}
void DataChecker::Leave(const parser::DataImpliedDo &x) {
- auto name{std::get<parser::DataImpliedDo::Bounds>(x.t).name.thing.thing};
+ const auto &name{parser::UnwrapRef<parser::Name>(
+ std::get<parser::DataImpliedDo::Bounds>(x.t).name)};
exprAnalyzer_.RemoveImpliedDo(name.source);
}
@@ -211,7 +213,7 @@ void DataChecker::Leave(const parser::DataIDoObject &object) {
std::get_if<parser::Scalar<common::Indirection<parser::Designator>>>(
&object.u)}) {
if (MaybeExpr expr{exprAnalyzer_.Analyze(*designator)}) {
- auto source{designator->thing.value().source};
+ auto source{parser::UnwrapRef<parser::Designator>(*designator).source};
DataVarChecker checker{exprAnalyzer_.context(), source};
if (checker(*expr)) {
if (checker.HasComponentWithoutSubscripts()) { // C880
diff --git a/flang/lib/Semantics/check-deallocate.cpp b/flang/lib/Semantics/check-deallocate.cpp
index c45b585..c1ebc5f 100644
--- a/flang/lib/Semantics/check-deallocate.cpp
+++ b/flang/lib/Semantics/check-deallocate.cpp
@@ -114,7 +114,8 @@ void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) {
},
[&](const parser::MsgVariable &var) {
WarnOnDeferredLengthCharacterScalar(context_,
- GetExpr(context_, var), var.v.thing.thing.GetSource(),
+ GetExpr(context_, var),
+ parser::UnwrapRef<parser::Variable>(var).GetSource(),
"ERRMSG=");
if (gotMsg) {
context_.Say(
diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp
index a2f3685..8a47340 100644
--- a/flang/lib/Semantics/check-do-forall.cpp
+++ b/flang/lib/Semantics/check-do-forall.cpp
@@ -535,7 +535,8 @@ private:
if (const SomeExpr * expr{GetExpr(context_, scalarExpression)}) {
if (!ExprHasTypeCategory(*expr, TypeCategory::Integer)) {
// No warnings or errors for type INTEGER
- const parser::CharBlock &loc{scalarExpression.thing.value().source};
+ parser::CharBlock loc{
+ parser::UnwrapRef<parser::Expr>(scalarExpression).source};
CheckDoControl(loc, ExprHasTypeCategory(*expr, TypeCategory::Real));
}
}
@@ -552,7 +553,7 @@ private:
CheckDoExpression(*bounds.step);
if (IsZero(*bounds.step)) {
context_.Warn(common::UsageWarning::ZeroDoStep,
- bounds.step->thing.value().source,
+ parser::UnwrapRef<parser::Expr>(bounds.step).source,
"DO step expression should not be zero"_warn_en_US);
}
}
@@ -615,7 +616,7 @@ private:
// C1121 - procedures in mask must be pure
void CheckMaskIsPure(const parser::ScalarLogicalExpr &mask) const {
UnorderedSymbolSet references{
- GatherSymbolsFromExpression(mask.thing.thing.value())};
+ GatherSymbolsFromExpression(parser::UnwrapRef<parser::Expr>(mask))};
for (const Symbol &ref : OrderBySourcePosition(references)) {
if (IsProcedure(ref) && !IsPureProcedure(ref)) {
context_.SayWithDecl(ref, parser::Unwrap<parser::Expr>(mask)->source,
@@ -639,32 +640,33 @@ private:
}
void HasNoReferences(const UnorderedSymbolSet &indexNames,
- const parser::ScalarIntExpr &expr) const {
- CheckNoCollisions(GatherSymbolsFromExpression(expr.thing.thing.value()),
- indexNames,
+ const parser::ScalarIntExpr &scalarIntExpr) const {
+ const auto &expr{parser::UnwrapRef<parser::Expr>(scalarIntExpr)};
+ CheckNoCollisions(GatherSymbolsFromExpression(expr), indexNames,
"%s limit expression may not reference index variable '%s'"_err_en_US,
- expr.thing.thing.value().source);
+ expr.source);
}
// C1129, names in local locality-specs can't be in mask expressions
void CheckMaskDoesNotReferenceLocal(const parser::ScalarLogicalExpr &mask,
const UnorderedSymbolSet &localVars) const {
- CheckNoCollisions(GatherSymbolsFromExpression(mask.thing.thing.value()),
- localVars,
+ const auto &expr{parser::UnwrapRef<parser::Expr>(mask)};
+ CheckNoCollisions(GatherSymbolsFromExpression(expr), localVars,
"%s mask expression references variable '%s'"
" in LOCAL locality-spec"_err_en_US,
- mask.thing.thing.value().source);
+ expr.source);
}
// C1129, names in local locality-specs can't be in limit or step
// expressions
- void CheckExprDoesNotReferenceLocal(const parser::ScalarIntExpr &expr,
+ void CheckExprDoesNotReferenceLocal(
+ const parser::ScalarIntExpr &scalarIntExpr,
const UnorderedSymbolSet &localVars) const {
- CheckNoCollisions(GatherSymbolsFromExpression(expr.thing.thing.value()),
- localVars,
+ const auto &expr{parser::UnwrapRef<parser::Expr>(scalarIntExpr)};
+ CheckNoCollisions(GatherSymbolsFromExpression(expr), localVars,
"%s expression references variable '%s'"
" in LOCAL locality-spec"_err_en_US,
- expr.thing.thing.value().source);
+ expr.source);
}
// C1130, DEFAULT(NONE) locality requires names to be in locality-specs to
@@ -772,7 +774,7 @@ private:
HasNoReferences(indexNames, std::get<2>(control.t));
if (const auto &intExpr{
std::get<std::optional<parser::ScalarIntExpr>>(control.t)}) {
- const parser::Expr &expr{intExpr->thing.thing.value()};
+ const auto &expr{parser::UnwrapRef<parser::Expr>(intExpr)};
CheckNoCollisions(GatherSymbolsFromExpression(expr), indexNames,
"%s step expression may not reference index variable '%s'"_err_en_US,
expr.source);
@@ -840,7 +842,7 @@ private:
}
void CheckForImpureCall(const parser::ScalarIntExpr &x,
std::optional<IndexVarKind> nesting) const {
- const auto &parsedExpr{x.thing.thing.value()};
+ const auto &parsedExpr{parser::UnwrapRef<parser::Expr>(x)};
auto oldLocation{context_.location()};
context_.set_location(parsedExpr.source);
if (const auto &typedExpr{parsedExpr.typedExpr}) {
@@ -1124,7 +1126,8 @@ void DoForallChecker::Leave(const parser::ConnectSpec &connectSpec) {
const auto *newunit{
std::get_if<parser::ConnectSpec::Newunit>(&connectSpec.u)};
if (newunit) {
- context_.CheckIndexVarRedefine(newunit->v.thing.thing);
+ context_.CheckIndexVarRedefine(
+ parser::UnwrapRef<parser::Variable>(newunit));
}
}
@@ -1166,14 +1169,14 @@ void DoForallChecker::Leave(const parser::InquireSpec &inquireSpec) {
const auto *intVar{std::get_if<parser::InquireSpec::IntVar>(&inquireSpec.u)};
if (intVar) {
const auto &scalar{std::get<parser::ScalarIntVariable>(intVar->t)};
- context_.CheckIndexVarRedefine(scalar.thing.thing);
+ context_.CheckIndexVarRedefine(parser::UnwrapRef<parser::Variable>(scalar));
}
}
void DoForallChecker::Leave(const parser::IoControlSpec &ioControlSpec) {
const auto *size{std::get_if<parser::IoControlSpec::Size>(&ioControlSpec.u)};
if (size) {
- context_.CheckIndexVarRedefine(size->v.thing.thing);
+ context_.CheckIndexVarRedefine(parser::UnwrapRef<parser::Variable>(size));
}
}
@@ -1190,16 +1193,19 @@ static void CheckIoImpliedDoIndex(
void DoForallChecker::Leave(const parser::OutputImpliedDo &outputImpliedDo) {
CheckIoImpliedDoIndex(context_,
- std::get<parser::IoImpliedDoControl>(outputImpliedDo.t).name.thing.thing);
+ parser::UnwrapRef<parser::Name>(
+ std::get<parser::IoImpliedDoControl>(outputImpliedDo.t).name));
}
void DoForallChecker::Leave(const parser::InputImpliedDo &inputImpliedDo) {
CheckIoImpliedDoIndex(context_,
- std::get<parser::IoImpliedDoControl>(inputImpliedDo.t).name.thing.thing);
+ parser::UnwrapRef<parser::Name>(
+ std::get<parser::IoImpliedDoControl>(inputImpliedDo.t).name));
}
void DoForallChecker::Leave(const parser::StatVariable &statVariable) {
- context_.CheckIndexVarRedefine(statVariable.v.thing.thing);
+ context_.CheckIndexVarRedefine(
+ parser::UnwrapRef<parser::Variable>(statVariable));
}
} // namespace Fortran::semantics
diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp
index a1ff4b9..19059ad 100644
--- a/flang/lib/Semantics/check-io.cpp
+++ b/flang/lib/Semantics/check-io.cpp
@@ -424,8 +424,8 @@ void IoChecker::Enter(const parser::InquireSpec::CharVar &spec) {
specKind = IoSpecKind::Dispose;
break;
}
- const parser::Variable &var{
- std::get<parser::ScalarDefaultCharVariable>(spec.t).thing.thing};
+ const auto &var{parser::UnwrapRef<parser::Variable>(
+ std::get<parser::ScalarDefaultCharVariable>(spec.t))};
std::string what{parser::ToUpperCaseLetters(common::EnumToString(specKind))};
CheckForDefinableVariable(var, what);
WarnOnDeferredLengthCharacterScalar(
@@ -627,7 +627,7 @@ void IoChecker::Enter(const parser::IoUnit &spec) {
}
void IoChecker::Enter(const parser::MsgVariable &msgVar) {
- const parser::Variable &var{msgVar.v.thing.thing};
+ const auto &var{parser::UnwrapRef<parser::Variable>(msgVar)};
if (stmt_ == IoStmtKind::None) {
// allocate, deallocate, image control
CheckForDefinableVariable(var, "ERRMSG");
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 4b5610a..ea6fe43 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1517,19 +1517,42 @@ void OmpStructureChecker::Leave(const parser::OpenMPDepobjConstruct &x) {
void OmpStructureChecker::Enter(const parser::OpenMPRequiresConstruct &x) {
const auto &dirName{x.v.DirName()};
PushContextAndClauseSets(dirName.source, dirName.v);
+ unsigned version{context_.langOptions().OpenMPVersion};
- if (visitedAtomicSource_.empty()) {
- return;
- }
for (const parser::OmpClause &clause : x.v.Clauses().v) {
llvm::omp::Clause id{clause.Id()};
if (id == llvm::omp::Clause::OMPC_atomic_default_mem_order) {
- parser::MessageFormattedText txt(
- "REQUIRES directive with '%s' clause found lexically after atomic operation without a memory order clause"_err_en_US,
- parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(id)));
- parser::Message message(clause.source, txt);
- message.Attach(visitedAtomicSource_, "Previous atomic construct"_en_US);
- context_.Say(std::move(message));
+ if (!visitedAtomicSource_.empty()) {
+ parser::MessageFormattedText txt(
+ "REQUIRES directive with '%s' clause found lexically after atomic operation without a memory order clause"_err_en_US,
+ parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(id)));
+ parser::Message message(clause.source, txt);
+ message.Attach(visitedAtomicSource_, "Previous atomic construct"_en_US);
+ context_.Say(std::move(message));
+ }
+ } else {
+ bool hasArgument{common::visit(
+ [&](auto &&s) {
+ using TypeS = llvm::remove_cvref_t<decltype(s)>;
+ if constexpr ( //
+ std::is_same_v<TypeS, parser::OmpClause::DynamicAllocators> ||
+ std::is_same_v<TypeS, parser::OmpClause::ReverseOffload> ||
+ std::is_same_v<TypeS, parser::OmpClause::SelfMaps> ||
+ std::is_same_v<TypeS, parser::OmpClause::UnifiedAddress> ||
+ std::is_same_v<TypeS, parser::OmpClause::UnifiedSharedMemory>) {
+ return s.v.has_value();
+ } else {
+ return false;
+ }
+ },
+ clause.u)};
+ if (version < 60 && hasArgument) {
+ context_.Say(clause.source,
+ "An argument to %s is an %s feature, %s"_warn_en_US,
+ parser::ToUpperCaseLetters(
+ llvm::omp::getOpenMPClauseName(clause.Id())),
+ ThisVersion(60), TryVersion(60));
+ }
}
}
}
@@ -1540,9 +1563,8 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) {
void OmpStructureChecker::CheckAlignValue(const parser::OmpClause &clause) {
if (auto *align{std::get_if<parser::OmpClause::Align>(&clause.u)}) {
- if (const auto &v{GetIntValue(align->v)}; !v || *v <= 0) {
- context_.Say(clause.source,
- "The alignment value should be a constant positive integer"_err_en_US);
+ if (const auto &v{GetIntValue(align->v)}; v && *v <= 0) {
+ context_.Say(clause.source, "The alignment should be positive"_err_en_US);
}
}
}
@@ -2336,7 +2358,7 @@ private:
}
if (auto &repl{std::get<parser::OmpClause::Replayable>(clause.u).v}) {
// Scalar<Logical<Constant<indirection<Expr>>>>
- const parser::Expr &parserExpr{repl->v.thing.thing.thing.value()};
+ const auto &parserExpr{parser::UnwrapRef<parser::Expr>(repl)};
if (auto &&expr{GetEvaluateExpr(parserExpr)}) {
return GetLogicalValue(*expr).value_or(true);
}
@@ -2350,7 +2372,7 @@ private:
bool isTransparent{true};
if (auto &transp{std::get<parser::OmpClause::Transparent>(clause.u).v}) {
// Scalar<Integer<indirection<Expr>>>
- const parser::Expr &parserExpr{transp->v.thing.thing.value()};
+ const auto &parserExpr{parser::UnwrapRef<parser::Expr>(transp)};
if (auto &&expr{GetEvaluateExpr(parserExpr)}) {
// If the argument is omp_not_impex (defined as 0), then
// the task is not transparent, otherwise it is.
@@ -2389,8 +2411,8 @@ private:
}
}
// Scalar<Logical<indirection<Expr>>>
- auto &parserExpr{
- std::get<parser::ScalarLogicalExpr>(ifc.v.t).thing.thing.value()};
+ const auto &parserExpr{parser::UnwrapRef<parser::Expr>(
+ std::get<parser::ScalarLogicalExpr>(ifc.v.t))};
if (auto &&expr{GetEvaluateExpr(parserExpr)}) {
// If the value is known to be false, an undeferred task will be
// generated.
diff --git a/flang/lib/Semantics/data-to-inits.cpp b/flang/lib/Semantics/data-to-inits.cpp
index 1e46dab..bbf3b28 100644
--- a/flang/lib/Semantics/data-to-inits.cpp
+++ b/flang/lib/Semantics/data-to-inits.cpp
@@ -179,13 +179,14 @@ bool DataInitializationCompiler<DSV>::Scan(
template <typename DSV>
bool DataInitializationCompiler<DSV>::Scan(const parser::DataImpliedDo &ido) {
const auto &bounds{std::get<parser::DataImpliedDo::Bounds>(ido.t)};
- auto name{bounds.name.thing.thing};
- const auto *lowerExpr{
- GetExpr(exprAnalyzer_.context(), bounds.lower.thing.thing)};
- const auto *upperExpr{
- GetExpr(exprAnalyzer_.context(), bounds.upper.thing.thing)};
+ const auto &name{parser::UnwrapRef<parser::Name>(bounds.name)};
+ const auto *lowerExpr{GetExpr(
+ exprAnalyzer_.context(), parser::UnwrapRef<parser::Expr>(bounds.lower))};
+ const auto *upperExpr{GetExpr(
+ exprAnalyzer_.context(), parser::UnwrapRef<parser::Expr>(bounds.upper))};
const auto *stepExpr{bounds.step
- ? GetExpr(exprAnalyzer_.context(), bounds.step->thing.thing)
+ ? GetExpr(exprAnalyzer_.context(),
+ parser::UnwrapRef<parser::Expr>(bounds.step))
: nullptr};
if (lowerExpr && upperExpr) {
// Fold the bounds expressions (again) in case any of them depend
@@ -240,7 +241,9 @@ bool DataInitializationCompiler<DSV>::Scan(
return common::visit(
common::visitors{
[&](const parser::Scalar<common::Indirection<parser::Designator>>
- &var) { return Scan(var.thing.value()); },
+ &var) {
+ return Scan(parser::UnwrapRef<parser::Designator>(var));
+ },
[&](const common::Indirection<parser::DataImpliedDo> &ido) {
return Scan(ido.value());
},
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 2feec98..4aeb9a4 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -176,8 +176,8 @@ public:
// Find and return a user-defined operator or report an error.
// The provided message is used if there is no such operator.
- MaybeExpr TryDefinedOp(
- const char *, parser::MessageFixedText, bool isUserOp = false);
+ MaybeExpr TryDefinedOp(const char *, parser::MessageFixedText,
+ bool isUserOp = false, bool checkForNullPointer = true);
template <typename E>
MaybeExpr TryDefinedOp(E opr, parser::MessageFixedText msg) {
return TryDefinedOp(
@@ -211,7 +211,8 @@ private:
void SayNoMatch(
const std::string &, bool isAssignment = false, bool isAmbiguous = false);
std::string TypeAsFortran(std::size_t);
- bool AnyUntypedOrMissingOperand() const;
+ bool AnyUntypedOperand() const;
+ bool AnyMissingOperand() const;
ExpressionAnalyzer &context_;
ActualArguments actuals_;
@@ -1954,9 +1955,10 @@ void ArrayConstructorContext::Add(const parser::AcImpliedDo &impliedDo) {
const auto &control{std::get<parser::AcImpliedDoControl>(impliedDo.t)};
const auto &bounds{std::get<parser::AcImpliedDoControl::Bounds>(control.t)};
exprAnalyzer_.Analyze(bounds.name);
- parser::CharBlock name{bounds.name.thing.thing.source};
+ const auto &parsedName{parser::UnwrapRef<parser::Name>(bounds.name)};
+ parser::CharBlock name{parsedName.source};
int kind{ImpliedDoIntType::kind};
- if (const Symbol * symbol{bounds.name.thing.thing.symbol}) {
+ if (const Symbol *symbol{parsedName.symbol}) {
if (auto dynamicType{DynamicType::From(symbol)}) {
if (dynamicType->category() == TypeCategory::Integer) {
kind = dynamicType->kind();
@@ -1981,7 +1983,7 @@ void ArrayConstructorContext::Add(const parser::AcImpliedDo &impliedDo) {
auto cUpper{ToInt64(upper)};
auto cStride{ToInt64(stride)};
if (!(messageDisplayedSet_ & 0x10) && cStride && *cStride == 0) {
- exprAnalyzer_.SayAt(bounds.step.value().thing.thing.value().source,
+ exprAnalyzer_.SayAt(parser::UnwrapRef<parser::Expr>(bounds.step).source,
"The stride of an implied DO loop must not be zero"_err_en_US);
messageDisplayedSet_ |= 0x10;
}
@@ -2526,7 +2528,7 @@ static const Symbol *GetBindingResolution(
auto ExpressionAnalyzer::AnalyzeProcedureComponentRef(
const parser::ProcComponentRef &pcr, ActualArguments &&arguments,
bool isSubroutine) -> std::optional<CalleeAndArguments> {
- const parser::StructureComponent &sc{pcr.v.thing};
+ const auto &sc{parser::UnwrapRef<parser::StructureComponent>(pcr)};
if (MaybeExpr base{Analyze(sc.base)}) {
if (const Symbol *sym{sc.component.symbol}) {
if (context_.HasError(sym)) {
@@ -3695,11 +3697,12 @@ std::optional<characteristics::Procedure> ExpressionAnalyzer::CheckCall(
MaybeExpr ExpressionAnalyzer::Analyze(const parser::Expr::Parentheses &x) {
if (MaybeExpr operand{Analyze(x.v.value())}) {
- if (const semantics::Symbol *symbol{GetLastSymbol(*operand)}) {
+ if (IsNullPointerOrAllocatable(&*operand)) {
+ Say("NULL() may not be parenthesized"_err_en_US);
+ } else if (const semantics::Symbol *symbol{GetLastSymbol(*operand)}) {
if (const semantics::Symbol *result{FindFunctionResult(*symbol)}) {
if (semantics::IsProcedurePointer(*result)) {
- Say("A function reference that returns a procedure "
- "pointer may not be parenthesized"_err_en_US); // C1003
+ Say("A function reference that returns a procedure pointer may not be parenthesized"_err_en_US); // C1003
}
}
}
@@ -3788,7 +3791,7 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::Expr::DefinedUnary &x) {
ArgumentAnalyzer analyzer{*this, name.source};
analyzer.Analyze(std::get<1>(x.t));
return analyzer.TryDefinedOp(name.source.ToString().c_str(),
- "No operator %s defined for %s"_err_en_US, true);
+ "No operator %s defined for %s"_err_en_US, /*isUserOp=*/true);
}
// Binary (dyadic) operations
@@ -3997,7 +4000,9 @@ static bool CheckFuncRefToArrayElement(semantics::SemanticsContext &context,
auto &proc{std::get<parser::ProcedureDesignator>(funcRef.v.t)};
const auto *name{std::get_if<parser::Name>(&proc.u)};
if (!name) {
- name = &std::get<parser::ProcComponentRef>(proc.u).v.thing.component;
+ name = &parser::UnwrapRef<parser::StructureComponent>(
+ std::get<parser::ProcComponentRef>(proc.u))
+ .component;
}
if (!name->symbol) {
return false;
@@ -4047,14 +4052,16 @@ static void FixMisparsedFunctionReference(
}
}
auto &proc{std::get<parser::ProcedureDesignator>(funcRef.v.t)};
- if (Symbol *origSymbol{
- common::visit(common::visitors{
- [&](parser::Name &name) { return name.symbol; },
- [&](parser::ProcComponentRef &pcr) {
- return pcr.v.thing.component.symbol;
- },
- },
- proc.u)}) {
+ if (Symbol *
+ origSymbol{common::visit(
+ common::visitors{
+ [&](parser::Name &name) { return name.symbol; },
+ [&](parser::ProcComponentRef &pcr) {
+ return parser::UnwrapRef<parser::StructureComponent>(pcr)
+ .component.symbol;
+ },
+ },
+ proc.u)}) {
Symbol &symbol{origSymbol->GetUltimate()};
if (symbol.has<semantics::ObjectEntityDetails>() ||
symbol.has<semantics::AssocEntityDetails>()) {
@@ -4176,15 +4183,23 @@ MaybeExpr ExpressionAnalyzer::IterativelyAnalyzeSubexpressions(
} while (!queue.empty());
// Analyze the collected subexpressions in bottom-up order.
// On an error, bail out and leave partial results in place.
- MaybeExpr result;
- for (auto riter{finish.rbegin()}; riter != finish.rend(); ++riter) {
- const parser::Expr &expr{**riter};
- result = ExprOrVariable(expr, expr.source);
- if (!result) {
- return result;
+ if (finish.size() == 1) {
+ const parser::Expr &expr{DEREF(finish.front())};
+ return ExprOrVariable(expr, expr.source);
+ } else {
+ // Checking for NULL() operands is deferred to operation analysis so
+ // that they can be accepted by defined operators.
+ auto restorer{AllowNullPointer()};
+ MaybeExpr result;
+ for (auto riter{finish.rbegin()}; riter != finish.rend(); ++riter) {
+ const parser::Expr &expr{**riter};
+ result = ExprOrVariable(expr, expr.source);
+ if (!result) {
+ return result;
+ }
}
+ return result; // last value was from analysis of "top"
}
- return result; // last value was from analysis of "top"
}
MaybeExpr ExpressionAnalyzer::Analyze(const parser::Expr &expr) {
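
The restructured loop above analyzes the collected subexpressions in reverse (bottom-up) order, so every operand is analyzed before the expression that uses it, bailing out on the first failure. A toy sketch of that traversal shape, assuming a hypothetical `Node` type and analysis callback:

    #include <optional>
    #include <vector>

    struct Node {
      std::vector<Node *> children;
    };

    // Collect every subexpression into `finish`, then analyze in reverse so
    // children are processed before their parents -- the shape used by
    // IterativelyAnalyzeSubexpressions above.
    std::optional<int> AnalyzeBottomUp(Node &top, std::optional<int> (*analyze)(Node &)) {
      std::vector<Node *> queue{&top}, finish;
      while (!queue.empty()) {
        Node *n{queue.back()};
        queue.pop_back();
        finish.push_back(n);
        for (Node *child : n->children)
          queue.push_back(child);
      }
      std::optional<int> result;
      for (auto riter{finish.rbegin()}; riter != finish.rend(); ++riter) {
        result = analyze(**riter);
        if (!result)
          return result; // bail out, leaving partial results in place
      }
      return result; // last value came from analyzing `top`
    }
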
@@ -4681,7 +4696,7 @@ bool ArgumentAnalyzer::AnyCUDADeviceData() const {
// attribute.
bool ArgumentAnalyzer::HasDeviceDefinedIntrinsicOpOverride(
const char *opr) const {
- if (AnyCUDADeviceData() && !AnyUntypedOrMissingOperand()) {
+ if (AnyCUDADeviceData() && !AnyUntypedOperand() && !AnyMissingOperand()) {
std::string oprNameString{"operator("s + opr + ')'};
parser::CharBlock oprName{oprNameString};
parser::Messages buffer;
@@ -4709,9 +4724,9 @@ bool ArgumentAnalyzer::HasDeviceDefinedIntrinsicOpOverride(
return false;
}
-MaybeExpr ArgumentAnalyzer::TryDefinedOp(
- const char *opr, parser::MessageFixedText error, bool isUserOp) {
- if (AnyUntypedOrMissingOperand()) {
+MaybeExpr ArgumentAnalyzer::TryDefinedOp(const char *opr,
+ parser::MessageFixedText error, bool isUserOp, bool checkForNullPointer) {
+ if (AnyMissingOperand()) {
context_.Say(error, ToUpperCase(opr), TypeAsFortran(0), TypeAsFortran(1));
return std::nullopt;
}
@@ -4790,7 +4805,9 @@ MaybeExpr ArgumentAnalyzer::TryDefinedOp(
context_.Say(
"Operands of %s are not conformable; have rank %d and rank %d"_err_en_US,
ToUpperCase(opr), actuals_[0]->Rank(), actuals_[1]->Rank());
- } else if (CheckForNullPointer() && CheckForAssumedRank()) {
+ } else if (!CheckForAssumedRank()) {
+ } else if (checkForNullPointer && !CheckForNullPointer()) {
+ } else { // use the supplied error
context_.Say(error, ToUpperCase(opr), TypeAsFortran(0), TypeAsFortran(1));
}
return result;
@@ -4808,15 +4825,16 @@ MaybeExpr ArgumentAnalyzer::TryDefinedOp(
for (std::size_t i{0}; i < oprs.size(); ++i) {
parser::Messages buffer;
auto restorer{context_.GetContextualMessages().SetMessages(buffer)};
- if (MaybeExpr thisResult{TryDefinedOp(oprs[i], error)}) {
+ if (MaybeExpr thisResult{TryDefinedOp(oprs[i], error, /*isUserOp=*/false,
+ /*checkForNullPointer=*/false)}) {
result = std::move(thisResult);
hit.push_back(oprs[i]);
hitBuffer = std::move(buffer);
}
}
}
- if (hit.empty()) { // for the error
- result = TryDefinedOp(oprs[0], error);
+ if (hit.empty()) { // run TryDefinedOp() again just to emit errors
+ CHECK(!TryDefinedOp(oprs[0], error).has_value());
} else if (hit.size() > 1) {
context_.Say(
"Matching accessible definitions were found with %zd variant spellings of the generic operator ('%s', '%s')"_err_en_US,
@@ -5232,10 +5250,19 @@ std::string ArgumentAnalyzer::TypeAsFortran(std::size_t i) {
}
}
-bool ArgumentAnalyzer::AnyUntypedOrMissingOperand() const {
+bool ArgumentAnalyzer::AnyUntypedOperand() const {
+ for (const auto &actual : actuals_) {
+ if (actual && !actual->GetType() &&
+ !IsBareNullPointer(actual->UnwrapExpr())) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool ArgumentAnalyzer::AnyMissingOperand() const {
for (const auto &actual : actuals_) {
- if (!actual ||
- (!actual->GetType() && !IsBareNullPointer(actual->UnwrapExpr()))) {
+ if (!actual) {
return true;
}
}
@@ -5268,9 +5295,9 @@ void ExprChecker::Post(const parser::DataStmtObject &obj) {
bool ExprChecker::Pre(const parser::DataImpliedDo &ido) {
parser::Walk(std::get<parser::DataImpliedDo::Bounds>(ido.t), *this);
const auto &bounds{std::get<parser::DataImpliedDo::Bounds>(ido.t)};
- auto name{bounds.name.thing.thing};
+ const auto &name{parser::UnwrapRef<parser::Name>(bounds.name)};
int kind{evaluate::ResultType<evaluate::ImpliedDoIndex>::kind};
- if (const auto dynamicType{evaluate::DynamicType::From(*name.symbol)}) {
+ if (const auto dynamicType{evaluate::DynamicType::From(DEREF(name.symbol))}) {
if (dynamicType->category() == TypeCategory::Integer) {
kind = dynamicType->kind();
}
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 1228493..7067ed3 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -557,6 +557,21 @@ public:
using RequiresClauses = WithOmpDeclarative::RequiresClauses;
PushContext(x.source, llvm::omp::Directive::OMPD_requires);
+ auto getArgument{[&](auto &&maybeClause) {
+ if (maybeClause) {
+ // Scalar<Logical<Constant<common::Indirection<Expr>>>>
+ auto &parserExpr{maybeClause->v.thing.thing.thing.value()};
+ evaluate::ExpressionAnalyzer ea{context_};
+ if (auto &&maybeExpr{ea.Analyze(parserExpr)}) {
+ if (auto v{omp::GetLogicalValue(*maybeExpr)}) {
+ return *v;
+ }
+ }
+ }
+ // If the argument is missing, it is assumed to be true.
+ return true;
+ }};
+
// Gather information from the clauses.
RequiresClauses reqs;
const common::OmpMemoryOrderType *memOrder{nullptr};
@@ -573,16 +588,19 @@ public:
if constexpr ( //
std::is_same_v<TypeS, OmpClause::DynamicAllocators> ||
std::is_same_v<TypeS, OmpClause::ReverseOffload> ||
+ std::is_same_v<TypeS, OmpClause::SelfMaps> ||
std::is_same_v<TypeS, OmpClause::UnifiedAddress> ||
std::is_same_v<TypeS, OmpClause::UnifiedSharedMemory>) {
- return RequiresClauses{clause.Id()};
- } else {
- return RequiresClauses{};
+ if (getArgument(s.v)) {
+ return RequiresClauses{clause.Id()};
+ }
}
+ return RequiresClauses{};
},
},
clause.u);
}
+
// Merge clauses into parents' symbols details.
AddOmpRequiresToScope(currScope(), &reqs, memOrder);
return true;
diff --git a/flang/lib/Semantics/resolve-names-utils.cpp b/flang/lib/Semantics/resolve-names-utils.cpp
index 742bb74..ac67799 100644
--- a/flang/lib/Semantics/resolve-names-utils.cpp
+++ b/flang/lib/Semantics/resolve-names-utils.cpp
@@ -492,12 +492,14 @@ bool EquivalenceSets::CheckDesignator(const parser::Designator &designator) {
const auto &range{std::get<parser::SubstringRange>(x.t)};
bool ok{CheckDataRef(designator.source, dataRef)};
if (const auto &lb{std::get<0>(range.t)}) {
- ok &= CheckSubstringBound(lb->thing.thing.value(), true);
+ ok &= CheckSubstringBound(
+ parser::UnwrapRef<parser::Expr>(lb), true);
} else {
currObject_.substringStart = 1;
}
if (const auto &ub{std::get<1>(range.t)}) {
- ok &= CheckSubstringBound(ub->thing.thing.value(), false);
+ ok &= CheckSubstringBound(
+ parser::UnwrapRef<parser::Expr>(ub), false);
}
return ok;
},
@@ -528,7 +530,8 @@ bool EquivalenceSets::CheckDataRef(
return false;
},
[&](const parser::IntExpr &y) {
- return CheckArrayBound(y.thing.value());
+ return CheckArrayBound(
+ parser::UnwrapRef<parser::Expr>(y));
},
},
subscript.u);
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index ae0ff9ca..699de41 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1140,7 +1140,7 @@ protected:
std::optional<SourceName> BeginCheckOnIndexUseInOwnBounds(
const parser::DoVariable &name) {
std::optional<SourceName> result{checkIndexUseInOwnBounds_};
- checkIndexUseInOwnBounds_ = name.thing.thing.source;
+ checkIndexUseInOwnBounds_ = parser::UnwrapRef<parser::Name>(name).source;
return result;
}
void EndCheckOnIndexUseInOwnBounds(const std::optional<SourceName> &restore) {
@@ -2130,7 +2130,7 @@ public:
void Post(const parser::SubstringInquiry &);
template <typename A, typename B>
void Post(const parser::LoopBounds<A, B> &x) {
- ResolveName(*parser::Unwrap<parser::Name>(x.name));
+ ResolveName(parser::UnwrapRef<parser::Name>(x.name));
}
void Post(const parser::ProcComponentRef &);
bool Pre(const parser::FunctionReference &);
@@ -2560,7 +2560,7 @@ KindExpr DeclTypeSpecVisitor::GetKindParamExpr(
CHECK(!state_.originalKindParameter);
// Save a pointer to the KIND= expression in the parse tree
// in case we need to reanalyze it during PDT instantiation.
- state_.originalKindParameter = &expr->thing.thing.thing.value();
+ state_.originalKindParameter = parser::Unwrap<parser::Expr>(expr);
}
}
// Inhibit some errors now that will be caught later during instantiations.
@@ -5649,6 +5649,7 @@ bool DeclarationVisitor::Pre(const parser::NamedConstantDef &x) {
if (details->init() || symbol.test(Symbol::Flag::InDataStmt)) {
Say(name, "Named constant '%s' already has a value"_err_en_US);
}
+ parser::CharBlock at{parser::UnwrapRef<parser::Expr>(expr).source};
if (inOldStyleParameterStmt_) {
// non-standard extension PARAMETER statement (no parentheses)
Walk(expr);
@@ -5657,7 +5658,6 @@ bool DeclarationVisitor::Pre(const parser::NamedConstantDef &x) {
SayWithDecl(name, symbol,
"Alternative style PARAMETER '%s' must not already have an explicit type"_err_en_US);
} else if (folded) {
- auto at{expr.thing.value().source};
if (evaluate::IsActuallyConstant(*folded)) {
if (const auto *type{currScope().GetType(*folded)}) {
if (type->IsPolymorphic()) {
@@ -5682,8 +5682,7 @@ bool DeclarationVisitor::Pre(const parser::NamedConstantDef &x) {
// standard-conforming PARAMETER statement (with parentheses)
ApplyImplicitRules(symbol);
Walk(expr);
- if (auto converted{EvaluateNonPointerInitializer(
- symbol, expr, expr.thing.value().source)}) {
+ if (auto converted{EvaluateNonPointerInitializer(symbol, expr, at)}) {
details->set_init(std::move(*converted));
}
}
@@ -6149,7 +6148,7 @@ bool DeclarationVisitor::Pre(const parser::KindParam &x) {
if (const auto *kind{std::get_if<
parser::Scalar<parser::Integer<parser::Constant<parser::Name>>>>(
&x.u)}) {
- const parser::Name &name{kind->thing.thing.thing};
+ const auto &name{parser::UnwrapRef<parser::Name>(kind)};
if (!FindSymbol(name)) {
Say(name, "Parameter '%s' not found"_err_en_US);
}
@@ -7460,7 +7459,7 @@ void DeclarationVisitor::DeclareLocalEntity(
Symbol *DeclarationVisitor::DeclareStatementEntity(
const parser::DoVariable &doVar,
const std::optional<parser::IntegerTypeSpec> &type) {
- const parser::Name &name{doVar.thing.thing};
+ const auto &name{parser::UnwrapRef<parser::Name>(doVar)};
const DeclTypeSpec *declTypeSpec{nullptr};
if (auto *prev{FindSymbol(name)}) {
if (prev->owner() == currScope()) {
@@ -7893,13 +7892,14 @@ bool ConstructVisitor::Pre(const parser::DataIDoObject &x) {
common::visit(
common::visitors{
[&](const parser::Scalar<Indirection<parser::Designator>> &y) {
- Walk(y.thing.value());
- const parser::Name &first{parser::GetFirstName(y.thing.value())};
+ const auto &designator{parser::UnwrapRef<parser::Designator>(y)};
+ Walk(designator);
+ const parser::Name &first{parser::GetFirstName(designator)};
if (first.symbol) {
first.symbol->set(Symbol::Flag::InDataStmt);
}
},
- [&](const Indirection<parser::DataImpliedDo> &y) { Walk(y.value()); },
+ [&](const Indirection<parser::DataImpliedDo> &y) { Walk(y); },
},
x.u);
return false;
@@ -8582,8 +8582,7 @@ public:
void Post(const parser::WriteStmt &) { inAsyncIO_ = false; }
void Post(const parser::IoControlSpec::Size &size) {
if (const auto *designator{
- std::get_if<common::Indirection<parser::Designator>>(
- &size.v.thing.thing.u)}) {
+ parser::Unwrap<common::Indirection<parser::Designator>>(size)}) {
NoteAsyncIODesignator(designator->value());
}
}
@@ -9175,16 +9174,17 @@ bool DeclarationVisitor::CheckNonPointerInitialization(
}
void DeclarationVisitor::NonPointerInitialization(
- const parser::Name &name, const parser::ConstantExpr &expr) {
+ const parser::Name &name, const parser::ConstantExpr &constExpr) {
if (CheckNonPointerInitialization(
name, /*inLegacyDataInitialization=*/false)) {
Symbol &ultimate{name.symbol->GetUltimate()};
auto &details{ultimate.get<ObjectEntityDetails>()};
+ const auto &expr{parser::UnwrapRef<parser::Expr>(constExpr)};
if (ultimate.owner().IsParameterizedDerivedType()) {
// Save the expression for per-instantiation analysis.
- details.set_unanalyzedPDTComponentInit(&expr.thing.value());
+ details.set_unanalyzedPDTComponentInit(&expr);
} else if (MaybeExpr folded{EvaluateNonPointerInitializer(
- ultimate, expr, expr.thing.value().source)}) {
+ ultimate, constExpr, expr.source)}) {
details.set_init(std::move(*folded));
ultimate.set(Symbol::Flag::InDataStmt, false);
}
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index aef926b..b0b8d09 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -15,6 +15,13 @@ module mod1
real(kind=8), device, allocatable, dimension(:) :: p
+ interface
+ function __sum(a_d) result(res_h)
+ integer(4), managed, intent(in) :: a_d(:,:,:,:)
+ integer(4), allocatable, managed :: res_h(:,:,:)
+ end function
+ end interface
+
contains
function dev1(a)
integer, device :: a(:)
@@ -522,3 +529,33 @@ end subroutine
! CHECK: hlfir.yield_element %[[CONV]] : f32
! CHECK: }
! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<10x20x30xf32>, !fir.ref<!fir.array<10x20x30xf32>>
+
+subroutine sub28(N1,N2,N3,N4)
+ use mod1
+ integer(4), managed :: a(N1,N2,N3,N4)
+ integer(4), managed :: bres(N1,N2,N3)
+ bres = __sum(a)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub28
+! CHECK: fir.call @_QP__sum
+! CHECK-NOT: cuf.data_transfer
+! CHECK: hlfir.assign
+! CHECK-NOT: cuf.data_transfer
+
+! Data transfer with conversion and a more complex elemental.
+! Check that the data transfer is placed before the elemental op.
+subroutine sub29()
+ real(2), device, allocatable :: a(:)
+ real(4), allocatable :: ha(:)
+ allocate(a(10))
+ allocate(ha(10))
+ ha = a
+ deallocate(a)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub29()
+! CHECK: %[[TMP:.*]] = fir.allocmem !fir.array<?xf16>, %24#1 {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[TMP_BUFFER:.*]]:2 = hlfir.declare %[[TMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf16>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf16>>, !fir.heap<!fir.array<?xf16>>)
+! CHECK: cuf.data_transfer %{{.*}} to %[[TMP_BUFFER]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.box<!fir.heap<!fir.array<?xf16>>>, !fir.box<!fir.array<?xf16>>
+! CHECK: hlfir.elemental
diff --git a/flang/test/Lower/CUDA/cuda-managed.cuf b/flang/test/Lower/CUDA/cuda-managed.cuf
index e14bd84..69c9ecf 100644
--- a/flang/test/Lower/CUDA/cuda-managed.cuf
+++ b/flang/test/Lower/CUDA/cuda-managed.cuf
@@ -1,18 +1,14 @@
! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+! Check for implicit data transfer of managed variable
+
subroutine testr2(N1,N2)
real(4), managed :: ai4(N1,N2)
real(4), allocatable :: bRefi4(:)
integer :: i1, i2
- do i2 = 1, N2
- do i1 = 1, N1
- ai4(i1,i2) = i1 + N1*(i2-1)
- enddo
- enddo
-
- allocate(bRefi4 (N1))
+ allocate(bRefi4(N1))
do i1 = 1, N1
bRefi4(i1) = (ai4(i1,1)+ai4(i1,N2))*N2/2
enddo
@@ -20,8 +16,8 @@ subroutine testr2(N1,N2)
end subroutine
-!CHECK-LABEL: func.func @_QPtestr2
-!CHECK: %[[ALLOC:.*]] = cuf.alloc !fir.array<?x?xf32>, %{{.*}}, %{{.*}} : index, index {bindc_name = "ai4", data_attr = #cuf.cuda<managed>, uniq_name = "_QFtestr2Eai4"} -> !fir.ref<!fir.array<?x?xf32>>
-!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOC]](%{{.*}}) {data_attr = #cuf.cuda<managed>, uniq_name = "_QFtestr2Eai4"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
-!CHECK: %[[DEST:.*]] = hlfir.designate %[[DECLARE]]#0 (%{{.*}}, %{{.*}}) : (!fir.box<!fir.array<?x?xf32>>, i64, i64) -> !fir.ref<f32>
-!CHECK: cuf.data_transfer %{{.*}}#0 to %[[DEST]] {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<f32>, !fir.ref<f32>
+! CHECK-LABEL: func.func @_QPtestr2
+! CHECK: %[[MANAGED:.*]]:2 = hlfir.declare %22(%23) {data_attr = #cuf.cuda<managed>, uniq_name = "_QFtestr2Eai4"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
+! CHECK: %[[TMP:.*]] = fir.allocmem !fir.array<?x?xf32>, %16, %21 {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[TMP_DECL:.*]]:2 = hlfir.declare %[[TMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.heap<!fir.array<?x?xf32>>)
+! CHECK: cuf.data_transfer %[[MANAGED]]#1 to %[[TMP_DECL]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>
diff --git a/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-1.f90 b/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-1.f90
new file mode 100644
index 0000000..eb8c4b4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-1.f90
@@ -0,0 +1,11 @@
+!RUN: bbc %openmp_flags -fopenmp-version=50 -emit-hlfir %s -o - | FileCheck %s
+!RUN: %flang_fc1 -emit-hlfir %openmp_flags -fopenmp-version=50 %s -o - | FileCheck %s
+
+!CHECK: omp.atomic.read %{{[0-9]+}}#0 = %{{[0-9]+}}#0 memory_order(acquire)
+
+subroutine f00(x, v)
+ integer :: x, v
+ !$omp requires atomic_default_mem_order(acq_rel)
+ !$omp atomic read
+ v = x
+end
diff --git a/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-2.f90 b/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-2.f90
new file mode 100644
index 0000000..d309a21
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-2.f90
@@ -0,0 +1,11 @@
+!RUN: bbc %openmp_flags -fopenmp-version=50 -emit-hlfir %s -o - | FileCheck %s
+!RUN: %flang_fc1 -emit-hlfir %openmp_flags -fopenmp-version=50 %s -o - | FileCheck %s
+
+!CHECK: omp.atomic.write %{{[0-9]+}}#0 = %{{[0-9]+}} memory_order(relaxed)
+
+subroutine f02(x, v)
+ integer :: x, v
+ !$omp requires atomic_default_mem_order(acquire)
+ !$omp atomic write
+ x = v
+end
diff --git a/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-3.f90 b/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-3.f90
new file mode 100644
index 0000000..bc7529c
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-3.f90
@@ -0,0 +1,11 @@
+!RUN: bbc %openmp_flags -fopenmp-version=50 -emit-hlfir %s -o - | FileCheck %s
+!RUN: %flang_fc1 -emit-hlfir %openmp_flags -fopenmp-version=50 %s -o - | FileCheck %s
+
+!CHECK: omp.atomic.update memory_order(relaxed)
+
+subroutine f05(x, v)
+ integer :: x, v
+ !$omp requires atomic_default_mem_order(acq_rel)
+ !$omp atomic update
+ x = x + 1
+end
diff --git a/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-4.f90 b/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-4.f90
new file mode 100644
index 0000000..5cffb1a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-requires-conflict-v50-4.f90
@@ -0,0 +1,13 @@
+!RUN: bbc %openmp_flags -fopenmp-version=50 -emit-hlfir %s -o - | FileCheck %s
+!RUN: %flang_fc1 -emit-hlfir %openmp_flags -fopenmp-version=50 %s -o - | FileCheck %s
+
+!CHECK: omp.atomic.capture memory_order(relaxed)
+
+subroutine f06(x, v)
+ integer :: x, v
+ !$omp requires atomic_default_mem_order(acquire)
+ !$omp atomic update capture
+ v = x
+ x = x + 1
+ !$omp end atomic
+end
diff --git a/flang/test/Lower/OpenMP/atomic-requires-conflict-v60-1.f90 b/flang/test/Lower/OpenMP/atomic-requires-conflict-v60-1.f90
new file mode 100644
index 0000000..55f2197
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-requires-conflict-v60-1.f90
@@ -0,0 +1,11 @@
+!RUN: bbc %openmp_flags -fopenmp-version=60 -emit-hlfir %s -o - | FileCheck %s
+!RUN: %flang_fc1 -emit-hlfir %openmp_flags -fopenmp-version=60 %s -o - | FileCheck %s
+
+!CHECK: omp.atomic.read %{{[0-9]+}}#0 = %{{[0-9]+}}#0 memory_order(relaxed)
+
+subroutine f01(x, v)
+ integer :: x, v
+ !$omp requires atomic_default_mem_order(release)
+ !$omp atomic read
+ v = x
+end
diff --git a/flang/test/Lower/OpenMP/atomic-requires-conflict-v60-2.f90 b/flang/test/Lower/OpenMP/atomic-requires-conflict-v60-2.f90
new file mode 100644
index 0000000..ca04879
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-requires-conflict-v60-2.f90
@@ -0,0 +1,11 @@
+!RUN: bbc %openmp_flags -fopenmp-version=60 -emit-hlfir %s -o - | FileCheck %s
+!RUN: %flang_fc1 -emit-hlfir %openmp_flags -fopenmp-version=60 %s -o - | FileCheck %s
+
+!CHECK: omp.atomic.write %{{[0-9]+}}#0 = %{{[0-9]+}} memory_order(relaxed)
+
+subroutine f02(x, v)
+ integer :: x, v
+ !$omp requires atomic_default_mem_order(acquire)
+ !$omp atomic write
+ x = v
+end
diff --git a/flang/test/Parser/OpenMP/allocate-align-tree.f90 b/flang/test/Parser/OpenMP/allocate-align-tree.f90
index 8cb009d..0d247cd 100644
--- a/flang/test/Parser/OpenMP/allocate-align-tree.f90
+++ b/flang/test/Parser/OpenMP/allocate-align-tree.f90
@@ -26,14 +26,14 @@ end program allocate_align_tree
!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate
!CHECK-NEXT: | | | Verbatim
!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray'
-!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Expr = '32_4'
+!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4'
!CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '32'
!CHECK-NEXT: | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8'
!CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc'
!CHECK-NEXT: | | | OpenMPDeclarativeAllocate
!CHECK-NEXT: | | | | Verbatim
!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'j'
-!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Expr = '16_4'
+!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4'
!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '16'
!CHECK-NEXT: | | | AllocateStmt
diff --git a/flang/test/Parser/OpenMP/requires.f90 b/flang/test/Parser/OpenMP/requires.f90
index 6cbb06e..8169403 100644
--- a/flang/test/Parser/OpenMP/requires.f90
+++ b/flang/test/Parser/OpenMP/requires.f90
@@ -30,4 +30,18 @@
!PARSE-TREE: | OmpClause -> ReverseOffload
!PARSE-TREE: | Flags = None
+!$omp requires self_maps(.true.) unified_address(.false.)
+
+!UNPARSE: !$OMP REQUIRES SELF_MAPS(.true._4) UNIFIED_ADDRESS(.false._4)
+
+!PARSE-TREE: OpenMPDeclarativeConstruct -> OpenMPRequiresConstruct -> OmpDirectiveSpecification
+!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = requires
+!PARSE-TREE: | OmpClauseList -> OmpClause -> SelfMaps -> OmpSelfMapsClause -> Scalar -> Logical -> Constant -> Expr = '.true._4'
+!PARSE-TREE: | | LiteralConstant -> LogicalLiteralConstant
+!PARSE-TREE: | | | bool = 'true'
+!PARSE-TREE: | OmpClause -> UnifiedAddress -> OmpUnifiedAddressClause -> Scalar -> Logical -> Constant -> Expr = '.false._4'
+!PARSE-TREE: | | LiteralConstant -> LogicalLiteralConstant
+!PARSE-TREE: | | | bool = 'false'
+!PARSE-TREE: | Flags = None
+
end
diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90
index 4967330..88bcd6d 100644
--- a/flang/test/Semantics/OpenMP/allocate-align01.f90
+++ b/flang/test/Semantics/OpenMP/allocate-align01.f90
@@ -11,10 +11,10 @@ program allocate_align_tree
integer :: z, t, xx
t = 2
z = 3
- !ERROR: The alignment value should be a constant positive integer
+ !ERROR: Must be a constant value
!$omp allocate(j) align(xx)
!WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage]
- !ERROR: The alignment value should be a constant positive integer
+ !ERROR: The alignment should be positive
!$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc)
allocate(j(z), xarray(t))
end program allocate_align_tree
diff --git a/flang/test/Semantics/OpenMP/omp-common-fp-lp.f90 b/flang/test/Semantics/OpenMP/omp-common-fp-lp.f90
new file mode 100644
index 0000000..c995aa2
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/omp-common-fp-lp.f90
@@ -0,0 +1,20 @@
+! RUN: %flang_fc1 -fopenmp -fopenmp-version=51 -fsyntax-only %s 2>&1 | FileCheck %s --allow-empty
+! CHECK-NOT: error:
+
+! Regression test for issue #162033.
+! Verify that a named COMMON block can appear in a data-sharing clause together
+! with one of its members in another clause on the same construct. This is valid
+! in OpenMP >= 5.1 because:
+! - A named COMMON in a clause is equivalent to listing all its explicit members
+! - A list item may appear in both FIRSTPRIVATE and LASTPRIVATE on the same directive
+! OpenMP 5.0 explicitly forbade this combination.
+
+subroutine sub1()
+ common /com/ j
+ j = 10
+!$omp parallel do firstprivate(j) lastprivate(/com/)
+ do i = 1, 10
+ j = j + 1
+ end do
+!$omp end parallel do
+end
diff --git a/flang/test/Semantics/OpenMP/requires-modfile.f90 b/flang/test/Semantics/OpenMP/requires-modfile.f90
index 2f06104..52a43c2 100644
--- a/flang/test/Semantics/OpenMP/requires-modfile.f90
+++ b/flang/test/Semantics/OpenMP/requires-modfile.f90
@@ -1,4 +1,4 @@
-!RUN: %python %S/../test_modfile.py %s %flang_fc1 -fopenmp -fopenmp-version=52
+!RUN: %python %S/../test_modfile.py %s %flang_fc1 -fopenmp -fopenmp-version=60
module req
contains
@@ -16,8 +16,16 @@ end module
module user
! The requirements from module req should be propagated to this module.
use req
+ ! This has no effect, and should not be emitted.
+ !$omp requires unified_shared_memory(.false.)
end module
+module fold
+ integer, parameter :: x = 10
+ integer, parameter :: y = 33
+ ! Make sure we can fold this expression to "true".
+ !$omp requires dynamic_allocators(x < y)
+end module
!Expect: req.mod
!module req
@@ -37,3 +45,10 @@ end module
!!$omp requires atomic_default_mem_order(seq_cst)
!!$omp requires reverse_offload
!end
+
+!Expect: fold.mod
+!module fold
+!integer(4),parameter::x=10_4
+!integer(4),parameter::y=33_4
+!!$omp requires dynamic_allocators
+!end
diff --git a/flang/test/Semantics/OpenMP/requires10.f90 b/flang/test/Semantics/OpenMP/requires10.f90
new file mode 100644
index 0000000..9f9832d
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/requires10.f90
@@ -0,0 +1,13 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52
+
+subroutine f00(x)
+ logical :: x
+ !ERROR: An argument to REVERSE_OFFLOAD is an OpenMP v6.0 feature, try -fopenmp-version=60
+ !ERROR: Must be a constant value
+ !$omp requires reverse_offload(x)
+end
+
+subroutine f01
+ !WARNING: An argument to REVERSE_OFFLOAD is an OpenMP v6.0 feature, try -fopenmp-version=60
+ !$omp requires reverse_offload(.true.)
+end
diff --git a/flang/test/Semantics/bug163242.f90 b/flang/test/Semantics/bug163242.f90
new file mode 100644
index 0000000..5e020ae
--- /dev/null
+++ b/flang/test/Semantics/bug163242.f90
@@ -0,0 +1,5 @@
+!RUN: %flang -fc1 -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s
+!CHECK-NOT: error:
+character(0), allocatable :: ch
+allocate(character(-1) :: ch)
+end
diff --git a/flang/test/Semantics/bug163255.f90 b/flang/test/Semantics/bug163255.f90
new file mode 100644
index 0000000..e29322a
--- /dev/null
+++ b/flang/test/Semantics/bug163255.f90
@@ -0,0 +1,21 @@
+!RUN: %flang_fc1 -fdebug-unparse %s | FileCheck %s
+module m
+ type t
+ end type
+ interface operator (==)
+ module procedure equal
+ end interface
+ contains
+ logical function equal(b1, b2)
+ class(t), pointer, intent(in) :: b1, b2
+ equal = associated(b1, b2)
+ end
+end module
+
+program test
+ use m
+ type(t), target :: target
+ class(t), pointer :: p => target
+ !CHECK: IF (equal(p,null(p))) STOP
+ if (p == null(p)) stop
+end
diff --git a/flang/test/Semantics/resolve63.f90 b/flang/test/Semantics/resolve63.f90
index 1cb8a85..0c3df2e 100644
--- a/flang/test/Semantics/resolve63.f90
+++ b/flang/test/Semantics/resolve63.f90
@@ -165,13 +165,12 @@ contains
logical :: l
complex :: z
y = y + z'1' !OK
- !ERROR: Operands of + must be numeric; have untyped and COMPLEX(4)
+ !ERROR: No intrinsic or user-defined OPERATOR(+) matches operand types untyped and COMPLEX(4)
z = z'1' + z
y = +z'1' !OK
!ERROR: Operand of unary - must be numeric; have untyped
y = -z'1'
- !ERROR: Operands of + must be numeric; have LOGICAL(4) and untyped
- y = x + z'1'
+ y = x + z'1' ! matches "add" with conversion of untyped to integer
!ERROR: A NULL() pointer is not allowed as an operand here
l = x /= null()
!ERROR: A NULL() pointer is not allowed as a relational operand
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 2eb0f06..4c36ed8 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -26,13 +26,6 @@ function(_get_compile_options_from_flags output_var)
list(APPEND compile_options "-mavx2")
list(APPEND compile_options "-mfma")
endif()
- # For clang, we will build the math functions with `-fno-math-errno` so that
- # __builtin_fma* will generate the fused-mutliply-add instructions. We
- # don't put the control flag to the public config yet, and see if it makes
- # sense to just enable this flag by default.
- if(LIBC_ADD_FNO_MATH_ERRNO)
- list(APPEND compile_options "-fno-math-errno")
- endif()
endif()
if(ADD_ROUND_OPT_FLAG)
if(LIBC_TARGET_ARCHITECTURE_IS_X86_64)
@@ -102,6 +95,9 @@ function(_get_compile_options_from_config output_var)
if(LIBC_CONF_MATH_OPTIMIZATIONS)
list(APPEND config_options "-DLIBC_MATH=${LIBC_CONF_MATH_OPTIMIZATIONS}")
+ if(LIBC_CONF_MATH_OPTIMIZATIONS MATCHES "LIBC_MATH_NO_ERRNO")
+ list(APPEND config_options "-fno-math-errno")
+ endif()
endif()
if(LIBC_CONF_ERRNO_MODE)
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 19da0ad..933b81b 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -3,11 +3,6 @@ function(_get_common_test_compile_options output_var c_test flags)
_get_compile_options_from_config(config_flags)
_get_compile_options_from_arch(arch_flags)
- # Remove -fno-math-errno if it was added.
- if(LIBC_ADD_FNO_MATH_ERRNO)
- list(REMOVE_ITEM compile_flags "-fno-math-errno")
- endif()
-
# Death test executor is only available in Linux for now.
if(NOT ${LIBC_TARGET_OS} STREQUAL "linux")
list(REMOVE_ITEM config_flags "-DLIBC_ADD_NULL_CHECKS")
diff --git a/libc/config/baremetal/config.json b/libc/config/baremetal/config.json
index 796b1d8..e3703c28 100644
--- a/libc/config/baremetal/config.json
+++ b/libc/config/baremetal/config.json
@@ -36,7 +36,7 @@
},
"math": {
"LIBC_CONF_MATH_OPTIMIZATIONS": {
- "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES)"
+ "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES | LIBC_MATH_NO_ERRNO | LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT)"
}
},
"general": {
diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
index 8e54e84..9affced 100644
--- a/libc/src/__support/FPUtil/double_double.h
+++ b/libc/src/__support/FPUtil/double_double.h
@@ -77,9 +77,9 @@ LIBC_INLINE constexpr NumberPair<T> split(T a) {
NumberPair<T> r{0.0, 0.0};
// CN = 2^N.
constexpr T CN = static_cast<T>(1 << N);
- constexpr T C = CN + 1.0;
- double t1 = C * a;
- double t2 = a - t1;
+ constexpr T C = CN + T(1);
+ T t1 = C * a;
+ T t2 = a - t1;
r.hi = t1 + t2;
r.lo = a - r.hi;
return r;
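
The `split()` change generalizes Veltkamp splitting from `double` to any float type `T`: multiplying by `C = 2^N + 1` and cancelling rounds `a` to its high bits, leaving an exact remainder in the low part. A worked sketch for `double` with `N = 27` (illustrative, mirroring the generic code above):

    #include <cassert>

    // Veltkamp splitting for double with N = 27: hi carries the top bits of
    // the significand, lo the rest, and hi + lo == a exactly, so products of
    // the halves are exactly representable.
    struct Pair { double hi, lo; };

    Pair Split(double a) {
      constexpr double C = 0x1p27 + 1.0; // CN + 1 with CN = 2^27
      double t1 = C * a;
      double t2 = a - t1;
      Pair r;
      r.hi = t1 + t2;
      r.lo = a - r.hi;
      return r;
    }

    int main() {
      Pair p = Split(1.0 / 3.0);
      assert(p.hi + p.lo == 1.0 / 3.0); // reconstruction is exact
      // hi*hi, hi*lo, lo*hi, and lo*lo are now exact, which is what
      // double-double multiplication builds on.
      return 0;
    }
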
diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst
index 5e8fe2e..0fd7e53 100644
--- a/libcxx/docs/ReleaseNotes/22.rst
+++ b/libcxx/docs/ReleaseNotes/22.rst
@@ -71,6 +71,9 @@ Improvements and New Features
- The ``std::distance`` and ``std::ranges::distance`` algorithms have been optimized for segmented iterators (e.g.,
``std::join_view`` iterators), reducing the complexity from ``O(n)`` to ``O(n / segment_size)``. Benchmarks show
performance improvements of over 1600x in favorable cases with large segment sizes (e.g., 1024).
+- The ``std::{fill, fill_n}`` and ``std::ranges::{fill, fill_n}`` algorithms have been optimized for segmented iterators,
+ resulting in a performance improvement of at least 10x for ``std::deque<int>`` iterators and
+ ``std::join_view<std::vector<std::vector<int>>>`` iterators.
Deprecations and Removals
-------------------------
diff --git a/libcxx/include/__algorithm/fill.h b/libcxx/include/__algorithm/fill.h
index 1ce3eadb..328ebb6 100644
--- a/libcxx/include/__algorithm/fill.h
+++ b/libcxx/include/__algorithm/fill.h
@@ -10,8 +10,11 @@
#define _LIBCPP___ALGORITHM_FILL_H
#include <__algorithm/fill_n.h>
+#include <__algorithm/for_each_segment.h>
#include <__config>
#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+#include <__type_traits/enable_if.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -21,23 +24,40 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// fill isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
-template <class _ForwardIterator, class _Tp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-__fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, forward_iterator_tag) {
+template <class _ForwardIterator, class _Sentinel, class _Tp>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+__fill(_ForwardIterator __first, _Sentinel __last, const _Tp& __value) {
for (; __first != __last; ++__first)
*__first = __value;
+ return __first;
}
-template <class _RandomAccessIterator, class _Tp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-__fill(_RandomAccessIterator __first, _RandomAccessIterator __last, const _Tp& __value, random_access_iterator_tag) {
- std::fill_n(__first, __last - __first, __value);
+template <class _RandomAccessIterator,
+ class _Tp,
+ __enable_if_t<__has_random_access_iterator_category<_RandomAccessIterator>::value &&
+ !__is_segmented_iterator_v<_RandomAccessIterator>,
+ int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator
+__fill(_RandomAccessIterator __first, _RandomAccessIterator __last, const _Tp& __value) {
+ return std::__fill_n(__first, __last - __first, __value);
+}
+
+#ifndef _LIBCPP_CXX03_LANG
+template <class _SegmentedIterator, class _Tp, __enable_if_t<__is_segmented_iterator_v<_SegmentedIterator>, int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+_SegmentedIterator __fill(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value) {
+ using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
+ std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
+ std::__fill(__lfirst, __llast, __value);
+ });
+ return __last;
}
+#endif // !_LIBCPP_CXX03_LANG
template <class _ForwardIterator, class _Tp>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
- std::__fill(__first, __last, __value, typename iterator_traits<_ForwardIterator>::iterator_category());
+ std::__fill(__first, __last, __value);
}
_LIBCPP_END_NAMESPACE_STD
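
The segmented `__fill` overload above relies on `__for_each_segment` handing each segment's contiguous local-iterator range to the filling loop, so the work proceeds segment by segment rather than element by element through the outer iterator. A simplified sketch of the idea over a vector-of-vectors, with a toy `for_each_segment` standing in for libc++'s internal helper:

    #include <cassert>
    #include <vector>

    // Toy stand-in for libc++'s __for_each_segment: hand each contiguous
    // segment to the functor as a pair of local iterators.
    template <class Func>
    void for_each_segment(std::vector<std::vector<int>> &segments, Func f) {
      for (auto &seg : segments)
        f(seg.begin(), seg.end());
    }

    int main() {
      std::vector<std::vector<int>> v{{1, 2}, {3, 4, 5}, {}, {6}};
      // Fill segment by segment; each inner loop runs over contiguous memory,
      // which the compiler can lower to memset-like code.
      for_each_segment(v, [](auto first, auto last) {
        for (; first != last; ++first)
          *first = 42;
      });
      for (const auto &seg : v)
        for (int n : seg)
          assert(n == 42);
      return 0;
    }
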
diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h
index 0da78e1..2bfacf3 100644
--- a/libcxx/include/__algorithm/fill_n.h
+++ b/libcxx/include/__algorithm/fill_n.h
@@ -9,10 +9,17 @@
#ifndef _LIBCPP___ALGORITHM_FILL_N_H
#define _LIBCPP___ALGORITHM_FILL_N_H
+#include <__algorithm/for_each_n_segment.h>
#include <__algorithm/min.h>
#include <__config>
#include <__fwd/bit_reference.h>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
#include <__memory/pointer_traits.h>
+#include <__type_traits/conjunction.h>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/integral_constant.h>
+#include <__type_traits/negation.h>
#include <__utility/convert_to_integral.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,9 +33,38 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
-template <class _OutputIterator, class _Size, class _Tp>
+template <class _OutputIterator,
+ class _Size,
+ class _Tp
+#ifndef _LIBCPP_CXX03_LANG
+ ,
+ __enable_if_t<!_And<_BoolConstant<__is_segmented_iterator_v<_OutputIterator>>,
+ __has_random_access_local_iterator<_OutputIterator>>::value,
+ int> = 0
+#endif
+ >
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
-__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value);
+__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
+ for (; __n > 0; ++__first, (void)--__n)
+ *__first = __value;
+ return __first;
+}
+
+#ifndef _LIBCPP_CXX03_LANG
+template < class _OutputIterator,
+ class _Size,
+ class _Tp,
+ __enable_if_t<_And<_BoolConstant<__is_segmented_iterator_v<_OutputIterator>>,
+ __has_random_access_local_iterator<_OutputIterator>>::value,
+ int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
+ using __local_iterator_t = typename __segmented_iterator_traits<_OutputIterator>::__local_iterator;
+ return std::__for_each_n_segment(__first, __n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
+ std::__fill_n(__lfirst, __llast - __lfirst, __value);
+ });
+}
+#endif // !_LIBCPP_CXX03_LANG
template <bool _FillVal, class _Cp>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
@@ -70,14 +106,6 @@ __fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) {
template <class _OutputIterator, class _Size, class _Tp>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
-__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
- for (; __n > 0; ++__first, (void)--__n)
- *__first = __value;
- return __first;
-}
-
-template <class _OutputIterator, class _Size, class _Tp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
return std::__fill_n(__first, std::__convert_to_integral(__n), __value);
}
diff --git a/libcxx/include/__algorithm/ranges_fill.h b/libcxx/include/__algorithm/ranges_fill.h
index c248009..814ae63 100644
--- a/libcxx/include/__algorithm/ranges_fill.h
+++ b/libcxx/include/__algorithm/ranges_fill.h
@@ -9,12 +9,14 @@
#ifndef _LIBCPP___ALGORITHM_RANGES_FILL_H
#define _LIBCPP___ALGORITHM_RANGES_FILL_H
-#include <__algorithm/ranges_fill_n.h>
+#include <__algorithm/fill.h>
+#include <__algorithm/fill_n.h>
#include <__config>
#include <__iterator/concepts.h>
#include <__ranges/access.h>
#include <__ranges/concepts.h>
#include <__ranges/dangling.h>
+#include <__utility/move.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -31,12 +33,11 @@ namespace ranges {
struct __fill {
template <class _Type, output_iterator<const _Type&> _Iter, sentinel_for<_Iter> _Sent>
_LIBCPP_HIDE_FROM_ABI constexpr _Iter operator()(_Iter __first, _Sent __last, const _Type& __value) const {
- if constexpr (random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>) {
- return ranges::fill_n(__first, __last - __first, __value);
+ if constexpr (sized_sentinel_for<_Sent, _Iter>) {
+ auto __n = __last - __first;
+ return std::__fill_n(std::move(__first), __n, __value);
} else {
- for (; __first != __last; ++__first)
- *__first = __value;
- return __first;
+ return std::__fill(std::move(__first), std::move(__last), __value);
}
}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
index 9b403db..b0a74b8 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
@@ -17,6 +17,8 @@
#include <array>
#include <cassert>
#include <cstddef>
+#include <deque>
+#include <ranges>
#include <vector>
#include "sized_allocator.h"
@@ -93,6 +95,13 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) {
return true;
}
+/*TEST_CONSTEXPR_CXX26*/ void test_deque() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+ std::deque<int> in(20);
+ std::deque<int> expected(in.size(), 42);
+ std::fill(in.begin(), in.end(), 42);
+ assert(in == expected);
+}
+
TEST_CONSTEXPR_CXX20 bool test() {
types::for_each(types::forward_iterator_list<char*>(), Test<char>());
types::for_each(types::forward_iterator_list<int*>(), Test<int>());
@@ -138,6 +147,20 @@ TEST_CONSTEXPR_CXX20 bool test() {
}
}
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_deque();
+
+#if TEST_STD_VER >= 20
+ { // Verify that join_view of vectors work properly.
+ std::vector<std::vector<int>> v{{1, 2}, {1, 2, 3}, {}, {3, 4, 5}, {6}, {7, 8, 9, 6}, {0, 1, 2, 3, 0, 1, 2}};
+ auto jv = std::ranges::join_view(v);
+ std::fill(jv.begin(), jv.end(), 42);
+ for (const auto& vec : v)
+ for (auto n : vec)
+ assert(n == 42);
+ }
+#endif
+
return true;
}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp
index e42172c..5dc9b90 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp
@@ -17,6 +17,8 @@
#include <array>
#include <cassert>
#include <cstddef>
+#include <deque>
+#include <ranges>
#include <vector>
#include "sized_allocator.h"
@@ -126,6 +128,13 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) {
return true;
}
+/*TEST_CONSTEXPR_CXX26*/ void test_deque() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+ std::deque<int> in(20);
+ std::deque<int> expected(in.size(), 42);
+ std::fill_n(in.begin(), in.size(), 42);
+ assert(in == expected);
+}
+
TEST_CONSTEXPR_CXX20 bool test() {
types::for_each(types::forward_iterator_list<char*>(), Test<char>());
types::for_each(types::forward_iterator_list<int*>(), Test<int>());
@@ -221,6 +230,20 @@ TEST_CONSTEXPR_CXX20 bool test() {
}
}
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_deque();
+
+#if TEST_STD_VER >= 20
+ {
+ std::vector<std::vector<int>> v{{1, 2}, {1, 2, 3}, {}, {3, 4, 5}, {6}, {7, 8, 9, 6}, {0, 1, 2, 3, 0, 1, 2}};
+ auto jv = std::ranges::join_view(v);
+ std::fill_n(jv.begin(), std::distance(jv.begin(), jv.end()), 42);
+ for (const auto& vec : v)
+ for (auto n : vec)
+ assert(n == 42);
+ }
+#endif
+
return true;
}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp
index 61a659f..7ae0a06 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp
@@ -18,6 +18,7 @@
#include <algorithm>
#include <array>
#include <cassert>
+#include <deque>
#include <ranges>
#include <string>
#include <vector>
@@ -128,6 +129,13 @@ constexpr bool test_vector_bool(std::size_t N) {
}
#endif
+/*TEST_CONSTEXPR_CXX26*/ void test_deque() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+ std::deque<int> in(20);
+ std::deque<int> expected(in.size(), 42);
+ std::ranges::fill(in, 42);
+ assert(in == expected);
+}
+
constexpr bool test() {
test_iterators<cpp17_output_iterator<int*>, sentinel_wrapper<cpp17_output_iterator<int*>>>();
test_iterators<cpp20_output_iterator<int*>, sentinel_wrapper<cpp20_output_iterator<int*>>>();
@@ -227,6 +235,20 @@ constexpr bool test() {
}
#endif
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_deque();
+
+#if TEST_STD_VER >= 20
+ {
+ std::vector<std::vector<int>> v{{1, 2}, {1, 2, 3}, {}, {3, 4, 5}, {6}, {7, 8, 9, 6}, {0, 1, 2, 3, 0, 1, 2}};
+ auto jv = std::ranges::join_view(v);
+ std::ranges::fill(jv, 42);
+ for (const auto& vec : v)
+ for (auto n : vec)
+ assert(n == 42);
+ }
+#endif
+
return true;
}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp
index 2d6e24a..25db892 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp
@@ -16,6 +16,7 @@
#include <algorithm>
#include <array>
#include <cassert>
+#include <deque>
#include <ranges>
#include <string>
#include <vector>
@@ -101,6 +102,13 @@ constexpr bool test_vector_bool(std::size_t N) {
}
#endif
+/*TEST_CONSTEXPR_CXX26*/ void test_deque() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+ std::deque<int> in(20);
+ std::deque<int> expected(in.size(), 42);
+ std::ranges::fill_n(std::ranges::begin(in), std::ranges::size(in), 42);
+ assert(in == expected);
+}
+
constexpr bool test() {
test_iterators<cpp17_output_iterator<int*>, sentinel_wrapper<cpp17_output_iterator<int*>>>();
test_iterators<cpp20_output_iterator<int*>, sentinel_wrapper<cpp20_output_iterator<int*>>>();
@@ -175,6 +183,20 @@ constexpr bool test() {
}
#endif
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_deque();
+
+#if TEST_STD_VER >= 20
+ {
+ std::vector<std::vector<int>> v{{1, 2}, {1, 2, 3}, {}, {3, 4, 5}, {6}, {7, 8, 9, 6}, {0, 1, 2, 3, 0, 1, 2}};
+ auto jv = std::ranges::join_view(v);
+ std::ranges::fill_n(std::ranges::begin(jv), std::ranges::distance(jv), 42);
+ for (const auto& vec : v)
+ for (auto n : vec)
+ assert(n == 42);
+ }
+#endif
+
return true;
}
diff --git a/libunwind/src/Unwind-seh.cpp b/libunwind/src/Unwind-seh.cpp
index 8b83f10..110c598 100644
--- a/libunwind/src/Unwind-seh.cpp
+++ b/libunwind/src/Unwind-seh.cpp
@@ -174,7 +174,8 @@ _GCC_specific_handler(PEXCEPTION_RECORD ms_exc, PVOID frame, PCONTEXT ms_ctx,
}
// FIXME: Indicate target frame in foreign case!
// phase 2: the clean up phase
- RtlUnwindEx(frame, (PVOID)disp->ControlPc, ms_exc, exc, ms_ctx, disp->HistoryTable);
+ RtlUnwindEx(frame, (PVOID)disp->ControlPc, ms_exc, exc, disp->ContextRecord,
+ disp->HistoryTable);
_LIBUNWIND_ABORT("RtlUnwindEx() failed");
case _URC_INSTALL_CONTEXT: {
// If we were called by __libunwind_seh_personality(), indicate that
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index de839795..220ec99 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -35,7 +35,7 @@ template <typename T> struct AssertSymbol {
"SymbolUnion not aligned enough");
};
-LLVM_ATTRIBUTE_UNUSED static inline void assertSymbols() {
+[[maybe_unused]] static inline void assertSymbols() {
AssertSymbol<Defined>();
AssertSymbol<CommonSymbol>();
AssertSymbol<Undefined>();
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 94f441b..bfd35ae 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -355,7 +355,7 @@ void multiThreadedPageInBackground(DeferredFiles &deferred) {
// Reference all of the file's mmap'd pages to load them into memory.
for (const char *page = buff.data(), *end = page + buff.size(); page < end;
page += pageSize) {
- LLVM_ATTRIBUTE_UNUSED volatile char t = *page;
+ [[maybe_unused]] volatile char t = *page;
(void)t;
}
};
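
The loop around the `[[maybe_unused]] volatile char t = *page;` change reads one byte per page so the kernel faults the mmap'd pages in ahead of use; `volatile` keeps the compiler from deleting the otherwise-dead load. A minimal POSIX sketch of the same technique (hypothetical `pageIn` helper, not lld's code):

    #include <cstddef>
    #include <sys/mman.h>
    #include <unistd.h>

    // Touch one byte per page so an mmap'd file is resident before the hot
    // path uses it. `fd` and `size` are assumed to describe an open file;
    // the mapping is intentionally left alive for later use.
    void pageIn(int fd, std::size_t size) {
      void *raw = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0);
      if (raw == MAP_FAILED)
        return;
      const char *base = static_cast<const char *>(raw);
      const long pageSize = sysconf(_SC_PAGESIZE);
      for (const char *page = base, *end = base + size; page < end;
           page += pageSize) {
        [[maybe_unused]] volatile char t = *page; // the load forces a page fault
        (void)t;
      }
    }
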
diff --git a/lldb/include/lldb/Utility/AnsiTerminal.h b/lldb/include/lldb/Utility/AnsiTerminal.h
index 41acac7..aaaf94c 100644
--- a/lldb/include/lldb/Utility/AnsiTerminal.h
+++ b/lldb/include/lldb/Utility/AnsiTerminal.h
@@ -72,6 +72,17 @@
#define ANSI_ESC_START_LEN 2
+// Cursor Position (CUP): set the cursor to [line, column] (default = [1, 1]).
+#define ANSI_CSI_CUP(...) ANSI_ESC_START #__VA_ARGS__ "H"
+// Reset cursor to position.
+#define ANSI_CSI_RESET_CURSOR ANSI_CSI_CUP()
+// Erase In Display.
+#define ANSI_CSI_ED(opt) ANSI_ESC_START #opt "J"
+// Erase complete viewport.
+#define ANSI_CSI_ERASE_VIEWPORT ANSI_CSI_ED(2)
+// Erase scrollback.
+#define ANSI_CSI_ERASE_SCROLLBACK ANSI_CSI_ED(3)
+
// OSC (Operating System Commands)
// https://invisible-island.net/xterm/ctlseqs/ctlseqs.html
#define OSC_ESCAPE_START "\033"
diff --git a/lldb/tools/lldb-dap/tool/lldb-dap.cpp b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
index e59cef9..45caa1a 100644
--- a/lldb/tools/lldb-dap/tool/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
@@ -21,6 +21,7 @@
#include "lldb/Host/MainLoopBase.h"
#include "lldb/Host/MemoryMonitor.h"
#include "lldb/Host/Socket.h"
+#include "lldb/Utility/AnsiTerminal.h"
#include "lldb/Utility/Status.h"
#include "lldb/Utility/UriParser.h"
#include "lldb/lldb-forward.h"
@@ -74,6 +75,7 @@ typedef int socklen_t;
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/un.h>
+#include <termios.h>
#include <unistd.h>
#endif
@@ -261,6 +263,15 @@ static llvm::Error LaunchRunInTerminalTarget(llvm::opt::Arg &target_arg,
files.push_back(files.back());
if (llvm::Error err = SetupIORedirection(files))
return err;
+  } else if ((isatty(STDIN_FILENO) != 0) && getenv("TERM") &&
+             llvm::StringRef(getenv("TERM")).starts_with_insensitive("xterm")) {
+ // Clear the screen.
+ llvm::outs() << ANSI_CSI_RESET_CURSOR ANSI_CSI_ERASE_VIEWPORT
+ ANSI_CSI_ERASE_SCROLLBACK;
+ // VS Code will reuse the same terminal for the same debug configuration
+ // between runs. Clear the input buffer prior to starting the new process so
+ // prior input is not carried forward to the new debug session.
+ tcflush(STDIN_FILENO, TCIFLUSH);
}
RunInTerminalLauncherCommChannel comm_channel(comm_file);
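
The new branch above clears the terminal with CSI sequences and flushes pending input so stale keystrokes don't leak into the next debug session. A standalone sketch of the same sequence under POSIX, using a plain case-sensitive prefix check where lldb-dap uses `starts_with_insensitive`:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <termios.h>
    #include <unistd.h>

    int main() {
      const char *term = getenv("TERM");
      if (isatty(STDIN_FILENO) && term && strncmp(term, "xterm", 5) == 0) {
        // CSI H = cursor home, CSI 2J = erase viewport, CSI 3J = erase scrollback.
        fputs("\033[H\033[2J\033[3J", stdout);
        fflush(stdout);
        tcflush(STDIN_FILENO, TCIFLUSH); // drop unread input
      }
      return 0;
    }
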
diff --git a/llvm/docs/HowToReleaseLLVM.rst b/llvm/docs/HowToReleaseLLVM.rst
index 1795d3a..171bf88 100644
--- a/llvm/docs/HowToReleaseLLVM.rst
+++ b/llvm/docs/HowToReleaseLLVM.rst
@@ -18,11 +18,11 @@ create the binary packages, please refer to the :doc:`ReleaseProcess` instead.
Release Timeline
================
-LLVM is released on a time based schedule --- with major releases roughly
+LLVM is released on a time-based schedule --- with major releases roughly
every 6 months. In between major releases there may be dot releases.
The release manager will determine if and when to make a dot release based
on feedback from the community. Typically, dot releases should be made if
-there are large number of bug-fixes in the stable branch or a critical bug
+there are a large number of bug fixes in the stable branch or a critical bug
has been discovered that affects a large number of users.
Unless otherwise stated, dot releases will follow the same procedure as
@@ -73,7 +73,7 @@ Release Process Summary
* Generate and send out the second release candidate sources. Only *critical*
bugs found during this testing phase will be fixed. Any bugs introduced by
- merged patches will be fixed. If so a third round of testing is needed.
+ merged patches will be fixed. If so, a third round of testing is needed.
* The release notes are updated.
@@ -107,15 +107,15 @@ Create Release Branch and Update LLVM Version
Branch the Git trunk using the following procedure:
#. Remind developers that the release branching is imminent and to refrain from
- committing patches that might break the build. E.g., new features, large
+ committing patches that might break the build, e.g., new features, large
patches for works in progress, an overhaul of the type system, an exciting
new TableGen feature, etc.
#. Verify that the current git trunk is in decent shape by
examining nightly tester and buildbot results.
-#. Bump the version in trunk to N.0.0git with the script in
- ``llvm/utils/release/bump-version.py``, and tag the commit with llvmorg-N-init.
+#. Bump the version in trunk to ``N.0.0git`` with the script in
+ ``llvm/utils/release/bump-version.py``, and tag the commit with ``llvmorg-N-init``.
If ``X`` is the version to be released, then ``N`` is ``X + 1``. ::
$ git tag -sa llvmorg-N-init
@@ -124,14 +124,14 @@ Branch the Git trunk using the following procedure:
``llvm/utils/release/clear-release-notes.py``.
#. Create the release branch from the last known good revision from before the
- version bump. The branch's name is release/X.x where ``X`` is the major version
+ version bump. The branch's name is ``release/X.x`` where ``X`` is the major version
number and ``x`` is just the letter ``x``.
#. On the newly-created release branch, immediately bump the version
- to X.1.0git (where ``X`` is the major version of the branch.)
+ to ``X.1.0git`` (where ``X`` is the major version of the branch).
-#. All tags and branches need to be created in both the llvm/llvm-project and
- llvm/llvm-test-suite repos.
+#. All tags and branches need to be created in both the ``llvm/llvm-project`` and
+ ``llvm/llvm-test-suite`` repos.
Tagging the LLVM Release Candidates
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -157,7 +157,7 @@ the release page.
$ for f in *.xz; do gh attestation verify --owner llvm $f && gpg -b $f; done
Tarballs, release binaries, or any other release artifacts must be uploaded to
-GitHub. This can be done using the github-upload-release.py script in utils/release.
+GitHub. This can be done using the ``github-upload-release.py`` script in ``utils/release``.
::
@@ -170,10 +170,10 @@ Build The Binary Distribution
Creating the binary distribution requires following the instructions
:doc:`here <ReleaseProcess>`.
-That process will perform both Release+Asserts and Release builds but only
-pack the Release build for upload. You should use the Release+Asserts sysroot,
+That process performs both Release+Asserts and Release builds but only packs
+the Release build for upload. You should use the Release+Asserts sysroot,
normally under ``final/Phase3/Release+Asserts/llvmCore-3.8.1-RCn.install/``,
-for test-suite and run-time benchmarks, to make sure nothing serious has
+for test-suite and run-time benchmarks, to ensure nothing serious has
slipped through the net. For compile-time benchmarks, use the Release version.
The minimum required version of the tools you'll need are :doc:`here <GettingStarted>`
@@ -181,14 +181,14 @@ The minimum required version of the tools you'll need are :doc:`here <GettingSta
Release Qualification Criteria
------------------------------
-There are no official release qualification criteria. It is up to the
-the release manager to determine when a release is ready. The release manager
+There are no official release qualification criteria.
+The release manager determines when a release is ready. The release manager
should pay attention to the results of community testing, the number of outstanding
-bugs, and then number of regressions when determining whether or not to make a
+bugs, and the number of regressions when determining whether or not to make a
release.
The community values time-based releases, so releases should not be delayed for
-too long unless there are critical issues remaining. In most cases, the only
+too long unless critical issues remain. In most cases, the only
kind of bugs that are critical enough to block a release would be a major regression
from a previous release.
@@ -199,33 +199,33 @@ A few developers in the community have dedicated time to validate the release
candidates and volunteered to be the official release testers for each
architecture.
-These will be the ones testing, generating and uploading the official binaries
+These will be the ones testing, generating, and uploading the official binaries
to the server, and will be the minimum tests *necessary* for the release to
proceed.
This will obviously not cover all OSs and distributions, so additional community
-validation is important. However, if community input is not reached before the
-release is out, all bugs reported will have to go on the next stable release.
+validation is important. However, if community input is not received before the
+release, all reported bugs will be deferred to the next stable release.
The official release managers are:
* Even releases: Tom Stellard (tstellar@redhat.com)
* Odd releases: Tobias Hieta (tobias@hieta.se)
-The official release testers are volunteered from the community and have
+The official release testers are volunteers from the community who have
consistently validated and released binaries for their targets/OSs. To contact
them, you should post on the `Discourse forums (Project
Infrastructure - Release Testers). <https://discourse.llvm.org/c/infrastructure/release-testers/66>`_
-The official testers list is in the file `RELEASE_TESTERS.TXT
+The official testers list is in the file ``RELEASE_TESTERS.TXT``
<https://github.com/llvm/llvm-project/blob/main/llvm/RELEASE_TESTERS.TXT>`_, in
the LLVM repository.
Community Testing
-----------------
-Once all testing has been completed and appropriate bugs filed, the release
-candidate tarballs are put on the website and the LLVM community is notified.
+Once all testing is complete and appropriate bugs are filed, the release
+candidate tarballs are put on the website, and the LLVM community is notified.
We ask that all LLVM developers test the release in any of the following ways:
@@ -251,7 +251,7 @@ We ask that all LLVM developers test the release in any the following ways:
architecture.
We also ask that the OS distribution release managers test their packages with
-the first candidate of every release, and report any *new* errors in GitHub.
+the first candidate of every release and report any *new* errors in GitHub.
If the bug can be reproduced with an unpatched upstream version of the release
candidate (as opposed to the distribution's own build), the priority should be
release blocker.
@@ -268,10 +268,10 @@ next stage.
Reporting Regressions
---------------------
-Every regression that is found during the tests (as per the criteria above),
+Every regression found during the tests (as per the criteria above)
should be filed as a bug in GitHub and added to the release milestone.
-If a bug can't be reproduced, or stops being a blocker, it should be removed
+If a bug can't be reproduced or stops being a blocker, it should be removed
from the Milestone. Debugging can continue, but on trunk.
Backport Requests
@@ -299,15 +299,15 @@ This section describes how to triage bug reports:
to see the list of bugs that are being considered for the release.
#. Review each bug and first check if it has been fixed in main. If it has, update
- its status to "Needs Pull Request", and create a pull request for the fix
- using the /cherry-pick or /branch comments if this has not been done already.
+ its status to "Needs Pull Request" and create a pull request for the fix
+ using the ``/cherry-pick`` or ``/branch`` comments if this has not been done already.
#. If a bug has been fixed and has a pull request created for backporting it,
then update its status to "Needs Review" and notify a knowledgeable
reviewer. Usually you will want to notify the person who approved the
patch, but you may use your best judgement on who a good reviewer would be.
Once you have identified the reviewer(s), assign the issue to them and
- mention them (i.e @username) in a comment and ask them if the patch is safe
+ mention them (i.e., ``@username``) in a comment and ask them if the patch is safe
to backport. You should also review the bug yourself to ensure that it
meets the requirements for committing to the release branch.
@@ -323,11 +323,11 @@ Release Patch Rules
Below are the rules regarding patching the release branch:
#. Patches applied to the release branch may only be applied by the release
- manager, the official release testers or the maintainers with approval from
+ manager, the official release testers, or the maintainers with approval from
the release manager.
#. Release managers are encouraged, but not required, to get approval from a
- maintainer before approving patches. If there are no reachable maintainers
+ maintainer before approving patches. If there are no reachable maintainers,
then release managers can ask approval from patch reviewers or other
developers active in that area.
@@ -336,7 +336,7 @@ Below are the rules regarding patching the release branch:
was created. As with all phases, release managers and maintainers can reject
patches that are deemed too invasive.
-#. *Before RC2/RC3* Patches should be limited to bug fixes or backend specific
+#. *Before RC2/RC3* Patches should be limited to bug fixes or backend-specific
improvements that are determined to be very safe.
#. *Before Final Major Release* Patches should be limited to critical
@@ -349,7 +349,7 @@ Below are the rules regarding patching the release branch:
Release Final Tasks
-------------------
-The final stages of the release process involves tagging the "final" release
+The final stages of the release process involve tagging the "final" release
branch, updating documentation that refers to the release, and updating the
demo page.
@@ -394,11 +394,11 @@ is what to do:
#. Update the ``releases/index.html`` with the new release and link to release
documentation.
-#. After you push the changes to the www-releases repo, someone with admin
- access must login to prereleases-origin.llvm.org and manually pull the new
- changes into /data/www-releases/. This is where the website is served from.
+#. After you push the changes to the ``www-releases`` repo, someone with admin
+ access must log in to ``prereleases-origin.llvm.org`` and manually pull the new
+ changes into ``/data/www-releases/``. This is where the website is served from.
-#. Finally checkout the llvm-www repo and update the main page
+#. Finally, check out the ``llvm-www`` repo and update the main page
(``index.html`` and sidebar) to point to the new release and release
announcement.
@@ -414,5 +414,5 @@ using this command and add it to the post.
$ git log --format="- %aN: [%s (%h)](https://github.com/llvm/llvm-project/commit/%H)" llvmorg-X.1.N-1..llvmorg-X.1.N
-Once the release has been announced add a link to the announcement on the llvm
-homepage (from the llvm-www repo) in the "Release Emails" section.
+Once the release has been announced, add a link to the announcement on the llvm
+homepage (from the ``llvm-www`` repo) in the "Release Emails" section.
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index c352cd6..9cdd983 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -140,6 +140,7 @@ Changes to the X86 Backend
--------------------------
* `-mcpu=wildcatlake` is now supported.
+* `-mcpu=novalake` is now supported.
Changes to the OCaml bindings
-----------------------------
diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index 88ac0a1..c7aff16 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -243,6 +243,25 @@ public:
}
};
+/// The dx.Padding target extension type
+///
+/// `target("dx.Padding", NumBytes)`
+class PaddingExtType : public TargetExtType {
+public:
+ PaddingExtType() = delete;
+ PaddingExtType(const PaddingExtType &) = delete;
+ PaddingExtType &operator=(const PaddingExtType &) = delete;
+
+ unsigned getNumBytes() const { return getIntParameter(0); }
+
+ static bool classof(const TargetExtType *T) {
+ return T->getName() == "dx.Padding";
+ }
+ static bool classof(const Type *T) {
+ return isa<TargetExtType>(T) && classof(cast<TargetExtType>(T));
+ }
+};
+
//===----------------------------------------------------------------------===//
class ResourceTypeInfo {
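
The two classof overloads above are what plug dx.Padding into LLVM's usual
RTTI helpers (isa/cast/dyn_cast). A minimal usage sketch, with namespace
qualifiers omitted and the surrounding loop purely illustrative rather than
part of this patch:

    // Sum the explicit padding bytes among a struct's elements.
    uint64_t countPaddingBytes(StructType *ST) {
      uint64_t Bytes = 0;
      for (Type *ElTy : ST->elements())
        if (auto *Pad = dyn_cast<PaddingExtType>(ElTy))
          Bytes += Pad->getNumBytes(); // the type's integer parameter
      return Bytes;
    }
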
diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h
index a7a6a27..0ecb114 100644
--- a/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/llvm/include/llvm/Analysis/LoopInfo.h
@@ -617,7 +617,7 @@ public:
};
/// Function to print a loop's contents as LLVM's text IR assembly.
-LLVM_ABI void printLoop(Loop &L, raw_ostream &OS,
+LLVM_ABI void printLoop(const Loop &L, raw_ostream &OS,
const std::string &Banner = "");
/// Find and return the loop attribute node for the attribute @p Name in
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
index 871028d..68198ec 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
@@ -95,6 +95,10 @@ inline bind_ty<const SCEVAddExpr> m_scev_Add(const SCEVAddExpr *&V) {
return V;
}
+inline bind_ty<const SCEVMulExpr> m_scev_Mul(const SCEVMulExpr *&V) {
+ return V;
+}
+
/// Match a specified const SCEV *.
struct specificscev_ty {
const SCEV *Expr;
@@ -284,14 +288,10 @@ template <typename Op0_t, typename Op1_t> struct SCEVURem_match {
<< SE.getTypeSizeInBits(TruncTy));
return Op0.match(LHS) && Op1.match(RHS);
}
- const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
- if (Add == nullptr || Add->getNumOperands() != 2)
- return false;
-
- const SCEV *A = Add->getOperand(1);
- const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
- if (Mul == nullptr)
+ const SCEV *A;
+ const SCEVMulExpr *Mul;
+ if (!SCEVPatternMatch::match(Expr, m_scev_Add(m_scev_Mul(Mul), m_SCEV(A))))
return false;
const auto MatchURemWithDivisor = [&](const SCEV *B) {
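
The new m_scev_Mul binder follows the same shape as the m_scev_Add binder
above it: it matches any SCEVMulExpr and binds the pointer, leaving operand
inspection to the caller. A sketch of the idiom used in this rewrite
(variable names illustrative):

    const SCEVMulExpr *Mul;
    const SCEV *Addend;
    // Binds Mul to the multiply and Addend to the remaining add operand.
    if (SCEVPatternMatch::match(Expr,
                                m_scev_Add(m_scev_Mul(Mul), m_SCEV(Addend)))) {
      // Operands of the bound multiply can now be inspected directly.
      const SCEV *FirstFactor = Mul->getOperand(0);
      (void)FirstFactor;
    }
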
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index e619b18..8d0dc64 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -647,6 +647,7 @@ enum {
EF_HEXAGON_ISA_V85 = 0x00000085, // Hexagon V85 ISA
EF_HEXAGON_ISA_V87 = 0x00000087, // Hexagon V87 ISA
EF_HEXAGON_ISA_V89 = 0x00000089, // Hexagon V89 ISA
+ EF_HEXAGON_ISA_V91 = 0x00000091, // Hexagon V91 ISA
EF_HEXAGON_ISA = 0x000003ff, // Hexagon V.. ISA
// Tiny core flag, bit[15]
@@ -680,6 +681,7 @@ enum {
EF_HEXAGON_MACH_V85 = EF_HEXAGON_ISA_V85, // Hexagon V85
EF_HEXAGON_MACH_V87 = EF_HEXAGON_ISA_V87, // Hexagon V87
EF_HEXAGON_MACH_V89 = EF_HEXAGON_ISA_V89, // Hexagon V89
+ EF_HEXAGON_MACH_V91 = EF_HEXAGON_ISA_V91, // Hexagon V91
EF_HEXAGON_MACH = 0x0000ffff, // Hexagon V..
};
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index d49bade..1a01fa6 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -571,7 +571,9 @@ struct DoacrossT {
// V5.2: [8.2.1] `requirement` clauses
template <typename T, typename I, typename E> //
struct DynamicAllocatorsT {
- using EmptyTrait = std::true_type;
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
};
template <typename T, typename I, typename E> //
@@ -1056,7 +1058,9 @@ struct ReplayableT {
// V5.2: [8.2.1] `requirement` clauses
template <typename T, typename I, typename E> //
struct ReverseOffloadT {
- using EmptyTrait = std::true_type;
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
};
// V5.2: [10.4.2] `safelen` clause
@@ -1078,6 +1082,14 @@ struct ScheduleT {
std::tuple<Kind, OPT(OrderingModifier), OPT(ChunkModifier), OPT(ChunkSize)> t;
};
+// [6.0:361]
+template <typename T, typename I, typename E> //
+struct SelfMapsT {
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
+};
+
// V5.2: [15.8.1] Memory-order clauses
template <typename T, typename I, typename E> //
struct SeqCstT {
@@ -1169,18 +1181,17 @@ struct TransparentT {
// V5.2: [8.2.1] `requirement` clauses
template <typename T, typename I, typename E> //
struct UnifiedAddressT {
- using EmptyTrait = std::true_type;
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
};
// V5.2: [8.2.1] `requirement` clauses
template <typename T, typename I, typename E> //
struct UnifiedSharedMemoryT {
- using EmptyTrait = std::true_type;
-};
-
-template <typename T, typename I, typename E> //
-struct SelfMapsT {
- using EmptyTrait = std::true_type;
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
};
// V5.2: [5.10] `uniform` clause
@@ -1288,14 +1299,12 @@ using ExtensionClausesT =
template <typename T, typename I, typename E>
using EmptyClausesT = std::variant<
AcqRelT<T, I, E>, AcquireT<T, I, E>, CaptureT<T, I, E>, CompareT<T, I, E>,
- DynamicAllocatorsT<T, I, E>, FullT<T, I, E>, InbranchT<T, I, E>,
- MergeableT<T, I, E>, NogroupT<T, I, E>, NoOpenmpRoutinesT<T, I, E>,
+ FullT<T, I, E>, InbranchT<T, I, E>, MergeableT<T, I, E>, NogroupT<T, I, E>,
+ NoOpenmpConstructsT<T, I, E>, NoOpenmpRoutinesT<T, I, E>,
NoOpenmpT<T, I, E>, NoParallelismT<T, I, E>, NotinbranchT<T, I, E>,
NowaitT<T, I, E>, ReadT<T, I, E>, RelaxedT<T, I, E>, ReleaseT<T, I, E>,
- ReverseOffloadT<T, I, E>, SeqCstT<T, I, E>, SimdT<T, I, E>,
- ThreadsT<T, I, E>, UnifiedAddressT<T, I, E>, UnifiedSharedMemoryT<T, I, E>,
- UnknownT<T, I, E>, UntiedT<T, I, E>, UseT<T, I, E>, WeakT<T, I, E>,
- WriteT<T, I, E>, NoOpenmpConstructsT<T, I, E>, SelfMapsT<T, I, E>>;
+ SeqCstT<T, I, E>, SimdT<T, I, E>, ThreadsT<T, I, E>, UnknownT<T, I, E>,
+ UntiedT<T, I, E>, UseT<T, I, E>, WeakT<T, I, E>, WriteT<T, I, E>>;
template <typename T, typename I, typename E>
using IncompleteClausesT =
@@ -1323,18 +1332,20 @@ using WrapperClausesT = std::variant<
AtomicDefaultMemOrderT<T, I, E>, AtT<T, I, E>, BindT<T, I, E>,
CollapseT<T, I, E>, ContainsT<T, I, E>, CopyinT<T, I, E>,
CopyprivateT<T, I, E>, DefaultT<T, I, E>, DestroyT<T, I, E>,
- DetachT<T, I, E>, DeviceTypeT<T, I, E>, EnterT<T, I, E>,
- ExclusiveT<T, I, E>, FailT<T, I, E>, FilterT<T, I, E>, FinalT<T, I, E>,
- FirstprivateT<T, I, E>, HasDeviceAddrT<T, I, E>, HintT<T, I, E>,
- HoldsT<T, I, E>, InclusiveT<T, I, E>, IndirectT<T, I, E>,
+ DetachT<T, I, E>, DeviceTypeT<T, I, E>, DynamicAllocatorsT<T, I, E>,
+ EnterT<T, I, E>, ExclusiveT<T, I, E>, FailT<T, I, E>, FilterT<T, I, E>,
+ FinalT<T, I, E>, FirstprivateT<T, I, E>, HasDeviceAddrT<T, I, E>,
+ HintT<T, I, E>, HoldsT<T, I, E>, InclusiveT<T, I, E>, IndirectT<T, I, E>,
InitializerT<T, I, E>, IsDevicePtrT<T, I, E>, LinkT<T, I, E>,
MessageT<T, I, E>, NocontextT<T, I, E>, NontemporalT<T, I, E>,
NovariantsT<T, I, E>, NumTeamsT<T, I, E>, NumThreadsT<T, I, E>,
OrderedT<T, I, E>, PartialT<T, I, E>, PriorityT<T, I, E>, PrivateT<T, I, E>,
- ProcBindT<T, I, E>, SafelenT<T, I, E>, SeverityT<T, I, E>, SharedT<T, I, E>,
- SimdlenT<T, I, E>, SizesT<T, I, E>, PermutationT<T, I, E>,
- ThreadLimitT<T, I, E>, UniformT<T, I, E>, UpdateT<T, I, E>,
- UseDeviceAddrT<T, I, E>, UseDevicePtrT<T, I, E>, UsesAllocatorsT<T, I, E>>;
+ ProcBindT<T, I, E>, ReverseOffloadT<T, I, E>, SafelenT<T, I, E>,
+ SelfMapsT<T, I, E>, SeverityT<T, I, E>, SharedT<T, I, E>, SimdlenT<T, I, E>,
+ SizesT<T, I, E>, PermutationT<T, I, E>, ThreadLimitT<T, I, E>,
+ UnifiedAddressT<T, I, E>, UnifiedSharedMemoryT<T, I, E>, UniformT<T, I, E>,
+ UpdateT<T, I, E>, UseDeviceAddrT<T, I, E>, UseDevicePtrT<T, I, E>,
+ UsesAllocatorsT<T, I, E>>;
template <typename T, typename I, typename E>
using UnionOfAllClausesT = typename type::Union< //
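
The reshuffle above moves several requirement clauses out of EmptyClausesT
and into WrapperClausesT to match their new trait. A simplified sketch of
what the two trait shapes mean, assuming OPT(X) expands to an optional
holder:

    #include <optional>
    #include <type_traits>

    // An EmptyTrait clause carries no payload at all.
    struct EmptyClause { using EmptyTrait = std::true_type; };

    // A WrapperTrait clause wraps exactly one member named v, here an
    // optional requirement payload of the E template parameter.
    template <typename E> struct WrapperClause {
      using WrapperTrait = std::true_type;
      std::optional<E> v;
    };
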
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 86a9e24..edcf7a9 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -177,6 +177,8 @@ def OMPC_Doacross : Clause<[Spelling<"doacross">]> {
}
def OMPC_DynamicAllocators : Clause<[Spelling<"dynamic_allocators">]> {
let clangClass = "OMPDynamicAllocatorsClause";
+ let flangClass = "OmpDynamicAllocatorsClause";
+ let isValueOptional = true;
}
def OMPC_DynGroupprivate : Clause<[Spelling<"dyn_groupprivate">]> {
let flangClass = "OmpDynGroupprivateClause";
@@ -467,6 +469,8 @@ def OMPC_Replayable : Clause<[Spelling<"replayable">]> {
}
def OMPC_ReverseOffload : Clause<[Spelling<"reverse_offload">]> {
let clangClass = "OMPReverseOffloadClause";
+ let flangClass = "OmpReverseOffloadClause";
+ let isValueOptional = true;
}
def OMPC_SafeLen : Clause<[Spelling<"safelen">]> {
let clangClass = "OMPSafelenClause";
@@ -541,12 +545,18 @@ def OMPC_Transparent : Clause<[Spelling<"transparent">]> {
}
def OMPC_UnifiedAddress : Clause<[Spelling<"unified_address">]> {
let clangClass = "OMPUnifiedAddressClause";
+ let flangClass = "OmpUnifiedAddressClause";
+ let isValueOptional = true;
}
def OMPC_UnifiedSharedMemory : Clause<[Spelling<"unified_shared_memory">]> {
let clangClass = "OMPUnifiedSharedMemoryClause";
+ let flangClass = "OmpUnifiedSharedMemoryClause";
+ let isValueOptional = true;
}
def OMPC_SelfMaps : Clause<[Spelling<"self_maps">]> {
let clangClass = "OMPSelfMapsClause";
+ let flangClass = "OmpSelfMapsClause";
+ let isValueOptional = true;
}
def OMPC_Uniform : Clause<[Spelling<"uniform">]> {
let flangClass = "Name";
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 3a9a7f7..000472f 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -105,12 +105,6 @@ setupStatsFile(StringRef StatsFilename);
/// ordered indices to elements in the input array.
LLVM_ABI std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
-/// Updates MemProf attributes (and metadata) based on whether the index
-/// has recorded that we are linking with allocation libraries containing
-/// the necessary APIs for downstream transformations.
-LLVM_ABI void updateMemProfAttributes(Module &Mod,
- const ModuleSummaryIndex &Index);
-
class LTO;
struct SymbolResolution;
diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index faaff4a..4aa6c01 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -121,6 +121,7 @@ enum attributeBits {
"The Dynamic Duo! Prefer over all else because this changes " \
"most operands' meaning") \
ENUM_ENTRY(IC_64BIT_REX2, 2, "requires a REX2 prefix") \
+ ENUM_ENTRY(IC_64BIT_REX2_REXW, 3, "requires a REX2 and the W prefix") \
ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \
ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \
ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 254587b..a94eab1 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -108,6 +108,7 @@ X86_CPU_SUBTYPE(INTEL_COREI7_ARROWLAKE_S, "arrowlake-s")
X86_CPU_SUBTYPE(INTEL_COREI7_PANTHERLAKE, "pantherlake")
X86_CPU_SUBTYPE(AMDFAM1AH_ZNVER5, "znver5")
X86_CPU_SUBTYPE(INTEL_COREI7_DIAMONDRAPIDS, "diamondrapids")
+X86_CPU_SUBTYPE(INTEL_COREI7_NOVALAKE, "novalake")
// Alternate names supported by __builtin_cpu_is and target multiversioning.
X86_CPU_SUBTYPE_ALIAS(INTEL_COREI7_ALDERLAKE, "raptorlake")
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.h b/llvm/include/llvm/TargetParser/X86TargetParser.h
index e4c43cd..80f3d35 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.h
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.h
@@ -117,6 +117,7 @@ enum CPUKind {
CK_Lunarlake,
CK_Pantherlake,
CK_Wildcatlake,
+ CK_Novalake,
CK_Sierraforest,
CK_Grandridge,
CK_Graniterapids,
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index f2de083..576f1eb 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -95,6 +95,16 @@ public:
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing);
};
+
+/// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
+/// when we don't have an index that has recorded that we are linking with
+/// allocation libraries containing the necessary APIs for downstream
+/// transformations.
+class MemProfRemoveInfo : public PassInfoMixin<MemProfRemoveInfo> {
+public:
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index b78cc03e..f9bf092 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -281,6 +281,38 @@ static StructType *getOrCreateElementStruct(Type *ElemType, StringRef Name) {
return StructType::create(ElemType, Name);
}
+static Type *getTypeWithoutPadding(Type *Ty) {
+ // Recursively remove padding from structures.
+ if (auto *ST = dyn_cast<StructType>(Ty)) {
+ LLVMContext &Ctx = Ty->getContext();
+ SmallVector<Type *> ElementTypes;
+ ElementTypes.reserve(ST->getNumElements());
+ for (Type *ElTy : ST->elements()) {
+ if (isa<PaddingExtType>(ElTy))
+ continue;
+ ElementTypes.push_back(getTypeWithoutPadding(ElTy));
+ }
+
+ // Handle explicitly padded cbuffer arrays like { [ n x paddedty ], ty }
+ if (ElementTypes.size() == 2)
+ if (auto *AT = dyn_cast<ArrayType>(ElementTypes[0]))
+ if (ElementTypes[1] == AT->getElementType())
+ return ArrayType::get(ElementTypes[1], AT->getNumElements() + 1);
+
+ // If we only have a single element, don't wrap it in a struct.
+ if (ElementTypes.size() == 1)
+ return ElementTypes[0];
+
+ return StructType::get(Ctx, ElementTypes, /*IsPacked=*/false);
+ }
+ // Arrays just need to have their element type adjusted.
+ if (auto *AT = dyn_cast<ArrayType>(Ty))
+ return ArrayType::get(getTypeWithoutPadding(AT->getElementType()),
+ AT->getNumElements());
+ // Anything else should be good as is.
+ return Ty;
+}
+
StructType *ResourceTypeInfo::createElementStruct(StringRef CBufferName) {
SmallString<64> TypeName;
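
A worked illustration of what getTypeWithoutPadding produces, with the
expectations derived from the code above rather than from test output:

    //   { float, target("dx.Padding", 12), float }  ->  { float, float }
    //
    //   { [3 x { float, target("dx.Padding", 12) }], float }
    //     -> elements become { [3 x float], float }, which the explicitly
    //        padded cbuffer array case folds into [4 x float]
    //
    //   { i32 }  ->  i32   (a single survivor is not re-wrapped in a struct)
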
@@ -334,14 +366,21 @@ StructType *ResourceTypeInfo::createElementStruct(StringRef CBufferName) {
}
case ResourceKind::CBuffer: {
auto *RTy = cast<CBufferExtType>(HandleTy);
- LayoutExtType *LayoutType = cast<LayoutExtType>(RTy->getResourceType());
- StructType *Ty = cast<StructType>(LayoutType->getWrappedType());
SmallString<64> Name = getResourceKindName(Kind);
if (!CBufferName.empty()) {
Name.append(".");
Name.append(CBufferName);
}
- return StructType::create(Ty->elements(), Name);
+
+ // TODO: Remove this when we update the frontend to use explicit padding.
+ if (LayoutExtType *LayoutType =
+ dyn_cast<LayoutExtType>(RTy->getResourceType())) {
+ StructType *Ty = cast<StructType>(LayoutType->getWrappedType());
+ return StructType::create(Ty->elements(), Name);
+ }
+
+ return getOrCreateElementStruct(
+ getTypeWithoutPadding(RTy->getResourceType()), Name);
}
case ResourceKind::Sampler: {
auto *RTy = cast<SamplerExtType>(HandleTy);
@@ -454,10 +493,10 @@ uint32_t ResourceTypeInfo::getCBufferSize(const DataLayout &DL) const {
Type *ElTy = cast<CBufferExtType>(HandleTy)->getResourceType();
+ // TODO: Remove this when we update the frontend to use explicit padding.
if (auto *LayoutTy = dyn_cast<LayoutExtType>(ElTy))
return LayoutTy->getSize();
- // TODO: What should we do with unannotated arrays?
return DL.getTypeAllocSize(ElTy);
}
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index a8c3173..d84721b 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -986,8 +986,8 @@ PreservedAnalyses LoopPrinterPass::run(Function &F,
return PreservedAnalyses::all();
}
-void llvm::printLoop(Loop &L, raw_ostream &OS, const std::string &Banner) {
-
+void llvm::printLoop(const Loop &L, raw_ostream &OS,
+ const std::string &Banner) {
if (forcePrintModuleIR()) {
// handling -print-module-scope
OS << Banner << " (loop: ";
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index a64b93d..442b9d1 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -4623,17 +4623,11 @@ const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V,
/// If Expr computes ~A, return A else return nullptr
static const SCEV *MatchNotExpr(const SCEV *Expr) {
- const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr);
- if (!Add || Add->getNumOperands() != 2 ||
- !Add->getOperand(0)->isAllOnesValue())
- return nullptr;
-
- const SCEVMulExpr *AddRHS = dyn_cast<SCEVMulExpr>(Add->getOperand(1));
- if (!AddRHS || AddRHS->getNumOperands() != 2 ||
- !AddRHS->getOperand(0)->isAllOnesValue())
- return nullptr;
-
- return AddRHS->getOperand(1);
+ const SCEV *MulOp;
+ if (match(Expr, m_scev_Add(m_scev_AllOnes(),
+ m_scev_Mul(m_scev_AllOnes(), m_SCEV(MulOp)))))
+ return MulOp;
+ return nullptr;
}
/// Return a SCEV corresponding to ~V = -1-V
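
For clarity, the matcher encodes the two's-complement identity the old code
checked operand by operand:

    // ~A == -1 - A == (-1) + ((-1) * A)
    // so add(allOnes, mul(allOnes, A)) recovers A; SCEV's canonical operand
    // order places the constants first, which the pattern relies on just as
    // the old getOperand(0)->isAllOnesValue() checks did.
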
@@ -12220,12 +12214,11 @@ ScalarEvolution::computeConstantDifference(const SCEV *More, const SCEV *Less) {
// Try to match a common constant multiply.
auto MatchConstMul =
[](const SCEV *S) -> std::optional<std::pair<const SCEV *, APInt>> {
- auto *M = dyn_cast<SCEVMulExpr>(S);
- if (!M || M->getNumOperands() != 2 ||
- !isa<SCEVConstant>(M->getOperand(0)))
- return std::nullopt;
- return {
- {M->getOperand(1), cast<SCEVConstant>(M->getOperand(0))->getAPInt()}};
+ const APInt *C;
+ const SCEV *Op;
+ if (match(S, m_scev_Mul(m_scev_APInt(C), m_SCEV(Op))))
+ return {{Op, *C}};
+ return std::nullopt;
};
if (auto MatchedMore = MatchConstMul(More)) {
if (auto MatchedLess = MatchConstMul(Less)) {
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 9db48e8..0e9535d 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -1034,6 +1034,10 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) {
}
// DirectX resources
+ if (Name == "dx.Padding")
+ return TargetTypeInfo(
+ ArrayType::get(Type::getInt8Ty(C), Ty->getIntParameter(0)),
+ TargetExtType::CanBeGlobal);
if (Name.starts_with("dx."))
return TargetTypeInfo(PointerType::get(C, 0), TargetExtType::CanBeGlobal,
TargetExtType::CanBeLocal,
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index e6544f3..aec8891 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1257,38 +1257,6 @@ Error LTO::run(AddStreamFn AddStream, FileCache Cache) {
return Result;
}
-void lto::updateMemProfAttributes(Module &Mod,
- const ModuleSummaryIndex &Index) {
- llvm::TimeTraceScope timeScope("LTO update memprof attributes");
- if (Index.withSupportsHotColdNew())
- return;
-
- // The profile matcher applies hotness attributes directly for allocations,
- // and those will cause us to generate calls to the hot/cold interfaces
- // unconditionally. If supports-hot-cold-new was not enabled in the LTO
- // link then assume we don't want these calls (e.g. not linking with
- // the appropriate library, or otherwise trying to disable this behavior).
- for (auto &F : Mod) {
- for (auto &BB : F) {
- for (auto &I : BB) {
- auto *CI = dyn_cast<CallBase>(&I);
- if (!CI)
- continue;
- if (CI->hasFnAttr("memprof"))
- CI->removeFnAttr("memprof");
- // Strip off all memprof metadata as it is no longer needed.
- // Importantly, this avoids the addition of new memprof attributes
- // after inlining propagation.
- // TODO: If we support additional types of MemProf metadata beyond hot
- // and cold, we will need to update the metadata based on the allocator
- // APIs supported instead of completely stripping all.
- CI->setMetadata(LLVMContext::MD_memprof, nullptr);
- CI->setMetadata(LLVMContext::MD_callsite, nullptr);
- }
- }
- }
-}
-
Error LTO::runRegularLTO(AddStreamFn AddStream) {
llvm::TimeTraceScope timeScope("Run regular LTO");
LLVMContext &CombinedCtx = RegularLTO.CombinedModule->getContext();
@@ -1346,8 +1314,6 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
}
}
- updateMemProfAttributes(*RegularLTO.CombinedModule, ThinLTO.CombinedIndex);
-
bool WholeProgramVisibilityEnabledInLTO =
Conf.HasWholeProgramVisibility &&
// If validation is enabled, upgrade visibility only when all vtables
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 11a7b32..280c3d1 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -726,7 +726,6 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
}
// Do this after any importing so that imported code is updated.
- updateMemProfAttributes(Mod, CombinedIndex);
updatePublicTypeTestCalls(Mod, CombinedIndex.withWholeProgramVisibility());
if (Conf.PostImportModuleHook && !Conf.PostImportModuleHook(Task, Mod))
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 421d6603..c3a27c9 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -488,6 +488,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_HEXAGON_MACH_V5, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V55, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V60, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V61, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V62, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V65, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V66, EF_HEXAGON_MACH);
@@ -499,12 +500,21 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_HEXAGON_MACH_V71T, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V73, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V75, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V77, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V79, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V81, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V83, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V85, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V87, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V89, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V91, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_ISA_V2, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V3, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V4, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V5, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V55, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V60, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V61, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V62, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V65, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V66, EF_HEXAGON_ISA);
@@ -514,6 +524,14 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_HEXAGON_ISA_V71, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V73, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V75, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V77, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V79, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V81, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V83, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V85, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V87, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V89, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V91, EF_HEXAGON_ISA);
break;
case ELF::EM_AVR:
BCaseMask(EF_AVR_ARCH_AVR1, EF_AVR_ARCH_MASK);
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index fea0d25..3f3939eaf 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1658,6 +1658,16 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
ModulePassManager MPM;
+  // Currently this pipeline is only invoked in an LTO pre-link phase or when
+  // we are not running LTO. If that changes, the below checks may need
+  // updating.
+ assert(isLTOPreLink(Phase) || Phase == ThinOrFullLTOPhase::None);
+
+ // If we are invoking this in non-LTO mode, remove any MemProf related
+ // attributes and metadata, as we don't know whether we are linking with
+ // a library containing the necessary interfaces.
+ if (Phase == ThinOrFullLTOPhase::None)
+ MPM.addPass(MemProfRemoveInfo());
+
// Convert @llvm.global.annotations to !annotation metadata.
MPM.addPass(Annotation2MetadataPass());
@@ -1803,6 +1813,12 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
ModulePassManager MPM;
+ // If we are invoking this without a summary index noting that we are linking
+ // with a library containing the necessary APIs, remove any MemProf related
+ // attributes and metadata.
+ if (!ImportSummary || !ImportSummary->withSupportsHotColdNew())
+ MPM.addPass(MemProfRemoveInfo());
+
if (ImportSummary) {
// For ThinLTO we must apply the context disambiguation decisions early, to
// ensure we can correctly match the callsites to summary data.
@@ -1874,6 +1890,12 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level);
+ // If we are invoking this without a summary index noting that we are linking
+ // with a library containing the necessary APIs, remove any MemProf related
+ // attributes and metadata.
+ if (!ExportSummary || !ExportSummary->withSupportsHotColdNew())
+ MPM.addPass(MemProfRemoveInfo());
+
// Create a function that performs CFI checks for cross-DSO calls with targets
// in the current module.
MPM.addPass(CrossDSOCFIPass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 1b16525..884d8da 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -113,6 +113,7 @@ MODULE_PASS("pgo-force-function-attrs",
? PGOOpt->ColdOptType
: PGOOptions::ColdFuncOpt::Default))
MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation())
+MODULE_PASS("memprof-remove-attributes", MemProfRemoveInfo())
MODULE_PASS("memprof-module", ModuleMemProfilerPass())
MODULE_PASS("mergefunc", MergeFunctionsPass())
MODULE_PASS("metarenamer", MetaRenamerPass())
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index dad0fa3..648d6a5 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -354,8 +354,8 @@ namespace llvm {
/// Emulates hitting "retry" from an "abort, retry, ignore" CRT debug report
/// dialog. "retry" raises an exception which ultimately triggers our stack
/// dumper.
-static LLVM_ATTRIBUTE_UNUSED int
-AvoidMessageBoxHook(int ReportType, char *Message, int *Return) {
+[[maybe_unused]] static int AvoidMessageBoxHook(int ReportType, char *Message,
+ int *Return) {
// Set *Return to the retry code for the return value of _CrtDbgReport:
// http://msdn.microsoft.com/en-us/library/8hyw4sy7(v=vs.71).aspx
// This may also trigger just-in-time debugging via DebugBreak().
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 91c1f59..662d84b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18176,8 +18176,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
- if (Factor != 2 && Factor != 4) {
- LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
+ LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
return false;
}
auto *LI = dyn_cast<LoadInst>(Load);
@@ -18255,8 +18255,8 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleavedValues) const {
unsigned Factor = InterleavedValues.size();
- if (Factor != 2 && Factor != 4) {
- LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
+ LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
return false;
}
StoreInst *SI = dyn_cast<StoreInst>(Store);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 30dfcf2b..12c600f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -10600,6 +10600,9 @@ describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
Register DestReg = DestSrc->Destination->getReg();
Register SrcReg = DestSrc->Source->getReg();
+ if (!DestReg.isValid() || !SrcReg.isValid())
+ return std::nullopt;
+
auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
// If the described register is the destination, just return the source.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index bfe2c80..a67b12a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -901,6 +901,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
+ addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
+
bool hasSALUFloat = ST->hasSALUFloatInsts();
addRulesForGOpcs({G_FADD}, Standard)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 58482ea..9fbf9e5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,12 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+static cl::opt<unsigned> PendingQueueLimit(
+ "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
+ cl::desc(
+ "Max (Available+Pending) size to inspect pending queue (0 disables)"),
+ cl::init(256));
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
#define DUMP_MAX_REG_PRESSURE
static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
@@ -335,17 +341,52 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
}
}
+static bool shouldCheckPending(SchedBoundary &Zone,
+ const TargetSchedModel *SchedModel) {
+ bool HasBufferedModel =
+ SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
+ unsigned Combined = Zone.Available.size() + Zone.Pending.size();
+ return Combined <= PendingQueueLimit && HasBufferedModel;
+}
+
+static SUnit *pickOnlyChoice(SchedBoundary &Zone,
+ const TargetSchedModel *SchedModel) {
+ // pickOnlyChoice() releases pending instructions and checks for new hazards.
+ SUnit *OnlyChoice = Zone.pickOnlyChoice();
+ if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
+ return OnlyChoice;
+
+ return nullptr;
+}
+
+void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
+ const SchedCandidate &Preferred) {
+ LLVM_DEBUG({
+ dbgs() << "Prefer:\t\t";
+ DAG->dumpNode(*Preferred.SU);
+
+ if (Current.SU) {
+ dbgs() << "Not:\t";
+ DAG->dumpNode(*Current.SU);
+ }
+
+ dbgs() << "Reason:\t\t";
+ traceCandidate(Preferred);
+ });
+}
+
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand,
+ SchedCandidate &Cand, bool &IsPending,
bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
+ IsPending = false;
if (DAG->isTrackingPressure()) {
if (!GCNTrackers) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
@@ -358,8 +399,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
VGPRPressure = T->getPressure().getArchVGPRNum();
}
}
- ReadyQueue &Q = Zone.Available;
- for (SUnit *SU : Q) {
+ LLVM_DEBUG(dbgs() << "Available Q:\n");
+ ReadyQueue &AQ = Zone.Available;
+ for (SUnit *SU : AQ) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
@@ -371,27 +413,55 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
// Initialize resource delta if needed in case future heuristics query it.
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
Cand.setBest(TryCand);
- LLVM_DEBUG(traceCandidate(Cand));
+ } else {
+ printCandidateDecision(TryCand, Cand);
+ }
+ }
+
+ if (!shouldCheckPending(Zone, SchedModel))
+ return;
+
+ LLVM_DEBUG(dbgs() << "Pending Q:\n");
+ ReadyQueue &PQ = Zone.Pending;
+ for (SUnit *SU : PQ) {
+
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+ VGPRPressure, IsBottomUp);
+ // Pass SchedBoundary only when comparing nodes from the same boundary.
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ tryPendingCandidate(Cand, TryCand, ZoneArg);
+ if (TryCand.Reason != NoCand) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+ IsPending = true;
+ Cand.setBest(TryCand);
+ } else {
+ printCandidateDecision(TryCand, Cand);
}
}
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
-SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
+ bool &PickedPending) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
- if (SUnit *SU = Bot.pickOnlyChoice()) {
+ if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
IsTopNode = false;
return SU;
}
- if (SUnit *SU = Top.pickOnlyChoice()) {
+ if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
IsTopNode = true;
return SU;
}
- // Set the bottom-up policy based on the state of the current bottom zone and
- // the instructions outside the zone, including the top zone.
+ // Set the bottom-up policy based on the state of the current bottom zone
+ // and the instructions outside the zone, including the top zone.
CandPolicy BotPolicy;
setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
// Set the top-down policy based on the state of the current top zone and
@@ -399,12 +469,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
CandPolicy TopPolicy;
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+ bool BotPending = false;
// See if BotCand is still valid (because we previously scheduled from Top).
LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
+ BotPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
@@ -414,6 +486,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
+ BotPending,
/*IsBottomUp=*/true);
assert(TCand.SU == BotCand.SU &&
"Last pick result should correspond to re-picking right now");
@@ -421,12 +494,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
#endif
}
+ bool TopPending = false;
// Check if the top Q has a better candidate.
LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
+ TopPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
@@ -436,6 +511,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
+ TopPending,
/*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
@@ -446,12 +522,21 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
- SchedCandidate Cand = BotCand;
- TopCand.Reason = NoCand;
- tryCandidate(Cand, TopCand, nullptr);
- if (TopCand.Reason != NoCand) {
- Cand.setBest(TopCand);
+ SchedCandidate Cand = BotPending ? TopCand : BotCand;
+ SchedCandidate TryCand = BotPending ? BotCand : TopCand;
+ PickedPending = BotPending && TopPending;
+
+ TryCand.Reason = NoCand;
+ if (BotPending || TopPending) {
+    PickedPending |= tryPendingCandidate(Cand, TryCand, nullptr);
+ } else {
+ tryCandidate(Cand, TryCand, nullptr);
+ }
+
+ if (TryCand.Reason != NoCand) {
+ Cand.setBest(TryCand);
}
+
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
@@ -466,35 +551,55 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
+ bool PickedPending;
SUnit *SU;
do {
+ PickedPending = false;
if (RegionPolicy.OnlyTopDown) {
- SU = Top.pickOnlyChoice();
+ SU = pickOnlyChoice(Top, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+ PickedPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
IsTopNode = true;
} else if (RegionPolicy.OnlyBottomUp) {
- SU = Bot.pickOnlyChoice();
+ SU = pickOnlyChoice(Bot, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
+ PickedPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find a candidate");
SU = BotCand.SU;
}
IsTopNode = false;
} else {
- SU = pickNodeBidirectional(IsTopNode);
+ SU = pickNodeBidirectional(IsTopNode, PickedPending);
}
} while (SU->isScheduled);
+ if (PickedPending) {
+ unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
+ SchedBoundary &Zone = IsTopNode ? Top : Bot;
+ unsigned CurrentCycle = Zone.getCurrCycle();
+ if (ReadyCycle > CurrentCycle)
+ Zone.bumpCycle(ReadyCycle);
+
+ // FIXME: checkHazard() doesn't give information about which cycle the
+ // hazard will resolve so just keep bumping the cycle by 1. This could be
+ // made more efficient if checkHazard() returned more details.
+ while (Zone.checkHazard(SU))
+ Zone.bumpCycle(Zone.getCurrCycle() + 1);
+
+ Zone.releasePending();
+ }
+
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
@@ -540,6 +645,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
return *std::next(CurrentStage);
}
+bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+  // Bias PhysReg Defs and copies to their uses and definitions, respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+ }
+
+ return false;
+}
+
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 8ea4267..975781f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -44,17 +44,32 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
/// heuristics to determine excess/critical pressure sets.
class GCNSchedStrategy : public GenericScheduler {
protected:
- SUnit *pickNodeBidirectional(bool &IsTopNode);
+ SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending);
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand, bool IsBottomUp);
+ SchedCandidate &Cand, bool &IsPending,
+ bool IsBottomUp);
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI, unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp);
+ /// Evaluates instructions in the pending queue using a subset of scheduling
+ /// heuristics.
+ ///
+ /// Instructions that cannot be issued due to hardware constraints are placed
+ /// in the pending queue rather than the available queue, making them normally
+ /// invisible to scheduling heuristics. However, in certain scenarios (such as
+ /// avoiding register spilling), it may be beneficial to consider scheduling
+ /// these not-yet-ready instructions.
+ bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;
+
+ void printCandidateDecision(const SchedCandidate &Current,
+ const SchedCandidate &Preferred);
+
std::vector<unsigned> Pressure;
std::vector<unsigned> MaxPressure;
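
Taken together with the GCNSchedStrategy.cpp changes, the pending-queue flow
is roughly the following (condensed sketch, not literal code):

    // 1. pickOnlyChoice(Zone, SchedModel) only short-circuits when pending
    //    inspection is off (amdgpu-scheduler-pending-queue-limit) or the
    //    pending queue is empty.
    // 2. pickNodeFromQueue() scans Available with tryCandidate(), then
    //    Pending with the more conservative tryPendingCandidate(), and
    //    reports via IsPending which queue the winner came from.
    // 3. When a pending SU wins, pickNode() bumps the cycle to the SU's
    //    ready cycle, bumps past any remaining hazards, and calls
    //    releasePending() so newly ready instructions become available.
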
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e979eeb..df27ec1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -879,6 +879,11 @@ public:
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
}
+ bool isMFMA(uint16_t Opcode) const {
+ return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ }
+
static bool isDOT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
}
@@ -895,6 +900,10 @@ public:
return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
}
+ bool isMFMAorWMMA(uint16_t Opcode) const {
+ return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode);
+ }
+
static bool isSWMMAC(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
}
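
The opcode-based overloads mirror the existing MachineInstr ones so callers
can query matrix-op status before an instruction exists. A hedged usage
sketch (the caller context is hypothetical):

    // e.g. while costing a candidate opcode prior to creating the MI:
    if (TII->isMFMAorWMMA(Opcode))
      ++NumMatrixOps; // treated as a matrix op without building a MachineInstr
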
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a01a5fd..5e3195b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1697,9 +1697,6 @@ LLVM_READNONE
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi);
LLVM_READNONE
-bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi);
-
-LLVM_READNONE
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi);
LLVM_READNONE
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index 1ce8d7e3..df0c8c1 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -264,9 +264,10 @@ public:
} // end anonymous namespace
-static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 7f1ff45..2fd7327 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -3176,9 +3176,10 @@ static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
F64Regs);
}
-static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
#include "MipsGenCallingConv.inc"
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 272c21f..2f1a7ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -749,7 +749,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTruncStoreAction(VT, MVT::i1, Expand);
}
- // Disable generations of extload/truncstore for v2i16/v2i8. The generic
+  // Disable generation of extload/truncstore for v2i32/v2i16/v2i8. The generic
// expansion for these nodes when they are unaligned is incorrect if the
// type is a vector.
//
@@ -757,7 +757,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// TargetLowering::expandUnalignedLoad/Store.
setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
MVT::v2i8, Expand);
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32,
+ {MVT::v2i8, MVT::v2i16}, Expand);
setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
// Register custom handling for illegal type loads/stores. We'll try to custom
// lower almost all illegal types and logic in the lowering will discard cases
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7123a2d..eb87558 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1672,6 +1672,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.useRVVForFixedLengthVectors())
setTargetDAGCombine(ISD::BITCAST);
+ setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
+
// Disable strict node mutation.
IsStrictFPEnabled = true;
EnableExtLdPromotion = true;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index eedfdb3..ed54404d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1763,6 +1763,26 @@ defm RELAXED_DOT :
"i16x8.relaxed_dot_i8x16_i7x16_s\t$dst, $lhs, $rhs",
"i16x8.relaxed_dot_i8x16_i7x16_s", 0x112>;
+def : Pat<
+ (v8i16 (add
+ (wasm_shuffle
+ (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)),
+ (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)),
+ (i32 0), (i32 1), (i32 4), (i32 5),
+ (i32 8), (i32 9), (i32 12), (i32 13),
+ (i32 16), (i32 17), (i32 20), (i32 21),
+ (i32 24), (i32 25), (i32 28), (i32 29)),
+ (wasm_shuffle
+ (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)),
+ (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)),
+ (i32 2), (i32 3), (i32 6), (i32 7),
+ (i32 10), (i32 11), (i32 14), (i32 15),
+ (i32 18), (i32 19), (i32 22), (i32 23),
+ (i32 26), (i32 27), (i32 30), (i32 31)))
+ ),
+ (v8i16 (RELAXED_DOT v16i8:$lhs, v16i8:$rhs))
+>;
+
defm RELAXED_DOT_ADD :
RELAXED_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs, V128:$acc),
(outs), (ins),
@@ -1771,6 +1791,14 @@ defm RELAXED_DOT_ADD :
"i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc",
"i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>;
+def : Pat<
+ (v4i32 (add
+ (v4i32 (int_wasm_extadd_pairwise_signed
+ (v8i16 (int_wasm_relaxed_dot_i8x16_i7x16_signed v16i8:$lhs, v16i8:$rhs)))),
+ (v4i32 V128:$acc))),
+ (v4i32 (RELAXED_DOT_ADD v16i8:$lhs, v16i8:$rhs, (v4i32 V128:$acc)))
+ >;
+
def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs),
(v16i8 V128:$rhs))),
(RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>;
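
The two anonymous patterns fold common open-coded dot products into the
relaxed instructions. Element-wise, the first one recognizes (sketch):

    // lo = extmul_low_s(lhs, rhs);  hi = extmul_high_s(lhs, rhs)  (8 x i16 each)
    // The two shuffles split the 16 products into even and odd lanes, so the
    // add computes, for each output lane k:
    //   lhs[2k]*rhs[2k] + lhs[2k+1]*rhs[2k+1]
    // which is exactly i16x8.relaxed_dot_i8x16_i7x16_s. The second pattern
    // likewise folds extadd_pairwise_signed(relaxed_dot) plus an accumulator
    // into i32x4.relaxed_dot_i8x16_i7x16_add_s.
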
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index b81641f..28fa2cd 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -414,8 +414,6 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
getActionDefinitionsBuilder(G_SEXT_INREG).lower();
- getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
-
// fp constants
getActionDefinitionsBuilder(G_FCONSTANT)
.legalFor({s32, s64})
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 6db780f..8e08d16 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1338,6 +1338,10 @@ def ProcessorFeatures {
list<SubtargetFeature> PTLFeatures =
!listremove(ARLSFeatures, [FeatureWIDEKL]);
+ // Novalake
+ list<SubtargetFeature> NVLFeatures =
+ !listconcat(PTLFeatures, [FeaturePREFETCHI]);
+
// Clearwaterforest
list<SubtargetFeature> CWFAdditionalFeatures = [FeaturePREFETCHI,
FeatureAVXVNNIINT16,
@@ -1883,6 +1887,9 @@ foreach P = ["pantherlake", "wildcatlake"] in {
def : ProcModel<P, AlderlakePModel,
ProcessorFeatures.PTLFeatures, ProcessorFeatures.ADLTuning>;
}
+def : ProcModel<"novalake", AlderlakePModel, ProcessorFeatures.NVLFeatures,
+ ProcessorFeatures.ADLTuning>;
+
def : ProcModel<"clearwaterforest", AlderlakePModel,
ProcessorFeatures.CWFFeatures, ProcessorFeatures.ADLTuning>;
def : ProcModel<"emeraldrapids", SapphireRapidsModel,
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 3479106..6065575 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1152,6 +1152,20 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family,
break;
}
break;
+ case 0x12:
+ switch (Model) {
+ // Novalake:
+ case 0x1:
+ case 0x3:
+ CPU = "novalake";
+ *Type = X86::INTEL_COREI7;
+ *Subtype = X86::INTEL_COREI7_NOVALAKE;
+ break;
+ default: // Unknown family 0x12 CPU.
+ break;
+ }
+ break;
+
default:
break; // Unknown.
}
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 9268df2..31126cc 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -887,7 +887,7 @@ void RISCVISAInfo::updateImplication() {
}
static constexpr StringLiteral CombineIntoExts[] = {
- {"b"}, {"zk"}, {"zkn"}, {"zks"}, {"zvkn"},
+ {"a"}, {"b"}, {"zk"}, {"zkn"}, {"zks"}, {"zvkn"},
{"zvknc"}, {"zvkng"}, {"zvks"}, {"zvksc"}, {"zvksg"},
};
diff --git a/llvm/lib/TargetParser/Unix/Host.inc b/llvm/lib/TargetParser/Unix/Host.inc
index aeb2f59..38b942d 100644
--- a/llvm/lib/TargetParser/Unix/Host.inc
+++ b/llvm/lib/TargetParser/Unix/Host.inc
@@ -59,10 +59,30 @@ static std::string updateTripleOSVersion(std::string TargetTripleString) {
if (TT.getOS() == Triple::AIX && !TT.getOSMajorVersion()) {
struct utsname name;
if (uname(&name) != -1) {
+ std::string release = name.release;
+
+ if (strcmp(name.sysname, "OS400") == 0) {
+ /*
+ PASE uses a different versioning system than AIX.
+ The following table shows the currently supported PASE
+ releases and the corresponding AIX release:
+ --------------------------
+ PASE | AIX
+ --------------------------
+ V7R4 | 7.2 (TL2)
+ --------------------------
+ V7R5 | 7.2 (TL5)
+ --------------------------
+ V7R6 | 7.3 (TL1)
+ --------------------------
+ */
+ release = (release == "4" || release == "5") ? "2" : "3";
+ }
+
std::string NewOSName = std::string(Triple::getOSTypeName(Triple::AIX));
NewOSName += name.version;
NewOSName += '.';
- NewOSName += name.release;
+ NewOSName += release;
NewOSName += ".0.0";
TT.setOSName(NewOSName);
return TT.str();
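Concretely, a PASE V7R5 system reports uname version "7" and release "5", and the mapping above produces the triple suffix "aix7.2.0.0" (the matching AIX level) rather than "aix7.5.0.0". The same table as a standalone sketch, with a hypothetical helper name:

    #include <string>

    // Map a PASE uname() release digit to the corresponding AIX minor
    // release, per the V7R4/V7R5 -> AIX 7.2 and V7R6 -> AIX 7.3 table above.
    std::string paseToAIXRelease(const std::string &Release) {
      return (Release == "4" || Release == "5") ? "2" : "3";
    }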
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index e382cfe..dd13ce3 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -176,6 +176,8 @@ constexpr FeatureBitset FeaturesArrowlakeS =
FeatureSM4;
constexpr FeatureBitset FeaturesPantherlake =
(FeaturesArrowlakeS ^ FeatureWIDEKL);
+constexpr FeatureBitset FeaturesNovalake =
+ FeaturesPantherlake | FeaturePREFETCHI;
constexpr FeatureBitset FeaturesClearwaterforest =
(FeaturesSierraforest ^ FeatureWIDEKL) | FeatureAVXVNNIINT16 |
FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR;
@@ -379,6 +381,8 @@ constexpr ProcInfo Processors[] = {
// Pantherlake microarchitecture based processors.
{ {"pantherlake"}, CK_Lunarlake, FEATURE_AVX2, FeaturesPantherlake, 'p', false },
{ {"wildcatlake"}, CK_Lunarlake, FEATURE_AVX2, FeaturesPantherlake, 'p', false },
+ // Novalake microarchitecture based processors.
+ { {"novalake"}, CK_Novalake, FEATURE_AVX2, FeaturesNovalake, 'r', false },
// Sierraforest microarchitecture based processors.
{ {"sierraforest"}, CK_Sierraforest, FEATURE_AVX2, FeaturesSierraforest, 'p', false },
// Grandridge microarchitecture based processors.
diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
index f166fef..cf7e450 100644
--- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
@@ -153,26 +153,23 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine();
bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe);
if (IsCallerPresplitCoroutine && HasAttr) {
- BranchProbability MinBranchProbability(
- static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution),
- MinBlockCounterExecution);
-
auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
- auto Prob = BranchProbability::getBranchProbability(
- BFI.getBlockFreq(CB->getParent()).getFrequency(),
- BFI.getEntryFreq().getFrequency());
+ auto BlockFreq = BFI.getBlockFreq(CB->getParent()).getFrequency();
+ auto EntryFreq = BFI.getEntryFreq().getFrequency();
+ uint64_t MinFreq =
+ static_cast<uint64_t>(EntryFreq * CoroElideBranchRatio);
- if (Prob < MinBranchProbability) {
+ if (BlockFreq < MinFreq) {
ORE.emit([&]() {
return OptimizationRemarkMissed(
DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller)
<< "'" << ore::NV("callee", Callee->getName())
<< "' not elided in '"
<< ore::NV("caller", Caller->getName())
- << "' because of low probability: "
- << ore::NV("probability", Prob) << " (threshold: "
- << ore::NV("threshold", MinBranchProbability) << ")";
+ << "' because of low frequency: "
+ << ore::NV("block_freq", BlockFreq)
+ << " (threshold: " << ore::NV("min_freq", MinFreq) << ")";
});
continue;
}
@@ -188,7 +185,8 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller)
<< "'" << ore::NV("callee", Callee->getName())
<< "' elided in '" << ore::NV("caller", Caller->getName())
- << "' (probability: " << ore::NV("probability", Prob) << ")";
+ << "' (block_freq: " << ore::NV("block_freq", BlockFreq)
+ << ")";
});
FAM.invalidate(*Caller, PreservedAnalyses::none());
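The rewrite above keeps the same threshold test but drops the BranchProbability detour: instead of scaling CoroElideBranchRatio into a fixed-denominator probability and comparing ratios, it compares the raw block frequency against EntryFreq * Ratio. A minimal sketch of the equivalent arithmetic (function name illustrative):

    #include <cstdint>

    // BlockFreq / EntryFreq < Ratio, rewritten without a division as
    // BlockFreq < EntryFreq * Ratio (truncated to an integer threshold),
    // mirroring the new comparison in CoroAnnotationElidePass::run.
    bool isUnlikelyCall(uint64_t BlockFreq, uint64_t EntryFreq, double Ratio) {
      uint64_t MinFreq = static_cast<uint64_t>(EntryFreq * Ratio);
      return BlockFreq < MinFreq;
    }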
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 5066a99..894d83f 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -6150,3 +6150,42 @@ void MemProfContextDisambiguation::run(
IndexCallsiteContextGraph CCG(Index, isPrevailing);
CCG.process();
}
+
+// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
+// when we don't have an index that has recorded that we are linking with
+// allocation libraries containing the necessary APIs for downstream
+// transformations.
+PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) {
+ // The profile matcher applies hotness attributes directly for allocations,
+ // and those will cause us to generate calls to the hot/cold interfaces
+ // unconditionally. If supports-hot-cold-new was not enabled in the LTO
+ // link then assume we don't want these calls (e.g. not linking with
+ // the appropriate library, or otherwise trying to disable this behavior).
+ bool Changed = false;
+ for (auto &F : M) {
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *CI = dyn_cast<CallBase>(&I);
+ if (!CI)
+ continue;
+ if (CI->hasFnAttr("memprof")) {
+ CI->removeFnAttr("memprof");
+ Changed = true;
+ }
+ if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
+ assert(!CI->hasMetadata(LLVMContext::MD_memprof));
+ continue;
+ }
+ // Strip off all memprof metadata as it is no longer needed.
+ // Importantly, this avoids the addition of new memprof attributes
+ // after inlining propagation.
+ CI->setMetadata(LLVMContext::MD_memprof, nullptr);
+ CI->setMetadata(LLVMContext::MD_callsite, nullptr);
+ Changed = true;
+ }
+ }
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
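A minimal usage sketch for the new pass, assuming a default constructor, that the declaration lives alongside MemProfContextDisambiguation, and the usual new-pass-manager plumbing; the SupportsHotColdNew flag here is illustrative:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"

    // Schedule the stripping pass when the LTO link did not record
    // supports-hot-cold-new, so no hot/cold operator new calls get emitted
    // from leftover memprof attributes. (Header and namespace assumed.)
    void addMemProfCleanup(llvm::ModulePassManager &MPM,
                           bool SupportsHotColdNew) {
      if (!SupportsHotColdNew)
        MPM.addPass(llvm::MemProfRemoveInfo());
    }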
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 511bca4..6e17801 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -605,17 +605,16 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
return Mapping;
}
-namespace llvm {
-void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize,
- bool IsKasan, uint64_t *ShadowBase,
- int *MappingScale, bool *OrShadowOffset) {
+void llvm::getAddressSanitizerParams(const Triple &TargetTriple, int LongSize,
+ bool IsKasan, uint64_t *ShadowBase,
+ int *MappingScale, bool *OrShadowOffset) {
auto Mapping = getShadowMapping(TargetTriple, LongSize, IsKasan);
*ShadowBase = Mapping.Offset;
*MappingScale = Mapping.Scale;
*OrShadowOffset = Mapping.OrShadowOffset;
}
-void removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) {
+void llvm::removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) {
// Sanitizer checks read from shadow, which invalidates memory(argmem: *).
//
// This is not only true for sanitized functions, because AttrInfer can
@@ -668,8 +667,6 @@ ASanAccessInfo::ASanAccessInfo(bool IsWrite, bool CompileKernel,
AccessSizeIndex(AccessSizeIndex), IsWrite(IsWrite),
CompileKernel(CompileKernel) {}
-} // namespace llvm
-
static uint64_t getRedzoneSizeForScale(int MappingScale) {
// Redzone used for stack and globals is at least 32 bytes.
// For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
@@ -677,11 +674,10 @@ static uint64_t getRedzoneSizeForScale(int MappingScale) {
}
static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) {
- if (TargetTriple.isOSEmscripten()) {
+ if (TargetTriple.isOSEmscripten())
return kAsanEmscriptenCtorAndDtorPriority;
- } else {
+ else
return kAsanCtorAndDtorPriority;
- }
}
static Twine genName(StringRef suffix) {
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 444b390..72e8e50 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -2092,8 +2092,6 @@ bool CHR::run() {
return Changed;
}
-namespace llvm {
-
ControlHeightReductionPass::ControlHeightReductionPass() {
parseCHRFilterFiles();
}
@@ -2116,5 +2114,3 @@ PreservedAnalyses ControlHeightReductionPass::run(
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
-
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index c327311..7ebcc21 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -53,6 +53,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
@@ -117,6 +118,10 @@ static cl::opt<bool>
LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
cl::desc("Predicate conditions in read only loops"));
+static cl::opt<bool> LoopPredicationTraps(
+ "indvars-predicate-loop-traps", cl::Hidden, cl::init(true),
+ cl::desc("Predicate conditions that trap in loops with only local writes"));
+
static cl::opt<bool>
AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
cl::desc("Allow widening of indvars to eliminate s/zext"));
@@ -1704,6 +1709,24 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
return Changed;
}
+static bool crashingBBWithoutEffect(const BasicBlock &BB) {
+ return llvm::all_of(BB, [](const Instruction &I) {
+ // TODO: for now this is overly restrictive, to make sure nothing in this
+ // BB can depend on the loop body.
+ // It's not enough to check for !I.mayHaveSideEffects(), because e.g. a
+ // load does not have a side effect, but we could have
+ // %a = load ptr, ptr %ptr
+ // %b = load i32, ptr %a
+ // Now if the loop stored a non-null pointer to %ptr, we could cause a
+ // null-pointer dereference by skipping over loop iterations.
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ if (CB->onlyAccessesInaccessibleMemory())
+ return true;
+ }
+ return isa<UnreachableInst>(I);
+ });
+}
+
bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
SmallVector<BasicBlock*, 16> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
@@ -1816,11 +1839,25 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
// suggestions on how to improve this? I can obviously bail out for outer
// loops, but that seems less than ideal. MemorySSA can find memory writes,
// is that enough for *all* side effects?
+ bool HasThreadLocalSideEffects = false;
for (BasicBlock *BB : L->blocks())
for (auto &I : *BB)
// TODO:isGuaranteedToTransfer
- if (I.mayHaveSideEffects())
- return false;
+ if (I.mayHaveSideEffects()) {
+ if (!LoopPredicationTraps)
+ return false;
+ HasThreadLocalSideEffects = true;
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ // Simple stores cannot be observed by other threads.
+ // If HasThreadLocalSideEffects is set, we check
+ // crashingBBWithoutEffect to make sure that the crashing BB cannot
+ // observe them either.
+ if (!SI->isSimple())
+ return false;
+ } else {
+ return false;
+ }
+ }
bool Changed = false;
// Finally, do the actual predication for all predicatable blocks. A couple
@@ -1840,6 +1877,19 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ if (HasThreadLocalSideEffects) {
+ const BasicBlock *Unreachable = nullptr;
+ for (const BasicBlock *Succ : BI->successors()) {
+ if (isa<UnreachableInst>(Succ->getTerminator()))
+ Unreachable = Succ;
+ }
+ // Exit BBs which have one branch back into the loop and another one to
+ // a trap can still be optimized, because local side effects cannot
+ // be observed in the exit case (the trap). We could be smarter about
+ // this, but for now let's pattern match common cases that directly trap.
+ if (Unreachable == nullptr || !crashingBBWithoutEffect(*Unreachable))
+ return Changed;
+ }
Value *NewCond;
if (ExitCount == ExactBTC) {
NewCond = L->contains(BI->getSuccessor(0)) ?
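In source terms, the loops the new LoopPredicationTraps handling targets look like the sketch below: the only side effect is a simple (non-atomic, non-volatile) store, and the early exit leads straight to a trap, so skipping iterations can never make the stores observable in the exit case. Illustrative code only; the names are made up:

    // A bounds-checked reduction: the store to *out is a simple store, and
    // the failing exit reaches an unreachable trap with no other effects,
    // matching crashingBBWithoutEffect above.
    void sum_checked(const int *data, int *out, unsigned n, unsigned bound) {
      for (unsigned i = 0; i < n; ++i) {
        if (i >= bound)
          __builtin_trap(); // exit successor ending in unreachable
        *out += data[i];    // thread-local, simple store
      }
    }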
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 20733032..19eccb9 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -368,7 +368,7 @@ private:
Valid = false;
}
- bool reportInvalidCandidate(llvm::Statistic &Stat) const {
+ bool reportInvalidCandidate(Statistic &Stat) const {
using namespace ore;
assert(L && Preheader && "Fusion candidate not initialized properly!");
#if LLVM_ENABLE_STATS
@@ -445,6 +445,7 @@ struct FusionCandidateCompare {
"No dominance relationship between these fusion candidates!");
}
};
+} // namespace
using LoopVector = SmallVector<Loop *, 4>;
@@ -461,9 +462,15 @@ using LoopVector = SmallVector<Loop *, 4>;
using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
-#if !defined(NDEBUG)
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidate &FC) {
+#ifndef NDEBUG
+static void printLoopVector(const LoopVector &LV) {
+ dbgs() << "****************************\n";
+ for (const Loop *L : LV)
+ printLoop(*L, dbgs());
+ dbgs() << "****************************\n";
+}
+
+static raw_ostream &operator<<(raw_ostream &OS, const FusionCandidate &FC) {
if (FC.isValid())
OS << FC.Preheader->getName();
else
@@ -472,8 +479,8 @@ static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
return OS;
}
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidateSet &CandSet) {
+static raw_ostream &operator<<(raw_ostream &OS,
+ const FusionCandidateSet &CandSet) {
for (const FusionCandidate &FC : CandSet)
OS << FC << '\n';
@@ -489,7 +496,9 @@ printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
dbgs() << "****************************\n";
}
}
-#endif
+#endif // NDEBUG
+
+namespace {
/// Collect all loops in function at the same nest level, starting at the
/// outermost level.
@@ -550,15 +559,6 @@ private:
LoopsOnLevelTy LoopsOnLevel;
};
-#ifndef NDEBUG
-static void printLoopVector(const LoopVector &LV) {
- dbgs() << "****************************\n";
- for (auto *L : LV)
- printLoop(*L, dbgs());
- dbgs() << "****************************\n";
-}
-#endif
-
struct LoopFuser {
private:
// Sets of control flow equivalent fusion candidates for a given nest level.
@@ -1850,7 +1850,7 @@ private:
/// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
template <typename RemarkKind>
void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
- llvm::Statistic &Stat) {
+ Statistic &Stat) {
assert(FC0.Preheader && FC1.Preheader &&
"Expecting valid fusion candidates");
using namespace ore;
diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index 32078b1..7da8586 100644
--- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -16,8 +16,6 @@
using namespace llvm;
-namespace llvm {
-
/// Explicitly specialize the pass manager's run method to handle loop nest
/// structure updates.
PreservedAnalyses
@@ -185,7 +183,6 @@ LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
}
return PA;
}
-} // namespace llvm
void FunctionToLoopPassAdaptor::printPipeline(
raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
@@ -193,6 +190,7 @@ void FunctionToLoopPassAdaptor::printPipeline(
Pass->printPipeline(OS, MapClassName2PassName);
OS << ')';
}
+
PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
FunctionAnalysisManager &AM) {
// Before we even compute any loop analyses, first run a miniature function
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 448dc2b..f3e6cbf 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -540,8 +540,6 @@ bool LoopVersioningLICM::run(DominatorTree *DT) {
return Changed;
}
-namespace llvm {
-
PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &LAR,
LPMUpdater &U) {
@@ -556,4 +554,3 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
}
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 80aa98d..5a8f18a 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -160,9 +160,6 @@ static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
//===----------------------------------------------------------------------===//
// Anchor methods.
-namespace llvm {
-namespace GVNExpression {
-
Expression::~Expression() = default;
BasicExpression::~BasicExpression() = default;
CallExpression::~CallExpression() = default;
@@ -171,9 +168,6 @@ StoreExpression::~StoreExpression() = default;
AggregateValueExpression::~AggregateValueExpression() = default;
PHIExpression::~PHIExpression() = default;
-} // end namespace GVNExpression
-} // end namespace llvm
-
namespace {
// Tarjan's SCC finding algorithm with Nuutila's improvements
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index ba58b8e..6d7ce36 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -2623,32 +2623,32 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
namespace {
- class ReassociateLegacyPass : public FunctionPass {
- ReassociatePass Impl;
+class ReassociateLegacyPass : public FunctionPass {
+ ReassociatePass Impl;
- public:
- static char ID; // Pass identification, replacement for typeid
+public:
+ static char ID; // Pass identification, replacement for typeid
- ReassociateLegacyPass() : FunctionPass(ID) {
- initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
- }
+ ReassociateLegacyPass() : FunctionPass(ID) {
+ initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
- FunctionAnalysisManager DummyFAM;
- auto PA = Impl.run(F, DummyFAM);
- return !PA.areAllPreserved();
- }
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- };
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
} // end anonymous namespace
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index aae5d60..25a531c 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -50,9 +50,7 @@ using namespace llvm;
#define DEBUG_TYPE "scalarizer"
-namespace {
-
-BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
+static BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
BasicBlock *BB = Itr->getParent();
if (isa<PHINode>(Itr))
Itr = BB->getFirstInsertionPt();
@@ -76,6 +74,8 @@ using ScatterMap = std::map<std::pair<Value *, Type *>, ValueVector>;
// along with a pointer to their scattered forms.
using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
+namespace {
+
struct VectorSplit {
// The type of the vector.
FixedVectorType *VecTy = nullptr;
@@ -196,6 +196,7 @@ struct VectorLayout {
// The size of each (non-remainder) fragment in bytes.
uint64_t SplitSize = 0;
};
+} // namespace
static bool isStructOfMatchingFixedVectors(Type *Ty) {
if (!isa<StructType>(Ty))
@@ -268,6 +269,7 @@ static Value *concatenate(IRBuilder<> &Builder, ArrayRef<Value *> Fragments,
return Res;
}
+namespace {
class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
public:
ScalarizerVisitor(DominatorTree *DT, const TargetTransformInfo *TTI,
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index ebcbd2b..fa66a03 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -149,8 +149,6 @@ bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
return Impl.runImpl(F, TTI);
}
-namespace llvm {
-
bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence(&F)) {
LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
@@ -328,11 +326,11 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
return true;
}
-FunctionPass *createSpeculativeExecutionPass() {
+FunctionPass *llvm::createSpeculativeExecutionPass() {
return new SpeculativeExecutionLegacyPass();
}
-FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
+FunctionPass *llvm::createSpeculativeExecutionIfHasBranchDivergencePass() {
return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
}
@@ -362,4 +360,3 @@ void SpeculativeExecutionPass::printPipeline(
OS << "only-if-divergent-target";
OS << '>';
}
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 7d01709..e94ad19 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -716,8 +716,6 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
return Ret;
}
-namespace llvm {
-
PreservedAnalyses
StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) {
const DataLayout *DL = &F.getDataLayout();
@@ -735,5 +733,3 @@ StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) {
PA.preserve<TargetIRAnalysis>();
return PA;
}
-
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 1d83ddc..89d41f3e 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -192,7 +192,7 @@ struct AllocaDerivedValueTracker {
SmallPtrSet<Instruction *, 32> AllocaUsers;
SmallPtrSet<Instruction *, 32> EscapePoints;
};
-}
+} // namespace
static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
if (F.callsFunctionThatReturnsTwice())
@@ -967,7 +967,7 @@ struct TailCallElim : public FunctionPass {
/*BFI=*/nullptr);
}
};
-}
+} // namespace
char TailCallElim::ID = 0;
INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7f5a41c..cae9aee8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2124,6 +2124,11 @@ static void licm(VPlan &Plan) {
// Return true if we do not know how to (mechanically) hoist a given recipe
// out of a loop region.
auto CannotHoistRecipe = [](VPRecipeBase &R) {
+ // Assumes don't alias anything or throw; as long as they're guaranteed to
+ // execute, they're safe to hoist.
+ if (match(&R, m_Intrinsic<Intrinsic::assume>()))
+ return false;
+
// TODO: Relax checks in the future, e.g. we could also hoist reads, if
// their memory location is not modified in the vector loop.
if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
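The kind of loop-invariant assumption this covers, sketched in source form with Clang's __builtin_assume (function and parameter names illustrative):

    #include <cstddef>

    // The assume's operand is the same every iteration, and assumes neither
    // read nor write memory, so once it is guaranteed to execute it can be
    // hoisted out of the vector loop region.
    void scale(float *p, size_t n, size_t align) {
      for (size_t i = 0; i < n; ++i) {
        __builtin_assume(align >= 16); // loop-invariant assumption
        p[i] *= 2.0f;
      }
    }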
diff --git a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
index aeeb21e..ab1945d 100644
--- a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
+++ b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
@@ -150,7 +150,7 @@ define void @test_typedbuffer() {
; CHECK: Kind: CBuffer
; CHECK: CBuffer size: 4
- %cb1 = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cb1 = call target("dx.CBuffer", <{ [2 x <{ float, target("dx.Padding", 12) }>], float }>)
@llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, ptr @Constants.str)
; CHECK: Resource [[CB1:[0-9]+]]:
; CHECK: Name: Constants
@@ -161,7 +161,7 @@ define void @test_typedbuffer() {
; CHECK: Size: 1
; CHECK: Class: CBV
; CHECK: Kind: CBuffer
- ; CHECK: CBuffer size: 4
+ ; CHECK: CBuffer size: 36
; CHECK-NOT: Resource {{[0-9]+}}:
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll
index a0f1b71..bb362d2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr128 | FileCheck %s -check-prefixes=NOZCM-FPR128-ATTR --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr128 | FileCheck %s -check-prefixes=ZCM-FPR128-ATTR --match-full-lines
-define void @zero_cycle_regmov_FPR64(double %a, double %b, double %c, double %d) {
+define void @zero_cycle_regmove_FPR64(double %a, double %b, double %c, double %d) {
entry:
; CHECK-LABEL: t:
; NOZCM-FPR128-CPU: fmov d0, d2
@@ -45,7 +45,7 @@ entry:
declare float @foo_double(double, double)
-define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
+define void @zero_cycle_regmove_FPR32(float %a, float %b, float %c, float %d) {
entry:
; CHECK-LABEL: t:
; NOZCM-FPR128-CPU: fmov s0, s2
@@ -86,7 +86,7 @@ entry:
declare float @foo_float(float, float)
-define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
+define void @zero_cycle_regmove_FPR16(half %a, half %b, half %c, half %d) {
entry:
; CHECK-LABEL: t:
; NOZCM-FPR128-CPU: fmov s0, s2
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll
index e14e69b..d6d3f15 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-gpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-gpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines
-define void @zero_cycle_regmov_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) {
+define void @zero_cycle_regmove_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) {
entry:
; CHECK-LABEL: t:
; NOTCPU-LINUX: mov w0, w2
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 6b09424..eee232a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -49,7 +49,6 @@ bb:
ret void
}
-; FIXME: This generates "instid1(/* invalid instid value */)".
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
; GFX11-LABEL: f2:
; GFX11: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index b07dec3..689d147 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -6,1153 +6,1147 @@
define amdgpu_kernel void @largeInterleave() #0 { ret void }
; GCN-LABEL: largeInterleave:
; GCN: ; %bb.0:
- ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: ; implicit-def: $vgpr0
- ; GCN-NEXT: ; implicit-def: $vgpr2
- ; GCN-NEXT: ; implicit-def: $vgpr1
- ; GCN-NEXT: ; implicit-def: $vgpr8
- ; GCN-NEXT: ; implicit-def: $vgpr94
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ; implicit-def: $vgpr106
- ; GCN-NEXT: ; implicit-def: $vgpr132
- ; GCN-NEXT: ; implicit-def: $vgpr133
- ; GCN-NEXT: ; implicit-def: $vgpr139
- ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
- ; GCN-NEXT: ; iglp_opt mask(0x00000002)
- ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; GCN-NEXT: v_readfirstlane_b32 s7, v0
+ ; GCN-NEXT: v_readfirstlane_b32 s17, v16
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: ; implicit-def: $vgpr17
+ ; GCN-NEXT: ; implicit-def: $sgpr15
; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: ; implicit-def: $sgpr5
- ; GCN-NEXT: s_nop 1
- ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2
- ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6
- ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1
- ; GCN-NEXT: v_add_u32_e32 v93, s0, v92
- ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: s_lshl_b32 s18, s17, 7
+ ; GCN-NEXT: ; implicit-def: $vgpr18
+ ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1
+ ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25
+ ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6
+ ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1
+ ; GCN-NEXT: v_add_u32_e32 v17, s15, v226
+ ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: s_lshl_b32 s0, s7, 7
- ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1
- ; GCN-NEXT: v_add_u32_e32 v8, 64, v93
- ; GCN-NEXT: ; kill: killed $vgpr8
+ ; GCN-NEXT: v_add_u32_e32 v72, 64, v17
+ ; GCN-NEXT: ; implicit-def: $vgpr213
+ ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155
+ ; GCN-NEXT: ; implicit-def: $vgpr246
+ ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17
+ ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159
+ ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147
+ ; GCN-NEXT: ; implicit-def: $vgpr19
+ ; GCN-NEXT: ; implicit-def: $vgpr26
+ ; GCN-NEXT: ; implicit-def: $vgpr27
+ ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17
+ ; GCN-NEXT: v_add_u32_e32 v231, v19, v26
+ ; GCN-NEXT: v_add_u32_e32 v232, v19, v27
; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: ; kill: killed $vgpr92
- ; GCN-NEXT: ; implicit-def: $sgpr6
+ ; GCN-NEXT: ; implicit-def: $vgpr28
+ ; GCN-NEXT: ; implicit-def: $vgpr29
+ ; GCN-NEXT: v_add_u32_e32 v233, v19, v28
+ ; GCN-NEXT: v_add_u32_e32 v234, v19, v29
+ ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143
+ ; GCN-NEXT: ; implicit-def: $sgpr5
+ ; GCN-NEXT: ; implicit-def: $sgpr7
+ ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151
+ ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139
+ ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: v_add_u32_e32 v18, s17, v20
+ ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18
+ ; GCN-NEXT: ; implicit-def: $sgpr16
+ ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1
+ ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199
+ ; GCN-NEXT: ; implicit-def: $vgpr23
+ ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200
+ ; GCN-NEXT: ; implicit-def: $vgpr24
+ ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: ; implicit-def: $vgpr18
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: ; implicit-def: $vgpr24
+ ; GCN-NEXT: v_add_u32_e32 v247, v19, v24
+ ; GCN-NEXT: v_add_u32_e32 v248, v19, v16
+ ; GCN-NEXT: v_add_u32_e32 v249, v19, v18
+ ; GCN-NEXT: v_add_u32_e32 v250, v19, v20
+ ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131
+ ; GCN-NEXT: ; implicit-def: $sgpr14
+ ; GCN-NEXT: ; implicit-def: $vgpr196
+ ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13
+ ; GCN-NEXT: ; implicit-def: $vgpr211
+ ; GCN-NEXT: v_max_f32_e32 v212, v211, v211
+ ; GCN-NEXT: ; implicit-def: $vgpr198
+ ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: ; implicit-def: $vgpr32
+ ; GCN-NEXT: ; implicit-def: $vgpr33
+ ; GCN-NEXT: ; implicit-def: $vgpr34
+ ; GCN-NEXT: v_add_u32_e32 v210, v19, v34
+ ; GCN-NEXT: v_add_u32_e32 v206, v19, v33
+ ; GCN-NEXT: v_add_u32_e32 v205, v19, v32
+ ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: ; implicit-def: $vgpr23
+ ; GCN-NEXT: ; implicit-def: $vgpr30
+ ; GCN-NEXT: ; implicit-def: $vgpr31
+ ; GCN-NEXT: v_add_u32_e32 v207, v19, v21
+ ; GCN-NEXT: v_add_u32_e32 v208, v19, v22
+ ; GCN-NEXT: v_add_u32_e32 v209, v19, v23
+ ; GCN-NEXT: v_add_u32_e32 v203, v19, v30
+ ; GCN-NEXT: v_add_u32_e32 v204, v19, v31
+ ; GCN-NEXT: ; kill: killed $vgpr17
+ ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GCN-NEXT: ; implicit-def: $vgpr197
+ ; GCN-NEXT: ; iglp_opt mask(0x00000002)
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[0:3]
+ ; GCN-NEXT: ds_write_b128 v230, v[64:67]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111]
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
- ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: ds_read_b128 v[172:175], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93
+ ; GCN-NEXT: ds_write_b128 v230, v[160:163]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ; kill: killed $vgpr72
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: ds_read_b128 v[188:191], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[218:221], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: ds_write_b128 v230, v[152:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
- ; GCN-NEXT: ; implicit-def: $vgpr64
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93
- ; GCN-NEXT: ; implicit-def: $vgpr73
- ; GCN-NEXT: v_add_u32_e32 v76, v132, v64
+ ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
+ ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; kill: killed $vgpr72
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v73
- ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr74
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v74
- ; GCN-NEXT: ; implicit-def: $vgpr75
- ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v75
- ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
+ ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127]
+ ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7
+ ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5
+ ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7
+ ; GCN-NEXT: ds_read_b128 v[160:163], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; kill: killed $vgpr76
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ; implicit-def: $sgpr8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512
+ ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5
+ ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7
+ ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79]
+ ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79]
+ ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: ds_write_b128 v230, v[152:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[64:67], v94
+ ; GCN-NEXT: ds_read_b128 v[156:159], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512
+ ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[94:97], v106
+ ; GCN-NEXT: ds_read_b128 v[230:233], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63]
- ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63]
- ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5
- ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8
- ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5
- ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63]
- ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8
- ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8
- ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5
- ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47]
- ; GCN-NEXT: s_nop 5
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48
- ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49
- ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101
- ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51
- ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
- ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31]
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53
- ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55
- ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56
- ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15]
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47]
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: ; implicit-def: $sgpr6
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31]
- ; GCN-NEXT: s_nop 6
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40
- ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31]
- ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46
- ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16
- ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19
- ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20
- ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27
- ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: ; implicit-def: $vgpr65
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: ; implicit-def: $vgpr68
- ; GCN-NEXT: ; implicit-def: $vgpr67
- ; GCN-NEXT: v_add_u32_e32 v65, s7, v65
- ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65
- ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6
- ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1
- ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
- ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v135, v[94:95]
- ; GCN-NEXT: v_max_f32_e32 v65, v65, v65
- ; GCN-NEXT: v_max_f32_e32 v64, v64, v65
- ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
+ ; GCN-NEXT: ds_write_b64 v199, v[238:239]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[98:99]
+ ; GCN-NEXT: ds_write_b64 v200, v[240:241]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[102:103]
+ ; GCN-NEXT: ds_write_b64 v201, v[242:243]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[96:97]
- ; GCN-NEXT: v_add_u32_e32 v68, v132, v68
- ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7]
- ; GCN-NEXT: v_max_f32_e32 v64, v64, v64
- ; GCN-NEXT: ; implicit-def: $vgpr65
- ; GCN-NEXT: v_max_f32_e32 v66, v65, v65
- ; GCN-NEXT: v_max_f32_e32 v134, v66, v64
- ; GCN-NEXT: ; implicit-def: $vgpr64
+ ; GCN-NEXT: ds_write_b64 v202, v[244:245]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v64
- ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111]
+ ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v66
- ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v67
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134
- ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134
- ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
- ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134
- ; GCN-NEXT: v_exp_f32_e32 v163, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96
- ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134
- ; GCN-NEXT: v_exp_f32_e32 v164, v57
- ; GCN-NEXT: v_exp_f32_e32 v49, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64
- ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134
- ; GCN-NEXT: v_exp_f32_e32 v50, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66
- ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134
- ; GCN-NEXT: v_exp_f32_e32 v51, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67
- ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134
- ; GCN-NEXT: v_exp_f32_e32 v52, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134
- ; GCN-NEXT: v_exp_f32_e32 v53, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69
- ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134
- ; GCN-NEXT: ds_read_b128 v[140:143], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v54, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70
- ; GCN-NEXT: v_exp_f32_e32 v55, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71
- ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134
- ; GCN-NEXT: v_exp_f32_e32 v56, v48
- ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49
- ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50
- ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51
- ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
- ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v48, v48
- ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58
- ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67
- ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66
- ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55
- ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56
- ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
- ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
- ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
- ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48
- ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48
- ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48
- ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48
- ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48
- ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48
- ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
- ; GCN-NEXT: v_exp_f32_e32 v58, v58
- ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
- ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48
- ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48
- ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48
- ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48
- ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48
- ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48
- ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59
- ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53
- ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54
- ; GCN-NEXT: v_exp_f32_e32 v59, v57
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
- ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
- ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48
- ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48
- ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48
- ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48
- ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134
- ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
- ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134
- ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60
- ; GCN-NEXT: ; implicit-def: $vgpr57
- ; GCN-NEXT: ds_read_b128 v[60:63], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v160, v149
- ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148
- ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
- ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134
- ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134
- ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134
- ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134
- ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134
- ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
- ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162
- ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163
- ; GCN-NEXT: v_exp_f32_e32 v162, v146
- ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164
- ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134
- ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147
- ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
- ; GCN-NEXT: v_exp_f32_e32 v151, v33
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59
- ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134
- ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134
- ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134
- ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
- ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134
- ; GCN-NEXT: v_exp_f32_e32 v153, v33
- ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134
- ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5
- ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32
- ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161
- ; GCN-NEXT: v_exp_f32_e32 v165, v60
- ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8
- ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v161, v61
- ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8
- ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5
- ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8
- ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8
+ ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95]
+ ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5
+ ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7
+ ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7
+ ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5
+ ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7
+ ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5
+ ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111]
+ ; GCN-NEXT: s_nop 9
+ ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113
+ ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95]
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79]
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111]
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95]
+ ; GCN-NEXT: s_nop 6
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95]
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81
+ ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v65
+ ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v68
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v69
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v70
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v71
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v72
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v73
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v74
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v75
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v76
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v77
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_max_f32_e32 v129, v129, v129
+ ; GCN-NEXT: v_max_f32_e32 v128, v128, v129
+ ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13]
+ ; GCN-NEXT: v_max_f32_e32 v128, v128, v128
+ ; GCN-NEXT: v_max_f32_e32 v128, v212, v128
+ ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128
+ ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128
+ ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128
+ ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120
+ ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v114, v138
+ ; GCN-NEXT: v_exp_f32_e32 v115, v139
+ ; GCN-NEXT: v_exp_f32_e32 v116, v140
+ ; GCN-NEXT: v_exp_f32_e32 v117, v141
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118
+ ; GCN-NEXT: v_exp_f32_e32 v118, v142
+ ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120
+ ; GCN-NEXT: v_exp_f32_e32 v120, v144
+ ; GCN-NEXT: v_exp_f32_e32 v113, v112
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116
+ ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115
+ ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117
+ ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121
+ ; GCN-NEXT: v_exp_f32_e32 v112, v129
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122
+ ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128
+ ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v119, v143
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112
+ ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112
+ ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112
+ ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112
+ ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112
+ ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112
+ ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119
+ ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120
+ ; GCN-NEXT: v_exp_f32_e32 v121, v148
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v122, v149
+ ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125
+ ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128
+ ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v123, v150
+ ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128
+ ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128
+ ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128
+ ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128
+ ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v124, v151
+ ; GCN-NEXT: ds_read_b128 v[130:133], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122
+ ; GCN-NEXT: v_exp_f32_e32 v96, v129
+ ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v97, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124
+ ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v98, v138
+ ; GCN-NEXT: v_exp_f32_e32 v99, v127
+ ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134
+ ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96
+ ; GCN-NEXT: v_exp_f32_e32 v100, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[36:37]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
- ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151
- ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153
- ; GCN-NEXT: v_exp_f32_e32 v159, v33
- ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38
- ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152
- ; GCN-NEXT: v_exp_f32_e32 v152, v38
+ ; GCN-NEXT: ds_write_b64 v199, v[188:189]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[60:61]
+ ; GCN-NEXT: ds_write_b64 v200, v[190:191]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[32:33]
- ; GCN-NEXT: ; implicit-def: $vgpr33
- ; GCN-NEXT: ; implicit-def: $vgpr38
+ ; GCN-NEXT: ds_write_b64 v201, v[192:193]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[140:141]
- ; GCN-NEXT: v_add_u32_e32 v38, v132, v38
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v33
+ ; GCN-NEXT: ds_write_b64 v202, v[194:195]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v101, v125
+ ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31]
+ ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134
+ ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr36
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v36
- ; GCN-NEXT: ; implicit-def: $vgpr37
- ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v102, v142
+ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v37
- ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165
- ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156
- ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134
- ; GCN-NEXT: ds_read_b128 v[36:39], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
- ; GCN-NEXT: v_exp_f32_e32 v154, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158
- ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134
- ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v155, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157
- ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161
- ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159
- ; GCN-NEXT: v_exp_f32_e32 v157, v32
- ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142
- ; GCN-NEXT: v_exp_f32_e32 v146, v32
- ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134
- ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40
- ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v147, v36
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v143, v36
- ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142
- ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157
- ; GCN-NEXT: v_exp_f32_e32 v156, v32
- ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146
- ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32
- ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v129, v36
- ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44
- ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147
- ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[36:39], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v142, v40
- ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143
- ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
- ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_exp_f32_e32 v63, v40
- ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61
- ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134
- ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156
- ; GCN-NEXT: v_exp_f32_e32 v158, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129
- ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v128, v17
- ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8
- ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16
- ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62
- ; GCN-NEXT: v_exp_f32_e32 v167, v36
- ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8
- ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v130, v37
- ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158
- ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5
- ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8
- ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99
+ ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v103, v150
+ ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5
+ ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7
+ ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128
+ ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15]
+ ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100
+ ; GCN-NEXT: v_exp_f32_e32 v104, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47]
+ ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135
+ ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v105, v125
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102
+ ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7
+ ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5
+ ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v106, v156
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103
+ ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63]
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v107, v138
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104
+ ; GCN-NEXT: v_exp_f32_e32 v108, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47]
+ ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v109, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142
+ ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131
+ ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107
+ ; GCN-NEXT: v_exp_f32_e32 v110, v156
+ ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v111, v146
+ ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v80, v129
+ ; GCN-NEXT: ds_read_b128 v[130:133], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47]
+ ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v81, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144
+ ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110
+ ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v82, v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v83, v135
+ ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134
+ ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[20:21]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
- ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5
- ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_exp_f32_e32 v140, v17
- ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5
+ ; GCN-NEXT: ds_write_b64 v199, v[126:127]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[36:37]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22
- ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60
- ; GCN-NEXT: v_exp_f32_e32 v144, v22
+ ; GCN-NEXT: ds_write_b64 v200, v[150:151]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[16:17]
- ; GCN-NEXT: ; implicit-def: $vgpr17
- ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: ds_write_b64 v201, v[152:153]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[42:43]
- ; GCN-NEXT: v_add_u32_e32 v22, v132, v22
- ; GCN-NEXT: v_add_u32_e32 v17, v132, v17
- ; GCN-NEXT: ; implicit-def: $vgpr20
- ; GCN-NEXT: ; implicit-def: $vgpr21
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: ds_write_b64 v202, v[154:155]
+ ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v84, v129
+ ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v85, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81
+ ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31]
+ ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134
+ ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v20, v132, v20
- ; GCN-NEXT: v_add_u32_e32 v21, v132, v21
- ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44
- ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
- ; GCN-NEXT: v_exp_f32_e32 v132, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82
+ ; GCN-NEXT: v_exp_f32_e32 v86, v156
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167
- ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134
- ; GCN-NEXT: ds_read_b128 v[20:23], v139
+ ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v87, v157
+ ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138
+ ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5
+ ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7
+ ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128
+ ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5
+ ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84
+ ; GCN-NEXT: v_exp_f32_e32 v88, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85
+ ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5
+ ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7
+ ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5
+ ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v89, v125
+ ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86
+ ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v90, v158
+ ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87
+ ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v91, v139
+ ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88
+ ; GCN-NEXT: v_exp_f32_e32 v92, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89
+ ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v93, v125
+ ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90
+ ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v94, v148
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v95, v127
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v125, v129
+ ; GCN-NEXT: ds_read_b128 v[132:135], v197
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v126, v142
+ ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94
+ ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95
+ ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v127, v143
+ ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v129, v138
+ ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66
+ ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v62, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130
- ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134
- ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134
- ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134
- ; GCN-NEXT: ; implicit-def: $sgpr0
- ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140
- ; GCN-NEXT: v_exp_f32_e32 v145, v16
- ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141
- ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46
- ; GCN-NEXT: v_exp_f32_e32 v35, v16
- ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24
- ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v46, v20
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47
- ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v47, v20
- ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34
- ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145
- ; GCN-NEXT: v_exp_f32_e32 v141, v16
- ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35
- ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134
- ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16
- ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v33, v20
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[20:23], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v36, v24
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47
- ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
- ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_exp_f32_e32 v39, v24
- ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37
- ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141
- ; GCN-NEXT: v_exp_f32_e32 v148, v1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33
- ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127]
- ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134
- ; GCN-NEXT: v_exp_f32_e32 v34, v1
- ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8
- ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38
- ; GCN-NEXT: v_exp_f32_e32 v150, v20
- ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8
- ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v38, v21
- ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39
- ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134
- ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5
- ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[4:5]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111]
- ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5
- ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_exp_f32_e32 v42, v1
- ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5
+ ; GCN-NEXT: ds_write_b64 v199, v[150:151]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[20:21]
+ ; GCN-NEXT: ds_write_b64 v200, v[152:153]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125
+ ; GCN-NEXT: v_exp_f32_e32 v130, v158
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[0:1]
+ ; GCN-NEXT: ds_write_b64 v201, v[154:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[26:27]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6
- ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32
+ ; GCN-NEXT: ds_write_b64 v202, v[156:157]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28
- ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134
- ; GCN-NEXT: v_exp_f32_e32 v25, v6
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[4:7], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149
- ; GCN-NEXT: v_exp_f32_e32 v26, v0
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150
- ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38
- ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v29, v0
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41
- ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42
- ; GCN-NEXT: v_exp_f32_e32 v31, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0
- ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24
- ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134
- ; GCN-NEXT: v_exp_f32_e32 v19, v0
- ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8
- ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v24, v4
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26
- ; GCN-NEXT: v_exp_f32_e32 v27, v4
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29
- ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134
- ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31
- ; GCN-NEXT: v_exp_f32_e32 v30, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19
- ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v16, v4
- ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12
- ; GCN-NEXT: v_exp_f32_e32 v18, v9
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21
- ; GCN-NEXT: v_exp_f32_e32 v21, v9
- ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[4:7], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21
- ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
- ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_exp_f32_e32 v2, v2
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8
- ; GCN-NEXT: v_exp_f32_e32 v10, v1
- ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20
- ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0
- ; GCN-NEXT: v_add_f32_e32 v3, 0, v49
- ; GCN-NEXT: v_add_f32_e32 v3, v50, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v51, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v52, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v53, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v54, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v55, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v56, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v58, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v163, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v164, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v59, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v160, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v162, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v151, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v153, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v165, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v161, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v159, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v152, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v154, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v155, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v157, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v146, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v147, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v143, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v156, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v129, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v142, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v63, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v158, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v128, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v167, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v130, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v140, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v144, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v132, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v62, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v145, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v35, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v46, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v47, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v141, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v33, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v36, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v39, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v148, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
- ; GCN-NEXT: v_add_f32_e32 v3, v34, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v150, v3
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10
- ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2
- ; GCN-NEXT: v_add_f32_e32 v3, v38, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v42, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v25, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v26, v3
- ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1
- ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22
- ; GCN-NEXT: v_add_f32_e32 v3, v29, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v31, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
- ; GCN-NEXT: v_add_f32_e32 v3, v19, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v24, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v27, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v30, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v16, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v18, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v21, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
- ; GCN-NEXT: v_add_f32_e32 v0, v2, v3
- ; GCN-NEXT: v_add_f32_e32 v4, v10, v0
- ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126
+ ; GCN-NEXT: v_exp_f32_e32 v131, v144
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127
+ ; GCN-NEXT: v_exp_f32_e32 v132, v145
+ ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v133, v141
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[68:71], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v72, v146
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v73, v144
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132
+ ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v74, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133
+ ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+ ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63]
+ ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v75, v142
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v76, v146
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73
+ ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v77, v147
+ ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68
+ ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74
+ ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v78, v67
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75
+ ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v79, v148
+ ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v142, v146
+ ; GCN-NEXT: ds_read_b128 v[68:71], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v137, v147
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v138, v138
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
+ ; GCN-NEXT: s_nop 10
+ ; GCN-NEXT: v_exp_f32_e32 v52, v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79
+ ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50
+ ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136
+ ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, 0, v113
+ ; GCN-NEXT: v_add_f32_e32 v53, v114, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v115, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v116, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v117, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v118, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v119, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v120, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v121, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v122, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v123, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v124, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v96, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v97, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v98, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v99, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v100, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v101, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v102, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v103, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v104, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v105, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v106, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v107, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v108, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v109, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v110, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v111, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v80, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v81, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v82, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v83, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v84, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v85, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v86, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v87, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v88, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v89, v53
+ ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49
+ ; GCN-NEXT: v_add_f32_e32 v53, v90, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v91, v53
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v53, v92, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v93, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v94, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v95, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v125, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v126, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v127, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v129, v53
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47]
+ ; GCN-NEXT: s_nop 9
+ ; GCN-NEXT: v_add_f32_e32 v0, v130, v53
+ ; GCN-NEXT: v_add_f32_e32 v0, v131, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v132, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v133, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v72, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v73, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v74, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v75, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v76, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v77, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v78, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v79, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v142, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v137, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v138, v0
+ ; GCN-NEXT: v_add_f32_e32 v4, v52, v0
+ ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31]
; GCN-NEXT: v_add_f32_e32 v2, v4, v5
- ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7]
+ ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2
; GCN-NEXT: ; implicit-def: $vgpr4
- ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13]
+ ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112
+ ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47]
; GCN-NEXT: s_endpgm
attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 7959cee..e174fc1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_mov_b32_e32 v2, 1.0
-; GCN-NEXT: v_mov_b32_e32 v3, 2.0
+; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[0:3], v1
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
+; GCN-NEXT: v_add_u32_e32 v3, s0, v0
+; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112
+; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96
+; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80
+; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64
+; GCN-NEXT: ds_read_b128 a[0:3], v3
+; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16
+; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32
+; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192
+; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
-; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
-; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index aa099b6..b65a1a8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -623,62 +623,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 1.0
+; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[128:131], v1
-; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
-; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392
-; GCN-NEXT: v_mov_b32_e32 v2, 2.0
+; GCN-NEXT: v_add_u32_e32 v3, s0, v0
+; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112
+; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96
+; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80
+; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64
+; GCN-NEXT: ds_read_b128 a[128:131], v3
+; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16
+; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32
+; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48
+; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
+; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
+; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; GCN-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(14)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
; GCN-NEXT: s_waitcnt lgkmcnt(8)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; GCN-NEXT: s_nop 12
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; GCN-NEXT: s_nop 11
; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
@@ -729,62 +729,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112
-; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96
-; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80
-; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64
-; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16
-; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32
-; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576
-; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456
-; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440
-; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424
-; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408
-; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344
-; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360
-; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376
-; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0
+; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3
+; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192
+; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576
+; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152
+; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; EXACTCUTOFF-NEXT: s_nop 12
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; EXACTCUTOFF-NEXT: s_nop 11
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
index ddbae64..a95d8c7 100644
--- a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
@@ -1,8 +1,8 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GCN,GFX700
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
declare i64 @llvm.readsteadycounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 9a23788..8803f3a 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -367,77 +367,76 @@ bb:
define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
; CHECK-LABEL: illegal_mfma_after_rewrite:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[16:19]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a0, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, s1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7]
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; CHECK-NEXT: v_mov_b32_e32 v5, v4
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
-; CHECK-NEXT: v_mov_b32_e32 v7, v4
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9]
+; CHECK-NEXT: s_nop 3
+; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[4:7]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7]
-; CHECK-NEXT: s_nop 5
-; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15]
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3]
-; CHECK-NEXT: global_store_short v[12:13], v17, off
+; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000
+; CHECK-NEXT: v_mov_b32_e32 v9, v8
+; CHECK-NEXT: v_mov_b32_e32 v10, v8
+; CHECK-NEXT: v_mov_b32_e32 v11, v8
+; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
+; CHECK-NEXT: global_store_short v[0:1], v2, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v9, off
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19]
+; CHECK-NEXT: s_nop 5
+; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15]
+; CHECK-NEXT: global_store_short v[0:1], v10, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0
-; CHECK-NEXT: global_store_short v[12:13], v1, off
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CHECK-NEXT: global_store_short v[0:1], v6, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[12:13], v14, off
+; CHECK-NEXT: global_store_short v[0:1], v24, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5]
; CHECK-NEXT: s_nop 6
-; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v8, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19]
+; CHECK-NEXT: global_store_short v[0:1], v6, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: global_store_short v[12:13], v0, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT: global_store_short v[0:1], v2, off
; CHECK-NEXT: s_endpgm
entry:
%k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -546,100 +545,14 @@ define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr ad
define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #0 {
; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class:
; CHECK: ; %bb.0:
+; CHECK-NEXT: v_accvgpr_write_b32 a34, 2.0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
-; CHECK-NEXT: v_accvgpr_read_b32 v63, a31
-; CHECK-NEXT: v_accvgpr_read_b32 v62, a30
-; CHECK-NEXT: v_accvgpr_read_b32 v61, a29
-; CHECK-NEXT: v_accvgpr_read_b32 v60, a28
-; CHECK-NEXT: v_accvgpr_read_b32 v59, a27
-; CHECK-NEXT: v_accvgpr_read_b32 v58, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v57, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v56, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v55, a23
-; CHECK-NEXT: v_accvgpr_read_b32 v54, a22
-; CHECK-NEXT: v_accvgpr_read_b32 v53, a21
-; CHECK-NEXT: v_accvgpr_read_b32 v52, a20
-; CHECK-NEXT: v_accvgpr_read_b32 v51, a19
-; CHECK-NEXT: v_accvgpr_read_b32 v50, a18
-; CHECK-NEXT: v_accvgpr_read_b32 v49, a17
-; CHECK-NEXT: v_accvgpr_read_b32 v48, a16
-; CHECK-NEXT: v_accvgpr_read_b32 v47, a15
-; CHECK-NEXT: v_accvgpr_read_b32 v46, a14
-; CHECK-NEXT: v_accvgpr_read_b32 v45, a13
-; CHECK-NEXT: v_accvgpr_read_b32 v44, a12
-; CHECK-NEXT: v_accvgpr_read_b32 v43, a11
-; CHECK-NEXT: v_accvgpr_read_b32 v42, a10
-; CHECK-NEXT: v_accvgpr_read_b32 v41, a9
-; CHECK-NEXT: v_accvgpr_read_b32 v40, a8
-; CHECK-NEXT: v_accvgpr_read_b32 v39, a7
-; CHECK-NEXT: v_accvgpr_read_b32 v38, a6
-; CHECK-NEXT: v_accvgpr_read_b32 v37, a5
-; CHECK-NEXT: v_accvgpr_read_b32 v36, a4
-; CHECK-NEXT: v_accvgpr_read_b32 v35, a3
-; CHECK-NEXT: v_accvgpr_read_b32 v34, a2
-; CHECK-NEXT: v_accvgpr_read_b32 v33, a1
-; CHECK-NEXT: v_accvgpr_read_b32 v32, a0
-; CHECK-NEXT: v_accvgpr_write_b32 a0, 2.0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, 4.0
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[32:63]
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v32
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v33
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v34
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v35
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v36
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v37
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v38
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v39
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v40
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v41
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v42
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v43
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v44
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v45
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v46
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v47
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v48
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v49
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v50
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v51
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v52
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v53
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v54
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v55
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v56
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v57
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v58
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v59
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v60
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v61
-; CHECK-NEXT: v_accvgpr_write_b32 a30, v62
-; CHECK-NEXT: v_accvgpr_write_b32 a31, v63
-; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000
-; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000
-; CHECK-NEXT: v_accvgpr_read_b32 v32, a32
-; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32
-; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31]
-; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
-; CHECK-NEXT: s_nop 7
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
@@ -663,18 +576,60 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
; CHECK-NEXT: v_accvgpr_read_b32 v21, a21
; CHECK-NEXT: v_accvgpr_read_b32 v22, a22
; CHECK-NEXT: v_accvgpr_read_b32 v23, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
; CHECK-NEXT: v_accvgpr_read_b32 v28, a28
; CHECK-NEXT: v_accvgpr_read_b32 v29, a29
; CHECK-NEXT: v_accvgpr_read_b32 v30, a30
; CHECK-NEXT: v_accvgpr_read_b32 v31, a31
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3]
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; CHECK-NEXT: v_accvgpr_write_b32 a33, 4.0
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a34, a33, v[0:31]
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a32
+; CHECK-NEXT: s_nop 15
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33]
+; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35]
+; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37]
+; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39]
+; CHECK-NEXT: v_mov_b64_e32 v[10:11], v[40:41]
+; CHECK-NEXT: v_mov_b64_e32 v[12:13], v[42:43]
+; CHECK-NEXT: v_mov_b64_e32 v[14:15], v[44:45]
+; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[46:47]
+; CHECK-NEXT: v_mov_b64_e32 v[18:19], v[48:49]
+; CHECK-NEXT: v_mov_b64_e32 v[20:21], v[50:51]
+; CHECK-NEXT: v_mov_b64_e32 v[22:23], v[52:53]
+; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55]
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61]
+; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x41800000
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; CHECK-NEXT: s_nop 15
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[2:3] offset:96
+; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[2:3] offset:112
+; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[2:3] offset:64
+; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[2:3] offset:80
+; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
; CHECK-NEXT: s_endpgm
%src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
%mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir
new file mode 100644
index 0000000..33b2f69
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler --misched-prera-direction=topdown -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Check that cycle counts are consistent with hazards.
+
+# CHECK: Cycle: 3 TopQ.A
+# CHECK: hazard: SU(6) HWXDL[0]=9c, is later than CurrCycle = 3c
+# CHECK-NOT: Cycle: 9 TopQ.A
+# CHECK: Cycle: 83 TopQ.A
+# CHECK: Checking pending node SU(6)
+# CHECK: Move SU(6) into Available Q
+
+---
+name: pending_queue_ready_cycle
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr4_sgpr5
+
+ %2:sgpr_128 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %18:areg_512 = IMPLICIT_DEF
+ %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec
+ %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, implicit $exec
+ %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec
+ undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %5.sub0, %14, implicit $exec
+ %7:vreg_512 = COPY %18
+ SCHED_BARRIER 0
+ S_NOP 0, implicit %18, implicit %7, implicit %84
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll
index 71dcf11..196560f 100644
--- a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll
@@ -11,11 +11,11 @@ declare void @f16_user(half)
; CHECK-SAME: in function four64
; CHECK-SAME: Type mismatch between intrinsic and DXIL op
define void @four64() "hlsl.export" {
- %buffer = call target("dx.CBuffer", target("dx.Layout", {double}, 8, 0))
+ %buffer = call target("dx.CBuffer", <{ double }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
%load = call {double, double, double, double} @llvm.dx.resource.load.cbufferrow.4(
- target("dx.CBuffer", target("dx.Layout", {double}, 8, 0)) %buffer,
+ target("dx.CBuffer", <{ double }>) %buffer,
i32 0)
%data = extractvalue {double, double, double, double} %load, 0
@@ -28,11 +28,11 @@ define void @four64() "hlsl.export" {
; CHECK-SAME: in function two32
; CHECK-SAME: Type mismatch between intrinsic and DXIL op
define void @two32() "hlsl.export" {
- %buffer = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %buffer = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
%load = call {float, float} @llvm.dx.resource.load.cbufferrow.2(
- target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) %buffer,
+ target("dx.CBuffer", <{ float }>) %buffer,
i32 0)
%data = extractvalue {float, float} %load, 0
@@ -41,5 +41,5 @@ define void @two32() "hlsl.export" {
ret void
}
-declare { double, double, double, double } @llvm.dx.resource.load.cbufferrow.4.f64.f64.f64.f64.tdx.CBuffer_tdx.Layout_sl_f64s_8_0tt(target("dx.CBuffer", target("dx.Layout", { double }, 8, 0)), i32)
-declare { float, float } @llvm.dx.resource.load.cbufferrow.2.f32.f32.tdx.CBuffer_tdx.Layout_sl_f32s_4_0tt(target("dx.CBuffer", target("dx.Layout", { float }, 4, 0)), i32)
+declare { double, double, double, double } @llvm.dx.resource.load.cbufferrow.4.f64.f64.f64.f64.tdx.CBuffer_sl_f64st(target("dx.CBuffer", <{ double }>), i32)
+declare { float, float } @llvm.dx.resource.load.cbufferrow.2.f32.f32.tdx.CBuffer_sl_f32st(target("dx.CBuffer", <{ float }>), i32)
diff --git a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll
index d690651..dd40aa8 100644
--- a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll
@@ -8,12 +8,12 @@ declare void @f16_user(half)
; CHECK-LABEL: define void @loadf32
define void @loadf32() {
- %buffer = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %buffer = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %{{.*}}, i32 0)
%load = call {float, float, float, float} @llvm.dx.resource.load.cbufferrow.4(
- target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) %buffer,
+ target("dx.CBuffer", <{ float }>) %buffer,
i32 0)
%data = extractvalue {float, float, float, float} %load, 0
@@ -27,12 +27,12 @@ define void @loadf32() {
; CHECK-LABEL: define void @loadf64
define void @loadf64() {
%buffer = call
- target("dx.CBuffer", target("dx.Layout", {double, double, double, double}, 64, 0, 8, 16, 24))
+ target("dx.CBuffer", <{ <4 x double> }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %{{.*}}, i32 1)
%load = call {double, double} @llvm.dx.resource.load.cbufferrow.2(
- target("dx.CBuffer", target("dx.Layout", {double, double, double, double}, 64, 0, 8, 16, 24)) %buffer,
+ target("dx.CBuffer", <{ <4 x double> }>) %buffer,
i32 1)
%data = extractvalue {double, double} %load, 1
@@ -46,12 +46,12 @@ define void @loadf64() {
; CHECK-LABEL: define void @loadf16
define void @loadf16() {
%buffer = call
- target("dx.CBuffer", target("dx.Layout", {half}, 2, 0))
+ target("dx.CBuffer", <{ half }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %{{.*}}, i32 0)
%load = call {half, half, half, half, half, half, half, half} @llvm.dx.resource.load.cbufferrow.8(
- target("dx.CBuffer", target("dx.Layout", {half}, 2, 0)) %buffer,
+ target("dx.CBuffer", <{ half }>) %buffer,
i32 0)
%data = extractvalue {half, half, half, half, half, half, half, half} %load, 0
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll
index bcf82a6..5cd67be 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll
@@ -18,7 +18,7 @@ define void @main() #0 {
%srv0 = call target("dx.RawBuffer", i8, 0, 0)
@llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t(
i32 1, i32 8, i32 1, i32 0, ptr null)
- %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cbuf = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 3, i32 2, i32 1, i32 0, ptr null)
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
index 70224fc..d792078 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
@@ -14,7 +14,7 @@ define void @main() #0 {
; CHECK: Kind: CBuffer
; CHECK: Flags:
; CHECK: UsedByAtomic64: false
- %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cbuf = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 3, i32 2, i32 1, i32 0, ptr null)
; ByteAddressBuffer Buf : register(t8, space1)
diff --git a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
index 38f2de2..671fcef 100644
--- a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
+++ b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
@@ -72,7 +72,7 @@ define void @test_bindings() {
; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BUF5]], %dx.types.ResourceProperties { i32 10, i32 1033 }) #[[#ATTR]]
; cbuffer cb0 : register(b0) { int4 i; float4 f; }
- %cb0 = call target("dx.CBuffer", target("dx.Layout", {<4 x i32>, <4 x float>}, 32, 0, 16))
+ %cb0 = call target("dx.CBuffer", <{ <4 x i32>, <4 x float> }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[BUF6:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 2 }, i32 0, i1 false) #[[#ATTR]]
; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BUF6]], %dx.types.ResourceProperties { i32 13, i32 32 }) #[[#ATTR]]
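
In the annotateHandle checks above, the second word of the resource-properties
pair for the cbuffer appears to be its size in bytes, so it is unchanged by the
retyping: the packed <{ <4 x i32>, <4 x float> }> is 16 + 16 = 32 bytes,
matching the { i32 13, i32 32 } the test still expects.
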
diff --git a/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll b/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll
index 26b157f..d674863 100644
--- a/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll
+++ b/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll
@@ -4,27 +4,27 @@
%__cblayout_CB2 = type <{ float }>
%struct.Scalars = type { float, i32, i32 }
-@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) poison
-@CB2.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) poison
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+@CB2.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB2) poison
define void @main() local_unnamed_addr #1 {
entry:
; CHECK: [[CB:%.*]] = tail call target({{.*}}) @llvm.dx.resource.handlefrombinding
- %h = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
- store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) %h, ptr @CB.cb, align 4
+ %h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+ store target("dx.CBuffer", %__cblayout_CB) %h, ptr @CB.cb, align 4
%_ZL3Out_h.i.i = tail call target("dx.RawBuffer", %struct.Scalars, 1, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK-NOT: load target({{.*}}), ptr @CB.cb
- %cb = load target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)), ptr @CB.cb, align 4
+ %cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4
; CHECK: call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target({{.*}}) [[CB]], i32 0)
- %0 = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4(target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) %cb, i32 0)
+ %0 = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4(target("dx.CBuffer", %__cblayout_CB) %cb, i32 0)
%1 = extractvalue { float, float, float, float } %0, 0
call void @llvm.dx.resource.store.rawbuffer(target("dx.RawBuffer", %struct.Scalars, 1, 0) %_ZL3Out_h.i.i, i32 0, i32 0, float %1)
-
+
; CHECK: [[CB2:%.*]] = tail call target({{.*}}) @llvm.dx.resource.handlefromimplicitbinding
- %h2 = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) @llvm.dx.resource.handlefromimplicitbinding(i32 100, i32 0, i32 1, i32 0, ptr null)
- store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) %h2, ptr @CB2.cb, align 4
+ %h2 = tail call target("dx.CBuffer", %__cblayout_CB2) @llvm.dx.resource.handlefromimplicitbinding(i32 100, i32 0, i32 1, i32 0, ptr null)
+ store target("dx.CBuffer", %__cblayout_CB2) %h2, ptr @CB2.cb, align 4
; CHECK-NOT: load target({{.*}}), ptr @CB2.cb
- %cb2 = load target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)), ptr @CB2.cb, align 4
+ %cb2 = load target("dx.CBuffer", %__cblayout_CB2), ptr @CB2.cb, align 4
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll
index f1d28e2..85952c9 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll
@@ -1,3 +1,6 @@
+; TODO: Remove this test once we've updated the frontend to use explicit
+; padding. The cbuffer-metadata.ll test covers the newer logic.
+
; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s
; RUN: opt -S --passes="dxil-pretty-printer" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll
new file mode 100644
index 0000000..6b90e17
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll
@@ -0,0 +1,89 @@
+; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s
+; RUN: opt -S --passes="dxil-pretty-printer" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
+; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+%__cblayout_CB1 = type <{ float, i32, double, <2 x i32> }>
+@CB1.cb = global target("dx.CBuffer", %__cblayout_CB1) poison
+@CB1.str = private unnamed_addr constant [4 x i8] c"CB1\00", align 1
+
+%__cblayout_CB2 = type <{ float, target("dx.Padding", 4), double, float, half, i16, i64, i32 }>
+@CB2.cb = global target("dx.CBuffer", %__cblayout_CB2) poison
+@CB2.str = private unnamed_addr constant [4 x i8] c"CB2\00", align 1
+
+%__cblayout_MyConstants = type <{
+ double, target("dx.Padding", 8),
+ <3 x float>, float,
+ <3 x double>, half, target("dx.Padding", 6),
+ <2 x double>,
+ float, <3 x half>, <3 x half>
+}>
+@MyConstants.cb = global target("dx.CBuffer", %__cblayout_MyConstants) poison
+@MyConstants.str = private unnamed_addr constant [12 x i8] c"MyConstants\00", align 1
+
+; PRINT:; Resource Bindings:
+; PRINT-NEXT:;
+; PRINT-NEXT:; Name Type Format Dim ID HLSL Bind Count
+; PRINT-NEXT:; ----
+; PRINT-NEXT:; CB1 cbuffer NA NA CB0 cb0 1
+; PRINT-NEXT:; CB2 cbuffer NA NA CB1 cb1 1
+; PRINT-NEXT:; MyConstants cbuffer NA NA CB2 cb5,space15 1
+
+define void @test() #0 {
+
+ ; cbuffer CB1 : register(b0) {
+ ; float a;
+ ; int b;
+ ; double c;
+ ; int2 d;
+ ; }
+ %CB1.cb_h = call target("dx.CBuffer", %__cblayout_CB1)
+ @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr @CB1.str)
+
+ ; cbuffer CB2 : register(b0) {
+ ; float a;
+ ; double b;
+ ; float c;
+ ; half d;
+ ; uint16_t e;
+ ; int64_t f;
+ ; int g;
+ ; }
+ %CB2.cb_h = call target("dx.CBuffer", %__cblayout_CB2)
+ @llvm.dx.resource.handlefrombinding(i32 0, i32 1, i32 1, i32 0, ptr @CB2.str)
+
+ ; cbuffer CB3 : register(b5) {
+ ; double B0;
+ ; float3 B1;
+ ; float B2;
+ ; double3 B3;
+ ; half B4;
+ ; double2 B5;
+ ; float B6;
+ ; half3 B7;
+ ; half3 B8;
+ ; }
+ %CB3.cb_h = call target("dx.CBuffer", %__cblayout_MyConstants)
+ @llvm.dx.resource.handlefrombinding(i32 15, i32 5, i32 1, i32 0, ptr @MyConstants.str)
+
+ ret void
+}
+
+attributes #0 = { noinline nounwind "hlsl.shader"="compute" }
+
+; CHECK: %CBuffer.CB1 = type { { float, i32, double, <2 x i32> } }
+; CHECK: %CBuffer.CB2 = type { { float, double, float, half, i16, i64, i32 } }
+; CHECK: %CBuffer.MyConstants = type { { double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> } }
+
+; CHECK: @CB1 = external constant %CBuffer.CB1
+; CHECK: @CB2 = external constant %CBuffer.CB2
+; CHECK: @MyConstants = external constant %CBuffer.MyConstants
+
+; CHECK: !dx.resources = !{[[ResList:[!][0-9]+]]}
+
+; CHECK: [[ResList]] = !{null, null, [[CBList:[!][0-9]+]], null}
+; CHECK: [[CBList]] = !{![[CB1:[0-9]+]], ![[CB2:[0-9]+]], ![[MYCONSTANTS:[0-9]+]]}
+; CHECK: ![[CB1]] = !{i32 0, ptr @CB1, !"CB1", i32 0, i32 0, i32 1, i32 24, null}
+; CHECK: ![[CB2]] = !{i32 1, ptr @CB2, !"CB2", i32 0, i32 1, i32 1, i32 36, null}
+; CHECK: ![[MYCONSTANTS]] = !{i32 2, ptr @MyConstants, !"MyConstants", i32 15, i32 5, i32 1, i32 96, null}
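
The cbuffer sizes in the metadata checks above follow directly from the packed
layouts: each is the byte sum of the members, explicit padding included.

  CB1:         4 (float) + 4 (i32) + 8 (double) + 8 (<2 x i32>)          = 24
  CB2:         4 + 4 (pad) + 8 + 4 + 2 + 2 + 8 + 4                       = 36
  MyConstants: 8 + 8 (pad) + 12 + 4 + 24 + 2 + 6 (pad) + 16 + 4 + 6 + 6  = 96
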
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
index e2a1c09..0b454c1 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
@@ -7,7 +7,7 @@
target triple = "dxil-pc-shadermodel6.6-compute"
define void @cbuffer_is_only_binding() {
- %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cbuf = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, ptr null)
; CHECK: %CBuffer = type { float }
diff --git a/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
index 153ca10..72f10ae 100644
--- a/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
@@ -1141,29 +1141,88 @@ define <2 x i32> @test_select_cc(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
ret <2 x i32> %r
}
-define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 {
-; CHECK-NOI32X2-LABEL: test_trunc_2xi32(
+define <2 x i16> @test_trunc_2xi32_to_2xi16(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32_to_2xi16(
; CHECK-NOI32X2: {
; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
; CHECK-NOI32X2-EMPTY:
; CHECK-NOI32X2-NEXT: // %bb.0:
-; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_param_0];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_to_2xi16_param_0];
; CHECK-NOI32X2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NOI32X2-NEXT: ret;
;
-; CHECK-I32X2-LABEL: test_trunc_2xi32(
+; CHECK-I32X2-LABEL: test_trunc_2xi32_to_2xi16(
; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<4>;
; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-I32X2-EMPTY:
; CHECK-I32X2-NEXT: // %bb.0:
-; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_param_0];
-; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %rd1;
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_to_2xi16_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-I32X2-NEXT: ret;
%r = trunc <2 x i32> %a to <2 x i16>
ret <2 x i16> %r
}
+define <2 x i8> @test_trunc_2xi32_to_2xi8(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32_to_2xi8(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_to_2xi8_param_0];
+; CHECK-NOI32X2-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NOI32X2-NEXT: cvt.u16.u32 %rs2, %r1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b8 [func_retval0], {%rs2, %rs1};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_trunc_2xi32_to_2xi8(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_to_2xi8_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-I32X2-NEXT: cvt.u16.u32 %rs2, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b8 [func_retval0], {%rs2, %rs1};
+; CHECK-I32X2-NEXT: ret;
+ %r = trunc <2 x i32> %a to <2 x i8>
+ ret <2 x i8> %r
+}
+
+define <2 x i1> @test_trunc_2xi32_to_2xi1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32_to_2xi1(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_to_2xi1_param_0];
+; CHECK-NOI32X2-NEXT: st.param.b8 [func_retval0], %r1;
+; CHECK-NOI32X2-NEXT: st.param.b8 [func_retval0+1], %r2;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_trunc_2xi32_to_2xi1(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_to_2xi1_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: st.param.b8 [func_retval0], %r1;
+; CHECK-I32X2-NEXT: st.param.b8 [func_retval0+1], %r2;
+; CHECK-I32X2-NEXT: ret;
+ %r = trunc <2 x i32> %a to <2 x i1>
+ ret <2 x i1> %r
+}
+
define <2 x i32> @test_trunc_2xi64(<2 x i64> %a) #0 {
; CHECK-LABEL: test_trunc_2xi64(
; CHECK: {
@@ -1180,14 +1239,14 @@ define <2 x i32> @test_trunc_2xi64(<2 x i64> %a) #0 {
ret <2 x i32> %r
}
-define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
-; CHECK-LABEL: test_zext_2xi32(
+define <2 x i32> @test_zext_2xi16_to_2xi32(<2 x i16> %a) #0 {
+; CHECK-LABEL: test_zext_2xi16_to_2xi32(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi16_to_2xi32_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
@@ -1197,6 +1256,47 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
ret <2 x i32> %r
}
+define <2 x i32> @test_zext_2xi8_to_2xi32(<2 x i8> %a) #0 {
+; CHECK-LABEL: test_zext_2xi8_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_zext_2xi8_to_2xi32_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2};
+; CHECK-NEXT: ret;
+ %r = zext <2 x i8> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_zext_2xi1_to_2xi32(<2 x i1> %a) #0 {
+; CHECK-LABEL: test_zext_2xi1_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [test_zext_2xi1_to_2xi32_param_0+1];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.ne.b16 %p2, %rs2, 0;
+; CHECK-NEXT: ld.param.b8 %rs3, [test_zext_2xi1_to_2xi32_param_0];
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: setp.ne.b16 %p1, %rs4, 0;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: and.b32 %r2, %r1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs3;
+; CHECK-NEXT: and.b32 %r4, %r3, 1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r2};
+; CHECK-NEXT: ret;
+ %r = zext <2 x i1> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
define <2 x i64> @test_zext_2xi64(<2 x i32> %a) #0 {
; CHECK-NOI32X2-LABEL: test_zext_2xi64(
; CHECK-NOI32X2: {
@@ -1566,6 +1666,55 @@ entry:
ret void
}
+define <2 x i32> @test_sext_v2i8_to_v2i32 (<2 x i8> %a) {
+; CHECK-LABEL: test_sext_v2i8_to_v2i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_sext_v2i8_to_v2i32_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: cvt.s32.s8 %r3, %r2;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs1;
+; CHECK-NEXT: cvt.s32.s8 %r5, %r4;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r3};
+; CHECK-NEXT: ret;
+ %r = sext <2 x i8> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_sext_v2i16_to_v2i32 (<2 x i16> %a) {
+; CHECK-NOI32X2-LABEL: test_sext_v2i16_to_v2i32(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b16 %rs<2>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.b32 %r1, [test_sext_v2i16_to_v2i32_param_0];
+; CHECK-NOI32X2-NEXT: cvt.s32.s16 %r2, %r1;
+; CHECK-NOI32X2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
+; CHECK-NOI32X2-NEXT: cvt.s32.s16 %r3, %rs1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_sext_v2i16_to_v2i32(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b16 %rs<2>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b32 %r1, [test_sext_v2i16_to_v2i32_param_0];
+; CHECK-I32X2-NEXT: cvt.s32.s16 %r2, %r1;
+; CHECK-I32X2-NEXT: mov.b32 {_, %rs1}, %r1;
+; CHECK-I32X2-NEXT: cvt.s32.s16 %r3, %rs1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3};
+; CHECK-I32X2-NEXT: ret;
+ %r = sext <2 x i16> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
define <2 x float> @test_uitofp_v2i32(<2 x i32> %a) {
; CHECK-NOI32X2-LABEL: test_uitofp_v2i32(
; CHECK-NOI32X2: {
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index f3529b1..22c2d81 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -80,6 +80,7 @@
; RUN: llc -mtriple=riscv32 -mattr=+xwchc %s -o - | FileCheck --check-prefix=RV32XWCHC %s
; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s
; RUN: llc -mtriple=riscv32 -mattr=+zalrsc %s -o - | FileCheck --check-prefix=RV32ZALRSC %s
+; RUN: llc -mtriple=riscv32 -mattr=+zaamo,+zalrsc %s -o - | FileCheck --check-prefixes=CHECK,RV32COMBINEINTOA %s
; RUN: llc -mtriple=riscv32 -mattr=+zca %s -o - | FileCheck --check-prefixes=CHECK,RV32ZCA %s
; RUN: llc -mtriple=riscv32 -mattr=+zcb %s -o - | FileCheck --check-prefixes=CHECK,RV32ZCB %s
; RUN: llc -mtriple=riscv32 -mattr=+zcd %s -o - | FileCheck --check-prefixes=CHECK,RV32ZCD %s
@@ -227,6 +228,7 @@
; RUN: llc -mtriple=riscv64 -mattr=+ztso %s -o - | FileCheck --check-prefixes=CHECK,RV64ZTSO %s
; RUN: llc -mtriple=riscv64 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV64ZAAMO %s
; RUN: llc -mtriple=riscv64 -mattr=+zalrsc %s -o - | FileCheck --check-prefix=RV64ZALRSC %s
+; RUN: llc -mtriple=riscv64 -mattr=+zaamo,+zalrsc %s -o - | FileCheck --check-prefixes=CHECK,RV64COMBINEINTOA %s
; RUN: llc -mtriple=riscv64 -mattr=+zca %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCA %s
; RUN: llc -mtriple=riscv64 -mattr=+zcb %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCB %s
; RUN: llc -mtriple=riscv64 -mattr=+zcd %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCD %s
@@ -392,6 +394,7 @@
; RV32XWCHC: .attribute 5, "rv32i2p1_zca1p0_xwchc2p2"
; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0"
; RV32ZALRSC: .attribute 5, "rv32i2p1_zalrsc1p0"
+; RV32COMBINEINTOA: .attribute 5, "rv32i2p1_a2p1_zaamo1p0_zalrsc1p0"
; RV32ZCA: .attribute 5, "rv32i2p1_zca1p0"
; RV32ZCB: .attribute 5, "rv32i2p1_zca1p0_zcb1p0"
; RV32ZCD: .attribute 5, "rv32i2p1_f2p2_d2p2_zicsr2p0_zca1p0_zcd1p0"
@@ -537,6 +540,7 @@
; RV64ZTSO: .attribute 5, "rv64i2p1_ztso1p0"
; RV64ZAAMO: .attribute 5, "rv64i2p1_zaamo1p0"
; RV64ZALRSC: .attribute 5, "rv64i2p1_zalrsc1p0"
+; RV64COMBINEINTOA: .attribute 5, "rv64i2p1_a2p1_zaamo1p0_zalrsc1p0"
; RV64ZCA: .attribute 5, "rv64i2p1_zca1p0"
; RV64ZCB: .attribute 5, "rv64i2p1_zca1p0_zcb1p0"
; RV64ZCD: .attribute 5, "rv64i2p1_f2p2_d2p2_zicsr2p0_zca1p0_zcd1p0"
diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll
index 9937627..d7b00f6 100644
--- a/llvm/test/CodeGen/RISCV/idiv_large.ll
+++ b/llvm/test/CodeGen/RISCV/idiv_large.ll
@@ -1,16 +1,2315 @@
-; RUN: llc -mtriple=riscv32 < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 < %s | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 < %s | FileCheck %s --check-prefix=RV64
+
+define i64 @udiv_i64(i64 %x, i64 %y) nounwind {
+; RV32-LABEL: udiv_i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: call __udivdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i64:
+; RV64: # %bb.0:
+; RV64-NEXT: tail __udivdi3
+ %res = udiv i64 %x, %y
+ ret i64 %res
+}
+
+define i65 @udiv_i65(i65 %x, i65 %y) nounwind {
+; RV32-LABEL: udiv_i65:
+; RV32: # %bb.0: # %_udiv-special-cases
+; RV32-NEXT: lw a3, 0(a2)
+; RV32-NEXT: lw a4, 4(a2)
+; RV32-NEXT: lw t1, 8(a2)
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: lui a5, 209715
+; RV32-NEXT: lui a6, 61681
+; RV32-NEXT: addi t0, a2, 1365
+; RV32-NEXT: addi a7, a5, 819
+; RV32-NEXT: addi a6, a6, -241
+; RV32-NEXT: srli a2, a4, 1
+; RV32-NEXT: slli a5, t1, 31
+; RV32-NEXT: slli t3, a4, 31
+; RV32-NEXT: or t2, a5, a2
+; RV32-NEXT: srli a2, a3, 1
+; RV32-NEXT: or t4, a2, t3
+; RV32-NEXT: bnez t2, .LBB1_2
+; RV32-NEXT: # %bb.1: # %_udiv-special-cases
+; RV32-NEXT: srli a2, t4, 1
+; RV32-NEXT: or a2, t4, a2
+; RV32-NEXT: srli a5, a2, 2
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 8
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 16
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: and a5, a5, t0
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: and a5, a2, a7
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and a2, a2, a7
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: and a2, a2, a6
+; RV32-NEXT: slli a5, a2, 8
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: slli a5, a2, 16
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: srli a2, a2, 24
+; RV32-NEXT: addi t3, a2, 32
+; RV32-NEXT: j .LBB1_3
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: srli a2, t2, 1
+; RV32-NEXT: or a2, t2, a2
+; RV32-NEXT: srli a5, a2, 2
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 8
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 16
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: and a5, a5, t0
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: and a5, a2, a7
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and a2, a2, a7
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: and a2, a2, a6
+; RV32-NEXT: slli a5, a2, 8
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: slli a5, a2, 16
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: srli t3, a2, 24
+; RV32-NEXT: .LBB1_3: # %_udiv-special-cases
+; RV32-NEXT: addi sp, sp, -96
+; RV32-NEXT: sw s0, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 80(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a2, a3, 31
+; RV32-NEXT: li t5, 64
+; RV32-NEXT: bnez a2, .LBB1_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: li s0, 64
+; RV32-NEXT: j .LBB1_6
+; RV32-NEXT: .LBB1_5:
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 2
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 8
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 16
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: and a5, a5, t0
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: and a5, a2, a7
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and a2, a2, a7
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: and a2, a2, a6
+; RV32-NEXT: slli a5, a2, 8
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: slli a5, a2, 16
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: srli s0, a2, 24
+; RV32-NEXT: .LBB1_6: # %_udiv-special-cases
+; RV32-NEXT: lw a5, 0(a1)
+; RV32-NEXT: lw a2, 4(a1)
+; RV32-NEXT: lw s2, 8(a1)
+; RV32-NEXT: or a1, t4, t2
+; RV32-NEXT: addi s1, s0, 64
+; RV32-NEXT: bnez a1, .LBB1_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: mv t3, s1
+; RV32-NEXT: .LBB1_8: # %_udiv-special-cases
+; RV32-NEXT: snez s4, a1
+; RV32-NEXT: srli a1, a2, 1
+; RV32-NEXT: slli t2, s2, 31
+; RV32-NEXT: slli t4, a2, 31
+; RV32-NEXT: or a1, t2, a1
+; RV32-NEXT: srli t2, a5, 1
+; RV32-NEXT: or t6, t2, t4
+; RV32-NEXT: bnez a1, .LBB1_10
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: srli t2, t6, 1
+; RV32-NEXT: or t2, t6, t2
+; RV32-NEXT: srli t4, t2, 2
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 8
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 16
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: not t2, t2
+; RV32-NEXT: srli t4, t2, 1
+; RV32-NEXT: and t4, t4, t0
+; RV32-NEXT: sub t2, t2, t4
+; RV32-NEXT: and t4, t2, a7
+; RV32-NEXT: srli t2, t2, 2
+; RV32-NEXT: and t2, t2, a7
+; RV32-NEXT: add t2, t4, t2
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: and t2, t2, a6
+; RV32-NEXT: slli t4, t2, 8
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: slli t4, t2, 16
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: srli t2, t2, 24
+; RV32-NEXT: addi s3, t2, 32
+; RV32-NEXT: j .LBB1_11
+; RV32-NEXT: .LBB1_10:
+; RV32-NEXT: srli t2, a1, 1
+; RV32-NEXT: or t2, a1, t2
+; RV32-NEXT: srli t4, t2, 2
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 8
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 16
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: not t2, t2
+; RV32-NEXT: srli t4, t2, 1
+; RV32-NEXT: and t4, t4, t0
+; RV32-NEXT: sub t2, t2, t4
+; RV32-NEXT: and t4, t2, a7
+; RV32-NEXT: srli t2, t2, 2
+; RV32-NEXT: and t2, t2, a7
+; RV32-NEXT: add t2, t4, t2
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: and t2, t2, a6
+; RV32-NEXT: slli t4, t2, 8
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: slli t4, t2, 16
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: srli s3, t2, 24
+; RV32-NEXT: .LBB1_11: # %_udiv-special-cases
+; RV32-NEXT: andi t4, s2, 1
+; RV32-NEXT: andi t1, t1, 1
+; RV32-NEXT: or t2, a3, a4
+; RV32-NEXT: or s2, a5, a2
+; RV32-NEXT: sltu s0, s1, s0
+; RV32-NEXT: slli s1, a5, 31
+; RV32-NEXT: addi s4, s4, -1
+; RV32-NEXT: beqz s1, .LBB1_13
+; RV32-NEXT: # %bb.12:
+; RV32-NEXT: srli t5, s1, 1
+; RV32-NEXT: or t5, s1, t5
+; RV32-NEXT: srli s1, t5, 2
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: srli s1, t5, 4
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: srli s1, t5, 8
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: srli s1, t5, 16
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: not t5, t5
+; RV32-NEXT: srli s1, t5, 1
+; RV32-NEXT: and t0, s1, t0
+; RV32-NEXT: sub t0, t5, t0
+; RV32-NEXT: and t5, t0, a7
+; RV32-NEXT: srli t0, t0, 2
+; RV32-NEXT: and a7, t0, a7
+; RV32-NEXT: add a7, t5, a7
+; RV32-NEXT: srli t0, a7, 4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: and a6, a7, a6
+; RV32-NEXT: slli a7, a6, 8
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: slli a7, a6, 16
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: srli t5, a6, 24
+; RV32-NEXT: .LBB1_13: # %_udiv-special-cases
+; RV32-NEXT: or t0, t2, t1
+; RV32-NEXT: or a6, s2, t4
+; RV32-NEXT: and a7, s4, s0
+; RV32-NEXT: or t6, t6, a1
+; RV32-NEXT: addi s0, t5, 64
+; RV32-NEXT: bnez t6, .LBB1_15
+; RV32-NEXT: # %bb.14: # %_udiv-special-cases
+; RV32-NEXT: mv s3, s0
+; RV32-NEXT: .LBB1_15: # %_udiv-special-cases
+; RV32-NEXT: seqz a1, t0
+; RV32-NEXT: sltu t0, s0, t5
+; RV32-NEXT: snez t5, t6
+; RV32-NEXT: addi t5, t5, -1
+; RV32-NEXT: and t0, t5, t0
+; RV32-NEXT: sltu t5, t3, s3
+; RV32-NEXT: seqz a6, a6
+; RV32-NEXT: mv t6, t5
+; RV32-NEXT: beq a7, t0, .LBB1_17
+; RV32-NEXT: # %bb.16: # %_udiv-special-cases
+; RV32-NEXT: sltu t6, a7, t0
+; RV32-NEXT: .LBB1_17: # %_udiv-special-cases
+; RV32-NEXT: or a1, a1, a6
+; RV32-NEXT: andi a6, t6, 1
+; RV32-NEXT: sub a7, a7, t0
+; RV32-NEXT: sub t5, a7, t5
+; RV32-NEXT: sub a7, t3, s3
+; RV32-NEXT: beqz a6, .LBB1_19
+; RV32-NEXT: # %bb.18: # %_udiv-special-cases
+; RV32-NEXT: mv t0, a6
+; RV32-NEXT: j .LBB1_20
+; RV32-NEXT: .LBB1_19:
+; RV32-NEXT: sltiu t0, a7, 65
+; RV32-NEXT: xori t0, t0, 1
+; RV32-NEXT: snez t3, t5
+; RV32-NEXT: or t0, t0, t3
+; RV32-NEXT: .LBB1_20: # %_udiv-special-cases
+; RV32-NEXT: or t6, a1, t0
+; RV32-NEXT: addi a1, t6, -1
+; RV32-NEXT: and t3, t4, a1
+; RV32-NEXT: and t0, a1, a2
+; RV32-NEXT: and a1, a1, a5
+; RV32-NEXT: bnez t6, .LBB1_30
+; RV32-NEXT: # %bb.21: # %_udiv-special-cases
+; RV32-NEXT: xori t6, a7, 64
+; RV32-NEXT: or t6, t6, a6
+; RV32-NEXT: or t6, t6, t5
+; RV32-NEXT: beqz t6, .LBB1_30
+; RV32-NEXT: # %bb.22: # %udiv-bb1
+; RV32-NEXT: addi a1, a7, 1
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw zero, 36(sp)
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw zero, 44(sp)
+; RV32-NEXT: sw a5, 48(sp)
+; RV32-NEXT: sw a2, 52(sp)
+; RV32-NEXT: sw t4, 56(sp)
+; RV32-NEXT: li t0, 64
+; RV32-NEXT: addi t3, sp, 48
+; RV32-NEXT: neg s1, a7
+; RV32-NEXT: seqz t6, a1
+; RV32-NEXT: sub a7, t0, a7
+; RV32-NEXT: add t5, t5, t6
+; RV32-NEXT: andi t0, a7, 31
+; RV32-NEXT: srli a7, a7, 3
+; RV32-NEXT: or t6, a1, t5
+; RV32-NEXT: xori s2, t0, 31
+; RV32-NEXT: andi a7, a7, 12
+; RV32-NEXT: seqz t0, t6
+; RV32-NEXT: sub s3, t3, a7
+; RV32-NEXT: add a6, a6, t0
+; RV32-NEXT: lw t3, 0(s3)
+; RV32-NEXT: lw s4, 4(s3)
+; RV32-NEXT: andi a7, a6, 1
+; RV32-NEXT: or t6, t6, a7
+; RV32-NEXT: srli a6, t3, 1
+; RV32-NEXT: sll t0, s4, s1
+; RV32-NEXT: srl a6, a6, s2
+; RV32-NEXT: or t0, t0, a6
+; RV32-NEXT: sll a6, t3, s1
+; RV32-NEXT: li t3, 0
+; RV32-NEXT: beqz t6, .LBB1_28
+; RV32-NEXT: # %bb.23: # %udiv-preheader
+; RV32-NEXT: li t6, 0
+; RV32-NEXT: li s0, 0
+; RV32-NEXT: srli s4, s4, 1
+; RV32-NEXT: lw s3, 8(s3)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw zero, 20(sp)
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: sw zero, 28(sp)
+; RV32-NEXT: sw a5, 0(sp)
+; RV32-NEXT: sw a2, 4(sp)
+; RV32-NEXT: sw t4, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: srli a2, a1, 3
+; RV32-NEXT: srl a5, s4, s2
+; RV32-NEXT: mv t4, sp
+; RV32-NEXT: snez t2, t2
+; RV32-NEXT: andi a2, a2, 12
+; RV32-NEXT: add t1, t1, t2
+; RV32-NEXT: add a2, t4, a2
+; RV32-NEXT: lw t2, 0(a2)
+; RV32-NEXT: lw t4, 4(a2)
+; RV32-NEXT: lw a2, 8(a2)
+; RV32-NEXT: sll s1, s3, s1
+; RV32-NEXT: andi s2, a1, 31
+; RV32-NEXT: xori s2, s2, 31
+; RV32-NEXT: or s3, s1, a5
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: slli a5, t4, 1
+; RV32-NEXT: sll a2, a2, s2
+; RV32-NEXT: sll s2, a5, s2
+; RV32-NEXT: srl s1, t4, a1
+; RV32-NEXT: or s1, s1, a2
+; RV32-NEXT: seqz a2, a3
+; RV32-NEXT: sub a2, a4, a2
+; RV32-NEXT: addi a5, t1, 1
+; RV32-NEXT: andi a5, a5, 1
+; RV32-NEXT: andi s3, s3, 1
+; RV32-NEXT: srl t1, t2, a1
+; RV32-NEXT: or s2, t1, s2
+; RV32-NEXT: addi t1, a3, -1
+; RV32-NEXT: j .LBB1_26
+; RV32-NEXT: .LBB1_24: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1
+; RV32-NEXT: sltu t2, a2, s4
+; RV32-NEXT: .LBB1_25: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1
+; RV32-NEXT: srli s1, s1, 31
+; RV32-NEXT: sub t4, a5, s1
+; RV32-NEXT: sub t2, t4, t2
+; RV32-NEXT: slli t2, t2, 31
+; RV32-NEXT: srai s1, t2, 31
+; RV32-NEXT: and s3, s1, a4
+; RV32-NEXT: li t2, 0
+; RV32-NEXT: li t4, 0
+; RV32-NEXT: srli s5, a6, 31
+; RV32-NEXT: sub s4, s4, s3
+; RV32-NEXT: slli s3, t0, 1
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli t0, t0, 31
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: or a6, t3, a6
+; RV32-NEXT: seqz t3, a1
+; RV32-NEXT: or s0, s0, t0
+; RV32-NEXT: or s5, a1, t5
+; RV32-NEXT: sub t5, t5, t3
+; RV32-NEXT: and s6, s1, a3
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi t3, s1, 1
+; RV32-NEXT: or t0, t6, s3
+; RV32-NEXT: sltu t6, s2, s6
+; RV32-NEXT: snez s5, s5
+; RV32-NEXT: andi s3, s0, 1
+; RV32-NEXT: sub s1, s4, t6
+; RV32-NEXT: add a7, a7, s5
+; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: andi a7, a7, 1
+; RV32-NEXT: or t6, a1, t5
+; RV32-NEXT: or s4, t6, a7
+; RV32-NEXT: sub s2, s2, s6
+; RV32-NEXT: li t6, 0
+; RV32-NEXT: li s0, 0
+; RV32-NEXT: beqz s4, .LBB1_29
+; RV32-NEXT: .LBB1_26: # %udiv-do-while
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: srli t2, s2, 31
+; RV32-NEXT: slli t4, s1, 1
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or s4, t4, t2
+; RV32-NEXT: andi t2, s3, 1
+; RV32-NEXT: or s2, s2, t2
+; RV32-NEXT: bne a2, s4, .LBB1_24
+; RV32-NEXT: # %bb.27: # in Loop: Header=BB1_26 Depth=1
+; RV32-NEXT: sltu t2, t1, s2
+; RV32-NEXT: j .LBB1_25
+; RV32-NEXT: .LBB1_28:
+; RV32-NEXT: li t2, 0
+; RV32-NEXT: li t4, 0
+; RV32-NEXT: .LBB1_29: # %udiv-loop-exit
+; RV32-NEXT: srli a2, a6, 31
+; RV32-NEXT: slli a3, t0, 1
+; RV32-NEXT: srli a4, t0, 31
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: or a1, t3, a6
+; RV32-NEXT: or a2, t2, a2
+; RV32-NEXT: or a4, t4, a4
+; RV32-NEXT: or t0, a2, a3
+; RV32-NEXT: andi t3, a4, 1
+; RV32-NEXT: .LBB1_30: # %udiv-end
+; RV32-NEXT: andi a2, t3, 1
+; RV32-NEXT: sw a1, 0(a0)
+; RV32-NEXT: sw t0, 4(a0)
+; RV32-NEXT: sb a2, 8(a0)
+; RV32-NEXT: lw s0, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 80(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 96
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i65:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: andi a1, a1, 1
+; RV64-NEXT: andi a3, a3, 1
+; RV64-NEXT: call __udivti3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %res = udiv i65 %x, %y
+ ret i65 %res
+}
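+
+; The RV32 expansion above is LLVM's generic lowering for integer division at
+; widths with no libcall (hence the %_udiv-special-cases and %udiv-do-while
+; labels). The special cases are resolved with a software ctlz: the repeated
+; lui 349525 / 209715 / 61681 sequences build the usual popcount masks
+; 0x55555555 / 0x33333333 / 0x0f0f0f0f. The quotient itself is then computed
+; one bit per iteration with a restoring shift-subtract loop, roughly the
+; following illustrative C sketch (not the exact code the expansion emits;
+; BITS stands for the operand width and uint for an unsigned integer of that
+; width):
+;
+;   uint q = 0, r = 0;
+;   for (int i = BITS - 1; i >= 0; --i) {
+;     r = (r << 1) | ((x >> i) & 1);  // shift in the next dividend bit
+;     if (r >= y) {                   // subtract the divisor when it fits
+;       r -= y;
+;       q |= (uint)1 << i;
+;     }
+;   }                                 // q = x / y, r = x % y
+;
+; The RV64 version instead clears the undefined upper bits of each operand's
+; high word (the two andi ..., 1 instructions) and reuses the 128-bit libcall
+; __udivti3.
+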
define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
-; CHECK-LABEL: udiv_i128:
-; CHECK: call __udivti3
+; RV32-LABEL: udiv_i128:
+; RV32: # %bb.0: # %_udiv-special-cases
+; RV32-NEXT: addi sp, sp, -160
+; RV32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: lw s8, 0(a2)
+; RV32-NEXT: lw s9, 4(a2)
+; RV32-NEXT: lw s11, 8(a2)
+; RV32-NEXT: lw ra, 12(a2)
+; RV32-NEXT: lui t4, 349525
+; RV32-NEXT: addi t4, t4, 1365
+; RV32-NEXT: lui t3, 209715
+; RV32-NEXT: addi t3, t3, 819
+; RV32-NEXT: lui t2, 61681
+; RV32-NEXT: addi t2, t2, -241
+; RV32-NEXT: bnez s9, .LBB2_2
+; RV32-NEXT: # %bb.1: # %_udiv-special-cases
+; RV32-NEXT: srli a0, s8, 1
+; RV32-NEXT: or a0, s8, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi t6, a0, 32
+; RV32-NEXT: j .LBB2_3
+; RV32-NEXT: .LBB2_2:
+; RV32-NEXT: srli a0, s9, 1
+; RV32-NEXT: or a0, s9, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli t6, a0, 24
+; RV32-NEXT: .LBB2_3: # %_udiv-special-cases
+; RV32-NEXT: lw a6, 4(a1)
+; RV32-NEXT: or s0, s11, ra
+; RV32-NEXT: bnez ra, .LBB2_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: srli a0, s11, 1
+; RV32-NEXT: or a0, s11, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi t5, a0, 32
+; RV32-NEXT: j .LBB2_6
+; RV32-NEXT: .LBB2_5:
+; RV32-NEXT: srli a0, ra, 1
+; RV32-NEXT: or a0, ra, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli t5, a0, 24
+; RV32-NEXT: .LBB2_6: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 12(a1)
+; RV32-NEXT: addi a0, t6, 64
+; RV32-NEXT: bnez s0, .LBB2_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: mv t5, a0
+; RV32-NEXT: .LBB2_8: # %_udiv-special-cases
+; RV32-NEXT: lw t1, 0(a1)
+; RV32-NEXT: lw t0, 8(a1)
+; RV32-NEXT: snez s3, s0
+; RV32-NEXT: bnez a6, .LBB2_10
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: srli a1, t1, 1
+; RV32-NEXT: or a1, t1, a1
+; RV32-NEXT: srli a3, a1, 2
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 8
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 16
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: not a1, a1
+; RV32-NEXT: srli a3, a1, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a1, a1, a3
+; RV32-NEXT: and a3, a1, t3
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: and a1, a1, t3
+; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: and a1, a1, t2
+; RV32-NEXT: slli a3, a1, 8
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: slli a3, a1, 16
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: srli a1, a1, 24
+; RV32-NEXT: addi a3, a1, 32
+; RV32-NEXT: j .LBB2_11
+; RV32-NEXT: .LBB2_10:
+; RV32-NEXT: srli a1, a6, 1
+; RV32-NEXT: or a1, a6, a1
+; RV32-NEXT: srli a3, a1, 2
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 8
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 16
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: not a1, a1
+; RV32-NEXT: srli a3, a1, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a1, a1, a3
+; RV32-NEXT: and a3, a1, t3
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: and a1, a1, t3
+; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: and a1, a1, t2
+; RV32-NEXT: slli a3, a1, 8
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: slli a3, a1, 16
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: srli a3, a1, 24
+; RV32-NEXT: .LBB2_11: # %_udiv-special-cases
+; RV32-NEXT: or a1, s9, ra
+; RV32-NEXT: or s0, s8, s11
+; RV32-NEXT: or s1, a6, a7
+; RV32-NEXT: or s2, t1, t0
+; RV32-NEXT: sltu t6, a0, t6
+; RV32-NEXT: addi s3, s3, -1
+; RV32-NEXT: addi a0, a3, 64
+; RV32-NEXT: or s4, t0, a7
+; RV32-NEXT: sltu s5, a0, a3
+; RV32-NEXT: snez s6, s4
+; RV32-NEXT: addi s6, s6, -1
+; RV32-NEXT: bnez a7, .LBB2_13
+; RV32-NEXT: # %bb.12: # %_udiv-special-cases
+; RV32-NEXT: srli a3, t0, 1
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t4
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t3
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t2
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi a3, a3, 32
+; RV32-NEXT: j .LBB2_14
+; RV32-NEXT: .LBB2_13:
+; RV32-NEXT: srli a3, a7, 1
+; RV32-NEXT: or a3, a7, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t4
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t3
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t2
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: .LBB2_14: # %_udiv-special-cases
+; RV32-NEXT: or s0, s0, a1
+; RV32-NEXT: or a5, s2, s1
+; RV32-NEXT: and a1, s3, t6
+; RV32-NEXT: and a4, s6, s5
+; RV32-NEXT: bnez s4, .LBB2_16
+; RV32-NEXT: # %bb.15: # %_udiv-special-cases
+; RV32-NEXT: mv a3, a0
+; RV32-NEXT: .LBB2_16: # %_udiv-special-cases
+; RV32-NEXT: seqz a0, s0
+; RV32-NEXT: seqz a5, a5
+; RV32-NEXT: sltu t2, t5, a3
+; RV32-NEXT: sub t4, a1, a4
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: beq a1, a4, .LBB2_18
+; RV32-NEXT: # %bb.17: # %_udiv-special-cases
+; RV32-NEXT: sltu t3, a1, a4
+; RV32-NEXT: .LBB2_18: # %_udiv-special-cases
+; RV32-NEXT: sub t2, t4, t2
+; RV32-NEXT: or a0, a0, a5
+; RV32-NEXT: neg t4, t3
+; RV32-NEXT: seqz t6, t3
+; RV32-NEXT: addi t6, t6, -1
+; RV32-NEXT: or a1, t4, t6
+; RV32-NEXT: sub t3, t5, a3
+; RV32-NEXT: beqz a1, .LBB2_20
+; RV32-NEXT: # %bb.19: # %_udiv-special-cases
+; RV32-NEXT: snez a1, a1
+; RV32-NEXT: j .LBB2_21
+; RV32-NEXT: .LBB2_20:
+; RV32-NEXT: snez a1, t2
+; RV32-NEXT: sltiu a3, t3, 128
+; RV32-NEXT: xori a3, a3, 1
+; RV32-NEXT: or a1, a3, a1
+; RV32-NEXT: .LBB2_21: # %_udiv-special-cases
+; RV32-NEXT: or a5, a0, a1
+; RV32-NEXT: addi a3, a5, -1
+; RV32-NEXT: and a0, a3, a7
+; RV32-NEXT: and a1, a3, t0
+; RV32-NEXT: and a4, a3, a6
+; RV32-NEXT: and a3, a3, t1
+; RV32-NEXT: bnez a5, .LBB2_26
+; RV32-NEXT: # %bb.22: # %_udiv-special-cases
+; RV32-NEXT: xori a5, t3, 127
+; RV32-NEXT: or a5, a5, t4
+; RV32-NEXT: or t5, t2, t6
+; RV32-NEXT: or a5, a5, t5
+; RV32-NEXT: beqz a5, .LBB2_26
+; RV32-NEXT: # %bb.23: # %udiv-bb1
+; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi a1, t3, 1
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw zero, 76(sp)
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw zero, 84(sp)
+; RV32-NEXT: sw t1, 88(sp)
+; RV32-NEXT: sw a6, 92(sp)
+; RV32-NEXT: sw t0, 96(sp)
+; RV32-NEXT: sw a7, 100(sp)
+; RV32-NEXT: li a0, 127
+; RV32-NEXT: addi a2, sp, 88
+; RV32-NEXT: seqz a3, a1
+; RV32-NEXT: sub a0, a0, t3
+; RV32-NEXT: add t2, t2, a3
+; RV32-NEXT: andi a3, a0, 31
+; RV32-NEXT: srli a0, a0, 3
+; RV32-NEXT: or a4, a1, t2
+; RV32-NEXT: xori a3, a3, 31
+; RV32-NEXT: andi a0, a0, 12
+; RV32-NEXT: seqz t5, a4
+; RV32-NEXT: sub a2, a2, a0
+; RV32-NEXT: add t5, t4, t5
+; RV32-NEXT: lw a0, 0(a2)
+; RV32-NEXT: lw a4, 4(a2)
+; RV32-NEXT: lw a5, 8(a2)
+; RV32-NEXT: lw a2, 12(a2)
+; RV32-NEXT: sltu t4, t5, t4
+; RV32-NEXT: or s0, a1, t5
+; RV32-NEXT: add t4, t6, t4
+; RV32-NEXT: or t6, t2, t4
+; RV32-NEXT: or s0, s0, t6
+; RV32-NEXT: srli t6, a5, 1
+; RV32-NEXT: srli s1, a4, 1
+; RV32-NEXT: srli s2, a0, 1
+; RV32-NEXT: srl t6, t6, a3
+; RV32-NEXT: srl s1, s1, a3
+; RV32-NEXT: srl a3, s2, a3
+; RV32-NEXT: not t3, t3
+; RV32-NEXT: sll a2, a2, t3
+; RV32-NEXT: or s2, a2, t6
+; RV32-NEXT: sll a2, a5, t3
+; RV32-NEXT: sll a4, a4, t3
+; RV32-NEXT: or s1, a2, s1
+; RV32-NEXT: or t6, a4, a3
+; RV32-NEXT: sll t3, a0, t3
+; RV32-NEXT: bnez s0, .LBB2_27
+; RV32-NEXT: # %bb.24:
+; RV32-NEXT: li s6, 0
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: .LBB2_25: # %udiv-loop-exit
+; RV32-NEXT: srli a0, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or a0, s2, a0
+; RV32-NEXT: srli a1, t6, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: or a1, s1, a1
+; RV32-NEXT: srli a2, t3, 31
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: slli a3, t3, 1
+; RV32-NEXT: or a3, s0, a3
+; RV32-NEXT: or a2, s6, a2
+; RV32-NEXT: or a4, a2, t6
+; RV32-NEXT: or a1, s7, a1
+; RV32-NEXT: or a0, s8, a0
+; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: .LBB2_26: # %udiv-end
+; RV32-NEXT: sw a3, 0(s7)
+; RV32-NEXT: sw a4, 4(s7)
+; RV32-NEXT: sw a1, 8(s7)
+; RV32-NEXT: sw a0, 12(s7)
+; RV32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 160
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB2_27: # %udiv-preheader
+; RV32-NEXT: li s0, 0
+; RV32-NEXT: li s5, 0
+; RV32-NEXT: li s3, 0
+; RV32-NEXT: li s4, 0
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw zero, 60(sp)
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw zero, 68(sp)
+; RV32-NEXT: sw t1, 40(sp)
+; RV32-NEXT: sw a6, 44(sp)
+; RV32-NEXT: sw t0, 48(sp)
+; RV32-NEXT: sw a7, 52(sp)
+; RV32-NEXT: srli a0, a1, 3
+; RV32-NEXT: addi a2, sp, 40
+; RV32-NEXT: andi a0, a0, 12
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: lw a2, 4(a0)
+; RV32-NEXT: lw a3, 8(a0)
+; RV32-NEXT: lw a4, 12(a0)
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: andi a5, a1, 31
+; RV32-NEXT: xori a5, a5, 31
+; RV32-NEXT: slli a6, a4, 1
+; RV32-NEXT: slli a7, a3, 1
+; RV32-NEXT: slli t0, a2, 1
+; RV32-NEXT: sll a6, a6, a5
+; RV32-NEXT: sll a7, a7, a5
+; RV32-NEXT: sll a5, t0, a5
+; RV32-NEXT: seqz t0, s8
+; RV32-NEXT: srl a3, a3, a1
+; RV32-NEXT: or s10, a3, a6
+; RV32-NEXT: or a3, s8, s9
+; RV32-NEXT: sw s9, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sub a6, s9, t0
+; RV32-NEXT: seqz a3, a3
+; RV32-NEXT: srl a2, a2, a1
+; RV32-NEXT: or s9, a2, a7
+; RV32-NEXT: sub a7, s11, a3
+; RV32-NEXT: sw s11, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a2, s11, a3
+; RV32-NEXT: sw ra, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sub a2, ra, a2
+; RV32-NEXT: sw a2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: srl a0, a0, a1
+; RV32-NEXT: srl ra, a4, a1
+; RV32-NEXT: or t1, a0, a5
+; RV32-NEXT: sw s8, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s8, s8, -1
+; RV32-NEXT: sw s8, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: j .LBB2_29
+; RV32-NEXT: .LBB2_28: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: li s6, 0
+; RV32-NEXT: sub a0, a0, a5
+; RV32-NEXT: srli a5, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or a5, s2, a5
+; RV32-NEXT: srli s2, t6, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: or s1, s1, s2
+; RV32-NEXT: srli s2, t3, 31
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: or t6, t6, s2
+; RV32-NEXT: lw a2, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s2, s10, a2
+; RV32-NEXT: or t3, s0, t3
+; RV32-NEXT: sub a2, a3, s2
+; RV32-NEXT: sltu a3, a3, s2
+; RV32-NEXT: lw t0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s0, s10, t0
+; RV32-NEXT: sub t0, s9, s0
+; RV32-NEXT: or s2, a1, t2
+; RV32-NEXT: sub s9, a0, a4
+; RV32-NEXT: seqz a0, a1
+; RV32-NEXT: sub t2, t2, a0
+; RV32-NEXT: or t6, s5, t6
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi s0, s10, 1
+; RV32-NEXT: seqz a0, s2
+; RV32-NEXT: or s1, s3, s1
+; RV32-NEXT: or s2, s4, a5
+; RV32-NEXT: sub s10, a2, ra
+; RV32-NEXT: sltu a2, a2, ra
+; RV32-NEXT: sub a3, t0, a3
+; RV32-NEXT: sltu a4, t5, a0
+; RV32-NEXT: sub t5, t5, a0
+; RV32-NEXT: sub ra, a3, a2
+; RV32-NEXT: sub t4, t4, a4
+; RV32-NEXT: or a0, t2, t4
+; RV32-NEXT: or a2, a1, t5
+; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: sub t1, s11, t1
+; RV32-NEXT: li s5, 0
+; RV32-NEXT: li s3, 0
+; RV32-NEXT: li s4, 0
+; RV32-NEXT: beqz a0, .LBB2_25
+; RV32-NEXT: .LBB2_29: # %udiv-do-while
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: srli a0, t1, 31
+; RV32-NEXT: slli a3, s9, 1
+; RV32-NEXT: slli t1, t1, 1
+; RV32-NEXT: or a0, a3, a0
+; RV32-NEXT: srli a3, s2, 31
+; RV32-NEXT: or s11, t1, a3
+; RV32-NEXT: beq a6, a0, .LBB2_31
+; RV32-NEXT: # %bb.30: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: sltu a4, a6, a0
+; RV32-NEXT: j .LBB2_32
+; RV32-NEXT: .LBB2_31: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: lw a2, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a4, a2, s11
+; RV32-NEXT: .LBB2_32: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: lw a2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli a3, s10, 31
+; RV32-NEXT: slli ra, ra, 1
+; RV32-NEXT: srli a5, s9, 31
+; RV32-NEXT: slli s10, s10, 1
+; RV32-NEXT: or s9, ra, a3
+; RV32-NEXT: or a3, s10, a5
+; RV32-NEXT: sub a5, a7, a3
+; RV32-NEXT: sltu t1, a7, a3
+; RV32-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub s6, t0, s9
+; RV32-NEXT: sltu a4, a5, a4
+; RV32-NEXT: sub a5, s6, t1
+; RV32-NEXT: sub a5, a5, a4
+; RV32-NEXT: srai s10, a5, 31
+; RV32-NEXT: and t1, s10, a2
+; RV32-NEXT: lw a2, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a5, s10, a2
+; RV32-NEXT: sltu a4, s11, t1
+; RV32-NEXT: mv ra, a4
+; RV32-NEXT: beq a0, a5, .LBB2_28
+; RV32-NEXT: # %bb.33: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: sltu ra, a0, a5
+; RV32-NEXT: j .LBB2_28
+;
+; RV64-LABEL: udiv_i128:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: call __udivti3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
%res = udiv i128 %x, %y
ret i128 %res
}
define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
-; CHECK-LABEL: udiv_i129:
-; CHECK-NOT: call{{.*}}div
+; RV32-LABEL: udiv_i129:
+; RV32: # %bb.0: # %_udiv-special-cases
+; RV32-NEXT: addi sp, sp, -240
+; RV32-NEXT: sw ra, 236(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 232(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 228(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 224(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 220(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 216(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 212(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 208(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 204(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 200(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 196(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 192(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 188(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv ra, a0
+; RV32-NEXT: lw t2, 16(a2)
+; RV32-NEXT: lw a4, 0(a2)
+; RV32-NEXT: lw a5, 4(a2)
+; RV32-NEXT: lw a6, 8(a2)
+; RV32-NEXT: lw a0, 12(a2)
+; RV32-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi t5, a0, 1365
+; RV32-NEXT: addi t4, a2, 819
+; RV32-NEXT: addi t3, a3, -241
+; RV32-NEXT: sw a6, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a0, a6, 31
+; RV32-NEXT: srli a2, a5, 1
+; RV32-NEXT: sw a5, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a3, a5, 31
+; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: sw a4, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: srli a2, a4, 1
+; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: bnez a0, .LBB3_2
+; RV32-NEXT: # %bb.1: # %_udiv-special-cases
+; RV32-NEXT: srli a3, a2, 1
+; RV32-NEXT: or a3, a2, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi a6, a3, 32
+; RV32-NEXT: j .LBB3_3
+; RV32-NEXT: .LBB3_2:
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a6, a3, 24
+; RV32-NEXT: .LBB3_3: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli a3, a7, 1
+; RV32-NEXT: slli a5, t2, 31
+; RV32-NEXT: slli a7, a7, 31
+; RV32-NEXT: lw a4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli t0, a4, 1
+; RV32-NEXT: lw a4, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: slli a4, a4, 31
+; RV32-NEXT: li s2, 64
+; RV32-NEXT: bnez a4, .LBB3_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: li t6, 64
+; RV32-NEXT: j .LBB3_6
+; RV32-NEXT: .LBB3_5:
+; RV32-NEXT: srli t1, a4, 1
+; RV32-NEXT: or t1, a4, t1
+; RV32-NEXT: srli t6, t1, 2
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 4
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 8
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 16
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: not t1, t1
+; RV32-NEXT: srli t6, t1, 1
+; RV32-NEXT: and t6, t6, t5
+; RV32-NEXT: sub t1, t1, t6
+; RV32-NEXT: and t6, t1, t4
+; RV32-NEXT: srli t1, t1, 2
+; RV32-NEXT: and t1, t1, t4
+; RV32-NEXT: add t1, t6, t1
+; RV32-NEXT: srli t6, t1, 4
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: and t1, t1, t3
+; RV32-NEXT: slli t6, t1, 8
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: slli t6, t1, 16
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: srli t6, t1, 24
+; RV32-NEXT: .LBB3_6: # %_udiv-special-cases
+; RV32-NEXT: or t1, a5, a3
+; RV32-NEXT: or a7, t0, a7
+; RV32-NEXT: bnez a4, .LBB3_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: li t6, 128
+; RV32-NEXT: .LBB3_8: # %_udiv-special-cases
+; RV32-NEXT: or a5, a7, t1
+; RV32-NEXT: addi a4, a6, 64
+; RV32-NEXT: addi a3, t6, 128
+; RV32-NEXT: or a0, a0, t1
+; RV32-NEXT: or a2, a2, a7
+; RV32-NEXT: or s3, a2, a0
+; RV32-NEXT: sltu s0, a3, t6
+; RV32-NEXT: bnez s3, .LBB3_11
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: mv t6, s0
+; RV32-NEXT: beqz t1, .LBB3_12
+; RV32-NEXT: .LBB3_10:
+; RV32-NEXT: srli a0, t1, 1
+; RV32-NEXT: or a0, t1, a0
+; RV32-NEXT: srli a2, a0, 2
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 8
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 16
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a2, a0, 1
+; RV32-NEXT: and a2, a2, t5
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: and a2, a0, t4
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t4
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: slli a2, a0, 8
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: srli s1, a0, 24
+; RV32-NEXT: beqz a5, .LBB3_13
+; RV32-NEXT: j .LBB3_14
+; RV32-NEXT: .LBB3_11:
+; RV32-NEXT: snez a0, a5
+; RV32-NEXT: sltu a2, a4, a6
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and t6, a0, a2
+; RV32-NEXT: bnez t1, .LBB3_10
+; RV32-NEXT: .LBB3_12: # %_udiv-special-cases
+; RV32-NEXT: srli a0, a7, 1
+; RV32-NEXT: or a0, a7, a0
+; RV32-NEXT: srli a2, a0, 2
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 8
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 16
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a2, a0, 1
+; RV32-NEXT: and a2, a2, t5
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: and a2, a0, t4
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t4
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: slli a2, a0, 8
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi s1, a0, 32
+; RV32-NEXT: bnez a5, .LBB3_14
+; RV32-NEXT: .LBB3_13: # %_udiv-special-cases
+; RV32-NEXT: mv s1, a4
+; RV32-NEXT: .LBB3_14: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 0(a1)
+; RV32-NEXT: lw t0, 4(a1)
+; RV32-NEXT: lw a6, 8(a1)
+; RV32-NEXT: bnez s3, .LBB3_16
+; RV32-NEXT: # %bb.15: # %_udiv-special-cases
+; RV32-NEXT: mv s1, a3
+; RV32-NEXT: .LBB3_16: # %_udiv-special-cases
+; RV32-NEXT: lw t1, 12(a1)
+; RV32-NEXT: lw a1, 16(a1)
+; RV32-NEXT: slli a0, a6, 31
+; RV32-NEXT: srli a2, t0, 1
+; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: slli a2, t0, 31
+; RV32-NEXT: srli a3, a7, 1
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: bnez a0, .LBB3_18
+; RV32-NEXT: # %bb.17: # %_udiv-special-cases
+; RV32-NEXT: srli a3, a2, 1
+; RV32-NEXT: or a3, a2, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi s5, a3, 32
+; RV32-NEXT: j .LBB3_19
+; RV32-NEXT: .LBB3_18:
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli s5, a3, 24
+; RV32-NEXT: .LBB3_19: # %_udiv-special-cases
+; RV32-NEXT: srli a3, t1, 1
+; RV32-NEXT: slli a4, a1, 31
+; RV32-NEXT: slli a5, t1, 31
+; RV32-NEXT: slli s4, a7, 31
+; RV32-NEXT: srli s6, a6, 1
+; RV32-NEXT: beqz s4, .LBB3_21
+; RV32-NEXT: # %bb.20:
+; RV32-NEXT: srli s2, s4, 1
+; RV32-NEXT: or s2, s4, s2
+; RV32-NEXT: srli s7, s2, 2
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 4
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 8
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 16
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: not s2, s2
+; RV32-NEXT: srli s7, s2, 1
+; RV32-NEXT: and s7, s7, t5
+; RV32-NEXT: sub s2, s2, s7
+; RV32-NEXT: and s7, s2, t4
+; RV32-NEXT: srli s2, s2, 2
+; RV32-NEXT: and s2, s2, t4
+; RV32-NEXT: add s2, s7, s2
+; RV32-NEXT: srli s7, s2, 4
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: and s2, s2, t3
+; RV32-NEXT: slli s7, s2, 8
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: slli s7, s2, 16
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: srli s2, s2, 24
+; RV32-NEXT: .LBB3_21: # %_udiv-special-cases
+; RV32-NEXT: or s7, a4, a3
+; RV32-NEXT: or s6, s6, a5
+; RV32-NEXT: bnez s4, .LBB3_23
+; RV32-NEXT: # %bb.22: # %_udiv-special-cases
+; RV32-NEXT: li s2, 128
+; RV32-NEXT: .LBB3_23: # %_udiv-special-cases
+; RV32-NEXT: or s4, s6, s7
+; RV32-NEXT: addi a5, s5, 64
+; RV32-NEXT: addi a3, s2, 128
+; RV32-NEXT: or a0, a0, s7
+; RV32-NEXT: or a4, a2, s6
+; RV32-NEXT: or a4, a4, a0
+; RV32-NEXT: sltu a0, a3, s2
+; RV32-NEXT: bnez a4, .LBB3_26
+; RV32-NEXT: # %bb.24: # %_udiv-special-cases
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: snez s2, s3
+; RV32-NEXT: beqz s7, .LBB3_27
+; RV32-NEXT: .LBB3_25:
+; RV32-NEXT: srli s3, s7, 1
+; RV32-NEXT: or s3, s7, s3
+; RV32-NEXT: srli s5, s3, 2
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 4
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 8
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 16
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: not s3, s3
+; RV32-NEXT: srli s5, s3, 1
+; RV32-NEXT: and t5, s5, t5
+; RV32-NEXT: sub t5, s3, t5
+; RV32-NEXT: and s3, t5, t4
+; RV32-NEXT: srli t5, t5, 2
+; RV32-NEXT: and t4, t5, t4
+; RV32-NEXT: add t4, s3, t4
+; RV32-NEXT: srli t5, t4, 4
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: and t3, t4, t3
+; RV32-NEXT: slli t4, t3, 8
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: slli t4, t3, 16
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: srli t3, t3, 24
+; RV32-NEXT: j .LBB3_28
+; RV32-NEXT: .LBB3_26:
+; RV32-NEXT: snez a2, s4
+; RV32-NEXT: sltu s2, a5, s5
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a2, a2, s2
+; RV32-NEXT: snez s2, s3
+; RV32-NEXT: bnez s7, .LBB3_25
+; RV32-NEXT: .LBB3_27: # %_udiv-special-cases
+; RV32-NEXT: srli s3, s6, 1
+; RV32-NEXT: or s3, s6, s3
+; RV32-NEXT: srli s5, s3, 2
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 4
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 8
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 16
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: not s3, s3
+; RV32-NEXT: srli s5, s3, 1
+; RV32-NEXT: and t5, s5, t5
+; RV32-NEXT: sub t5, s3, t5
+; RV32-NEXT: and s3, t5, t4
+; RV32-NEXT: srli t5, t5, 2
+; RV32-NEXT: and t4, t5, t4
+; RV32-NEXT: add t4, s3, t4
+; RV32-NEXT: srli t5, t4, 4
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: and t3, t4, t3
+; RV32-NEXT: slli t4, t3, 8
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: slli t4, t3, 16
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: srli t3, t3, 24
+; RV32-NEXT: addi t3, t3, 32
+; RV32-NEXT: .LBB3_28: # %_udiv-special-cases
+; RV32-NEXT: xori t4, s0, 1
+; RV32-NEXT: addi s2, s2, -1
+; RV32-NEXT: bnez s4, .LBB3_30
+; RV32-NEXT: # %bb.29: # %_udiv-special-cases
+; RV32-NEXT: mv t3, a5
+; RV32-NEXT: .LBB3_30: # %_udiv-special-cases
+; RV32-NEXT: andi s11, a1, 1
+; RV32-NEXT: andi s8, t2, 1
+; RV32-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s9, a1, a5
+; RV32-NEXT: or t2, a7, a6
+; RV32-NEXT: neg a1, t4
+; RV32-NEXT: and s0, s2, s0
+; RV32-NEXT: bnez a4, .LBB3_32
+; RV32-NEXT: # %bb.31: # %_udiv-special-cases
+; RV32-NEXT: mv t3, a3
+; RV32-NEXT: .LBB3_32: # %_udiv-special-cases
+; RV32-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s10, a3, a5
+; RV32-NEXT: or a5, s9, s8
+; RV32-NEXT: or t4, t0, t1
+; RV32-NEXT: or t5, t2, s11
+; RV32-NEXT: and a1, s0, a1
+; RV32-NEXT: xori a3, a0, 1
+; RV32-NEXT: snez a4, a4
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: addi a4, a4, -1
+; RV32-NEXT: and a0, a4, a0
+; RV32-NEXT: sltu a4, s1, t3
+; RV32-NEXT: and t2, a0, a3
+; RV32-NEXT: mv a3, a4
+; RV32-NEXT: beq t6, a2, .LBB3_34
+; RV32-NEXT: # %bb.33: # %_udiv-special-cases
+; RV32-NEXT: sltu a3, t6, a2
+; RV32-NEXT: .LBB3_34: # %_udiv-special-cases
+; RV32-NEXT: or a0, a5, s10
+; RV32-NEXT: or t5, t5, t4
+; RV32-NEXT: sltu t4, a1, t2
+; RV32-NEXT: mv s0, a3
+; RV32-NEXT: beq a1, t2, .LBB3_36
+; RV32-NEXT: # %bb.35: # %_udiv-special-cases
+; RV32-NEXT: mv s0, t4
+; RV32-NEXT: .LBB3_36: # %_udiv-special-cases
+; RV32-NEXT: seqz a5, a0
+; RV32-NEXT: seqz t5, t5
+; RV32-NEXT: andi a0, s0, 1
+; RV32-NEXT: sub a2, t6, a2
+; RV32-NEXT: sub a1, a1, t2
+; RV32-NEXT: sub t2, a2, a4
+; RV32-NEXT: sltu a2, a1, a3
+; RV32-NEXT: add a2, t4, a2
+; RV32-NEXT: neg t4, a2
+; RV32-NEXT: sub a4, a1, a3
+; RV32-NEXT: or a1, a4, t4
+; RV32-NEXT: sub a3, s1, t3
+; RV32-NEXT: beqz a1, .LBB3_38
+; RV32-NEXT: # %bb.37: # %_udiv-special-cases
+; RV32-NEXT: snez a1, a1
+; RV32-NEXT: or a2, a5, t5
+; RV32-NEXT: bnez a0, .LBB3_39
+; RV32-NEXT: j .LBB3_40
+; RV32-NEXT: .LBB3_38:
+; RV32-NEXT: snez a1, t2
+; RV32-NEXT: sltiu a2, a3, 129
+; RV32-NEXT: xori a2, a2, 1
+; RV32-NEXT: or a1, a2, a1
+; RV32-NEXT: or a2, a5, t5
+; RV32-NEXT: beqz a0, .LBB3_40
+; RV32-NEXT: .LBB3_39: # %_udiv-special-cases
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: .LBB3_40: # %_udiv-special-cases
+; RV32-NEXT: or t6, a2, a1
+; RV32-NEXT: addi a1, t6, -1
+; RV32-NEXT: and a2, s11, a1
+; RV32-NEXT: and a5, a1, t1
+; RV32-NEXT: and t3, a1, a6
+; RV32-NEXT: and t5, a1, t0
+; RV32-NEXT: and a1, a1, a7
+; RV32-NEXT: bnez t6, .LBB3_57
+; RV32-NEXT: # %bb.41: # %_udiv-special-cases
+; RV32-NEXT: or t6, t2, t4
+; RV32-NEXT: xori s0, a3, 128
+; RV32-NEXT: or s0, s0, a0
+; RV32-NEXT: or s0, s0, a4
+; RV32-NEXT: or t6, s0, t6
+; RV32-NEXT: beqz t6, .LBB3_57
+; RV32-NEXT: # %bb.42: # %udiv-bb1
+; RV32-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi a1, a3, 1
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw zero, 140(sp)
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw zero, 148(sp)
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw zero, 124(sp)
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw zero, 132(sp)
+; RV32-NEXT: sw a7, 152(sp)
+; RV32-NEXT: sw t0, 156(sp)
+; RV32-NEXT: sw a6, 160(sp)
+; RV32-NEXT: sw t1, 164(sp)
+; RV32-NEXT: sw s11, 168(sp)
+; RV32-NEXT: li a5, 128
+; RV32-NEXT: addi t3, sp, 152
+; RV32-NEXT: neg a2, a3
+; RV32-NEXT: seqz t5, a1
+; RV32-NEXT: sub a5, a5, a3
+; RV32-NEXT: add t2, t2, t5
+; RV32-NEXT: andi a3, a5, 31
+; RV32-NEXT: srli t5, a5, 3
+; RV32-NEXT: or t6, a1, t2
+; RV32-NEXT: xori a5, a3, 31
+; RV32-NEXT: andi a3, t5, 28
+; RV32-NEXT: seqz t6, t6
+; RV32-NEXT: sub ra, t3, a3
+; RV32-NEXT: add t6, a4, t6
+; RV32-NEXT: lw t3, 0(ra)
+; RV32-NEXT: lw s0, 4(ra)
+; RV32-NEXT: lw s1, 8(ra)
+; RV32-NEXT: lw a3, 12(ra)
+; RV32-NEXT: sltu a4, t6, a4
+; RV32-NEXT: or t5, a1, t6
+; RV32-NEXT: add t4, t4, a4
+; RV32-NEXT: or a4, t2, t4
+; RV32-NEXT: or a4, t5, a4
+; RV32-NEXT: srli t5, s1, 1
+; RV32-NEXT: seqz s2, a4
+; RV32-NEXT: add a0, a0, s2
+; RV32-NEXT: sll s2, a3, a2
+; RV32-NEXT: srl t5, t5, a5
+; RV32-NEXT: or t5, s2, t5
+; RV32-NEXT: srli s2, s0, 1
+; RV32-NEXT: sll s1, s1, a2
+; RV32-NEXT: srl s2, s2, a5
+; RV32-NEXT: or s2, s1, s2
+; RV32-NEXT: srli s1, t3, 1
+; RV32-NEXT: sll s0, s0, a2
+; RV32-NEXT: srl s1, s1, a5
+; RV32-NEXT: andi s3, a0, 1
+; RV32-NEXT: or s1, s0, s1
+; RV32-NEXT: or a0, a4, s3
+; RV32-NEXT: sll t3, t3, a2
+; RV32-NEXT: beqz a0, .LBB3_55
+; RV32-NEXT: # %bb.43: # %udiv-preheader
+; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: lw a0, 16(ra)
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw zero, 108(sp)
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw zero, 116(sp)
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw zero, 92(sp)
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw zero, 100(sp)
+; RV32-NEXT: sw s11, 72(sp)
+; RV32-NEXT: sw zero, 76(sp)
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw zero, 84(sp)
+; RV32-NEXT: sw a7, 56(sp)
+; RV32-NEXT: sw t0, 60(sp)
+; RV32-NEXT: sw a6, 64(sp)
+; RV32-NEXT: sw t1, 68(sp)
+; RV32-NEXT: srli a4, a1, 3
+; RV32-NEXT: addi a6, sp, 56
+; RV32-NEXT: andi a7, a1, 31
+; RV32-NEXT: or t0, s9, s10
+; RV32-NEXT: srl a3, a3, a5
+; RV32-NEXT: andi a4, a4, 28
+; RV32-NEXT: xori a5, a7, 31
+; RV32-NEXT: snez a7, t0
+; RV32-NEXT: add a4, a6, a4
+; RV32-NEXT: add a7, s8, a7
+; RV32-NEXT: lw a6, 16(a4)
+; RV32-NEXT: lw t0, 0(a4)
+; RV32-NEXT: lw t1, 4(a4)
+; RV32-NEXT: lw s0, 8(a4)
+; RV32-NEXT: lw a4, 12(a4)
+; RV32-NEXT: sll a0, a0, a2
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: slli a0, a4, 1
+; RV32-NEXT: slli a2, s0, 1
+; RV32-NEXT: slli s4, t1, 1
+; RV32-NEXT: sll a6, a6, a5
+; RV32-NEXT: sll a0, a0, a5
+; RV32-NEXT: sll s8, a2, a5
+; RV32-NEXT: sll s4, s4, a5
+; RV32-NEXT: srl a2, a4, a1
+; RV32-NEXT: or ra, a2, a6
+; RV32-NEXT: lw a6, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: seqz a4, a6
+; RV32-NEXT: srl a2, s0, a1
+; RV32-NEXT: or a2, a2, a0
+; RV32-NEXT: lw a5, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: or a0, a6, a5
+; RV32-NEXT: sub s5, a5, a4
+; RV32-NEXT: seqz a4, a0
+; RV32-NEXT: srl a0, t1, a1
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub t1, a5, a4
+; RV32-NEXT: sw t1, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a4, a5, a4
+; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub s6, a5, a4
+; RV32-NEXT: andi a4, a7, 1
+; RV32-NEXT: sw a4, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: andi a5, a3, 1
+; RV32-NEXT: srl a3, t0, a1
+; RV32-NEXT: or a4, a3, s4
+; RV32-NEXT: addi a6, a6, -1
+; RV32-NEXT: sw a6, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s11, 0
+; RV32-NEXT: li s10, 0
+; RV32-NEXT: j .LBB3_45
+; RV32-NEXT: .LBB3_44: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s0, a5, s0
+; RV32-NEXT: xor s8, t1, a7
+; RV32-NEXT: xor s9, a2, s0
+; RV32-NEXT: or s8, s9, s8
+; RV32-NEXT: li s9, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: sltu s4, a2, s0
+; RV32-NEXT: sub s0, a2, s0
+; RV32-NEXT: sub a7, t1, a7
+; RV32-NEXT: srli a2, s2, 31
+; RV32-NEXT: sub a0, a0, t0
+; RV32-NEXT: slli t0, t5, 1
+; RV32-NEXT: or t0, t0, a2
+; RV32-NEXT: srli a2, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or t1, s2, a2
+; RV32-NEXT: srli a2, t3, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: or s1, s1, a2
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: lw a2, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: or t3, a2, t3
+; RV32-NEXT: srli a2, t5, 31
+; RV32-NEXT: or s7, s7, a2
+; RV32-NEXT: sub a2, s0, ra
+; RV32-NEXT: sltu s0, s0, ra
+; RV32-NEXT: or t5, a1, t6
+; RV32-NEXT: sub a7, a7, s4
+; RV32-NEXT: or s2, t2, t4
+; RV32-NEXT: sub a0, a0, a6
+; RV32-NEXT: or a6, a1, t2
+; RV32-NEXT: or s4, t5, s2
+; RV32-NEXT: seqz t5, a1
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi a5, a5, 1
+; RV32-NEXT: sw a5, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: seqz a6, a6
+; RV32-NEXT: sub t2, t2, t5
+; RV32-NEXT: lw a5, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s1, a5, s1
+; RV32-NEXT: lw a5, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s2, a5, t1
+; RV32-NEXT: lw a5, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: or t5, a5, t0
+; RV32-NEXT: andi a5, s7, 1
+; RV32-NEXT: sub ra, a7, s0
+; RV32-NEXT: snez a7, s4
+; RV32-NEXT: sltu t0, t6, a6
+; RV32-NEXT: sub t6, t6, a6
+; RV32-NEXT: add a7, s3, a7
+; RV32-NEXT: sub t4, t4, t0
+; RV32-NEXT: or a6, a1, t6
+; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: or t0, t2, t4
+; RV32-NEXT: andi s3, a7, 1
+; RV32-NEXT: or a6, a6, t0
+; RV32-NEXT: or a6, a6, s3
+; RV32-NEXT: sub a4, a4, a3
+; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: beqz a6, .LBB3_56
+; RV32-NEXT: .LBB3_45: # %udiv-do-while
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: srli a3, a2, 31
+; RV32-NEXT: slli a6, ra, 1
+; RV32-NEXT: or t1, a6, a3
+; RV32-NEXT: srli a3, a0, 31
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: beq s6, t1, .LBB3_47
+; RV32-NEXT: # %bb.46: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: sltu a3, s6, t1
+; RV32-NEXT: j .LBB3_48
+; RV32-NEXT: .LBB3_47: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a3, a3, a2
+; RV32-NEXT: .LBB3_48: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: srli a6, a4, 31
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: or a0, a0, a6
+; RV32-NEXT: andi a5, a5, 1
+; RV32-NEXT: or a4, a4, a5
+; RV32-NEXT: beq s5, a0, .LBB3_50
+; RV32-NEXT: # %bb.49: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: sltu a5, s5, a0
+; RV32-NEXT: j .LBB3_51
+; RV32-NEXT: .LBB3_50: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw a5, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a5, a5, a4
+; RV32-NEXT: .LBB3_51: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw a6, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: xor a6, a6, a2
+; RV32-NEXT: xor a7, s6, t1
+; RV32-NEXT: or a6, a6, a7
+; RV32-NEXT: beqz a6, .LBB3_53
+; RV32-NEXT: # %bb.52: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: mv a5, a3
+; RV32-NEXT: .LBB3_53: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: srli a3, ra, 31
+; RV32-NEXT: lw a6, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub a3, a6, a3
+; RV32-NEXT: sub a3, a3, a5
+; RV32-NEXT: slli a3, a3, 31
+; RV32-NEXT: srai a5, a3, 31
+; RV32-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a7, a5, a3
+; RV32-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a3, a5, a3
+; RV32-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: and t0, a5, a6
+; RV32-NEXT: sltu a6, a4, a3
+; RV32-NEXT: mv ra, a6
+; RV32-NEXT: beq a0, t0, .LBB3_44
+; RV32-NEXT: # %bb.54: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: sltu ra, a0, t0
+; RV32-NEXT: j .LBB3_44
+; RV32-NEXT: .LBB3_55:
+; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s11, 0
+; RV32-NEXT: li s9, 0
+; RV32-NEXT: li s10, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: .LBB3_56: # %udiv-loop-exit
+; RV32-NEXT: srli a0, s2, 31
+; RV32-NEXT: slli a1, t5, 1
+; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: srli a1, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or a2, s2, a1
+; RV32-NEXT: srli a3, t3, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: srli a4, t5, 31
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: lw a1, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: or a1, a1, t3
+; RV32-NEXT: or a3, s11, a3
+; RV32-NEXT: or a4, s8, a4
+; RV32-NEXT: or t5, a3, s1
+; RV32-NEXT: or t3, s9, a2
+; RV32-NEXT: or a5, s10, a0
+; RV32-NEXT: andi a2, a4, 1
+; RV32-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: .LBB3_57: # %udiv-end
+; RV32-NEXT: sw a1, 0(ra)
+; RV32-NEXT: sw t5, 4(ra)
+; RV32-NEXT: sw t3, 8(ra)
+; RV32-NEXT: sw a5, 12(ra)
+; RV32-NEXT: andi a2, a2, 1
+; RV32-NEXT: sb a2, 16(ra)
+; RV32-NEXT: lw ra, 236(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 232(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 228(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 224(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 220(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 216(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 212(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 208(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 204(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 200(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 196(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 192(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 188(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 240
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i129:
+; RV64: # %bb.0: # %_udiv-special-cases
+; RV64-NEXT: ld a3, 0(a2)
+; RV64-NEXT: ld a4, 8(a2)
+; RV64-NEXT: ld t1, 16(a2)
+; RV64-NEXT: lui a2, 349525
+; RV64-NEXT: lui a5, 209715
+; RV64-NEXT: lui a6, 61681
+; RV64-NEXT: addi t0, a2, 1365
+; RV64-NEXT: addi a7, a5, 819
+; RV64-NEXT: addi a6, a6, -241
+; RV64-NEXT: slli a2, t0, 32
+; RV64-NEXT: slli a5, a7, 32
+; RV64-NEXT: slli t2, a6, 32
+; RV64-NEXT: add t0, t0, a2
+; RV64-NEXT: add a7, a7, a5
+; RV64-NEXT: add a6, a6, t2
+; RV64-NEXT: srli a2, a4, 1
+; RV64-NEXT: slli a5, t1, 63
+; RV64-NEXT: slli t2, a4, 63
+; RV64-NEXT: or t3, a5, a2
+; RV64-NEXT: srli a2, a3, 1
+; RV64-NEXT: or t4, a2, t2
+; RV64-NEXT: bnez t3, .LBB3_2
+; RV64-NEXT: # %bb.1: # %_udiv-special-cases
+; RV64-NEXT: srli a2, t4, 1
+; RV64-NEXT: or a2, t4, a2
+; RV64-NEXT: srli a5, a2, 2
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 8
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 16
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 32
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: and a5, a5, t0
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: and a5, a2, a7
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a2, a2, a7
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: slli a5, a2, 8
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: srli a2, a2, 56
+; RV64-NEXT: addi t2, a2, 64
+; RV64-NEXT: j .LBB3_3
+; RV64-NEXT: .LBB3_2:
+; RV64-NEXT: srli a2, t3, 1
+; RV64-NEXT: or a2, t3, a2
+; RV64-NEXT: srli a5, a2, 2
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 8
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 16
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 32
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: and a5, a5, t0
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: and a5, a2, a7
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a2, a2, a7
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: slli a5, a2, 8
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: srli t2, a2, 56
+; RV64-NEXT: .LBB3_3: # %_udiv-special-cases
+; RV64-NEXT: addi sp, sp, -192
+; RV64-NEXT: sd s0, 184(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 176(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 168(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 160(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 152(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 144(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 136(sp) # 8-byte Folded Spill
+; RV64-NEXT: slli a2, a3, 63
+; RV64-NEXT: li t5, 128
+; RV64-NEXT: bnez a2, .LBB3_5
+; RV64-NEXT: # %bb.4: # %_udiv-special-cases
+; RV64-NEXT: li s0, 128
+; RV64-NEXT: j .LBB3_6
+; RV64-NEXT: .LBB3_5:
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 2
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 8
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 16
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 32
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: and a5, a5, t0
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: and a5, a2, a7
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a2, a2, a7
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: slli a5, a2, 8
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: srli s0, a2, 56
+; RV64-NEXT: .LBB3_6: # %_udiv-special-cases
+; RV64-NEXT: ld a5, 0(a1)
+; RV64-NEXT: ld a2, 8(a1)
+; RV64-NEXT: ld s2, 16(a1)
+; RV64-NEXT: or a1, t4, t3
+; RV64-NEXT: addi s1, s0, 128
+; RV64-NEXT: bnez a1, .LBB3_8
+; RV64-NEXT: # %bb.7: # %_udiv-special-cases
+; RV64-NEXT: mv t2, s1
+; RV64-NEXT: .LBB3_8: # %_udiv-special-cases
+; RV64-NEXT: snez s3, a1
+; RV64-NEXT: srli a1, a2, 1
+; RV64-NEXT: slli t3, s2, 63
+; RV64-NEXT: slli t4, a2, 63
+; RV64-NEXT: or a1, t3, a1
+; RV64-NEXT: srli t3, a5, 1
+; RV64-NEXT: or t6, t3, t4
+; RV64-NEXT: bnez a1, .LBB3_10
+; RV64-NEXT: # %bb.9: # %_udiv-special-cases
+; RV64-NEXT: srli t3, t6, 1
+; RV64-NEXT: or t3, t6, t3
+; RV64-NEXT: srli t4, t3, 2
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 8
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 16
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 32
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: not t3, t3
+; RV64-NEXT: srli t4, t3, 1
+; RV64-NEXT: and t4, t4, t0
+; RV64-NEXT: sub t3, t3, t4
+; RV64-NEXT: and t4, t3, a7
+; RV64-NEXT: srli t3, t3, 2
+; RV64-NEXT: and t3, t3, a7
+; RV64-NEXT: add t3, t4, t3
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: and t3, t3, a6
+; RV64-NEXT: slli t4, t3, 8
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 16
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 32
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: srli t3, t3, 56
+; RV64-NEXT: addi s4, t3, 64
+; RV64-NEXT: j .LBB3_11
+; RV64-NEXT: .LBB3_10:
+; RV64-NEXT: srli t3, a1, 1
+; RV64-NEXT: or t3, a1, t3
+; RV64-NEXT: srli t4, t3, 2
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 8
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 16
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 32
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: not t3, t3
+; RV64-NEXT: srli t4, t3, 1
+; RV64-NEXT: and t4, t4, t0
+; RV64-NEXT: sub t3, t3, t4
+; RV64-NEXT: and t4, t3, a7
+; RV64-NEXT: srli t3, t3, 2
+; RV64-NEXT: and t3, t3, a7
+; RV64-NEXT: add t3, t4, t3
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: and t3, t3, a6
+; RV64-NEXT: slli t4, t3, 8
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 16
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 32
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: srli s4, t3, 56
+; RV64-NEXT: .LBB3_11: # %_udiv-special-cases
+; RV64-NEXT: andi t4, s2, 1
+; RV64-NEXT: andi t1, t1, 1
+; RV64-NEXT: or t3, a3, a4
+; RV64-NEXT: or s2, a5, a2
+; RV64-NEXT: sltu s0, s1, s0
+; RV64-NEXT: slli s1, a5, 63
+; RV64-NEXT: addi s3, s3, -1
+; RV64-NEXT: beqz s1, .LBB3_13
+; RV64-NEXT: # %bb.12:
+; RV64-NEXT: srli t5, s1, 1
+; RV64-NEXT: or t5, s1, t5
+; RV64-NEXT: srli s1, t5, 2
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 4
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 8
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 16
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 32
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: not t5, t5
+; RV64-NEXT: srli s1, t5, 1
+; RV64-NEXT: and t0, s1, t0
+; RV64-NEXT: sub t0, t5, t0
+; RV64-NEXT: and t5, t0, a7
+; RV64-NEXT: srli t0, t0, 2
+; RV64-NEXT: and a7, t0, a7
+; RV64-NEXT: add a7, t5, a7
+; RV64-NEXT: srli t0, a7, 4
+; RV64-NEXT: add a7, a7, t0
+; RV64-NEXT: and a6, a7, a6
+; RV64-NEXT: slli a7, a6, 8
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: slli a7, a6, 16
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: slli a7, a6, 32
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: srli t5, a6, 56
+; RV64-NEXT: .LBB3_13: # %_udiv-special-cases
+; RV64-NEXT: or t0, t3, t1
+; RV64-NEXT: or a6, s2, t4
+; RV64-NEXT: and a7, s3, s0
+; RV64-NEXT: or t6, t6, a1
+; RV64-NEXT: addi s0, t5, 128
+; RV64-NEXT: bnez t6, .LBB3_15
+; RV64-NEXT: # %bb.14: # %_udiv-special-cases
+; RV64-NEXT: mv s4, s0
+; RV64-NEXT: .LBB3_15: # %_udiv-special-cases
+; RV64-NEXT: seqz a1, t0
+; RV64-NEXT: sltu t0, s0, t5
+; RV64-NEXT: snez t5, t6
+; RV64-NEXT: addi t5, t5, -1
+; RV64-NEXT: and t0, t5, t0
+; RV64-NEXT: sltu t5, t2, s4
+; RV64-NEXT: seqz a6, a6
+; RV64-NEXT: mv t6, t5
+; RV64-NEXT: beq a7, t0, .LBB3_17
+; RV64-NEXT: # %bb.16: # %_udiv-special-cases
+; RV64-NEXT: sltu t6, a7, t0
+; RV64-NEXT: .LBB3_17: # %_udiv-special-cases
+; RV64-NEXT: or a1, a1, a6
+; RV64-NEXT: andi a6, t6, 1
+; RV64-NEXT: sub a7, a7, t0
+; RV64-NEXT: sub t5, a7, t5
+; RV64-NEXT: sub a7, t2, s4
+; RV64-NEXT: beqz a6, .LBB3_19
+; RV64-NEXT: # %bb.18: # %_udiv-special-cases
+; RV64-NEXT: mv t0, a6
+; RV64-NEXT: j .LBB3_20
+; RV64-NEXT: .LBB3_19:
+; RV64-NEXT: sltiu t0, a7, 129
+; RV64-NEXT: xori t0, t0, 1
+; RV64-NEXT: snez t2, t5
+; RV64-NEXT: or t0, t0, t2
+; RV64-NEXT: .LBB3_20: # %_udiv-special-cases
+; RV64-NEXT: or t6, a1, t0
+; RV64-NEXT: addi a1, t6, -1
+; RV64-NEXT: and t2, t4, a1
+; RV64-NEXT: and t0, a1, a2
+; RV64-NEXT: and a1, a1, a5
+; RV64-NEXT: bnez t6, .LBB3_30
+; RV64-NEXT: # %bb.21: # %_udiv-special-cases
+; RV64-NEXT: xori t6, a7, 128
+; RV64-NEXT: or t6, t6, a6
+; RV64-NEXT: or t6, t6, t5
+; RV64-NEXT: beqz t6, .LBB3_30
+; RV64-NEXT: # %bb.22: # %udiv-bb1
+; RV64-NEXT: addi a1, a7, 1
+; RV64-NEXT: sd zero, 64(sp)
+; RV64-NEXT: sd zero, 72(sp)
+; RV64-NEXT: sd zero, 80(sp)
+; RV64-NEXT: sd zero, 88(sp)
+; RV64-NEXT: sd a5, 96(sp)
+; RV64-NEXT: sd a2, 104(sp)
+; RV64-NEXT: sd t4, 112(sp)
+; RV64-NEXT: li t0, 128
+; RV64-NEXT: addi t2, sp, 96
+; RV64-NEXT: neg s1, a7
+; RV64-NEXT: seqz t6, a1
+; RV64-NEXT: sub a7, t0, a7
+; RV64-NEXT: add t5, t5, t6
+; RV64-NEXT: andi t0, a7, 63
+; RV64-NEXT: srli a7, a7, 3
+; RV64-NEXT: or t6, a1, t5
+; RV64-NEXT: xori s2, t0, 63
+; RV64-NEXT: andi a7, a7, 24
+; RV64-NEXT: seqz t0, t6
+; RV64-NEXT: sub s3, t2, a7
+; RV64-NEXT: add a6, a6, t0
+; RV64-NEXT: ld t2, 0(s3)
+; RV64-NEXT: ld s4, 8(s3)
+; RV64-NEXT: andi a7, a6, 1
+; RV64-NEXT: or t6, t6, a7
+; RV64-NEXT: srli a6, t2, 1
+; RV64-NEXT: sll t0, s4, s1
+; RV64-NEXT: srl a6, a6, s2
+; RV64-NEXT: or t0, t0, a6
+; RV64-NEXT: sll a6, t2, s1
+; RV64-NEXT: li t2, 0
+; RV64-NEXT: beqz t6, .LBB3_28
+; RV64-NEXT: # %bb.23: # %udiv-preheader
+; RV64-NEXT: li t6, 0
+; RV64-NEXT: li s0, 0
+; RV64-NEXT: srli s4, s4, 1
+; RV64-NEXT: ld s3, 16(s3)
+; RV64-NEXT: sd zero, 32(sp)
+; RV64-NEXT: sd zero, 40(sp)
+; RV64-NEXT: sd zero, 48(sp)
+; RV64-NEXT: sd zero, 56(sp)
+; RV64-NEXT: sd a5, 0(sp)
+; RV64-NEXT: sd a2, 8(sp)
+; RV64-NEXT: sd t4, 16(sp)
+; RV64-NEXT: sd zero, 24(sp)
+; RV64-NEXT: srli a2, a1, 3
+; RV64-NEXT: srl a5, s4, s2
+; RV64-NEXT: mv t4, sp
+; RV64-NEXT: snez t3, t3
+; RV64-NEXT: andi a2, a2, 24
+; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: add a2, t4, a2
+; RV64-NEXT: ld t3, 0(a2)
+; RV64-NEXT: ld t4, 8(a2)
+; RV64-NEXT: ld a2, 16(a2)
+; RV64-NEXT: sll s1, s3, s1
+; RV64-NEXT: andi s2, a1, 63
+; RV64-NEXT: xori s2, s2, 63
+; RV64-NEXT: or s3, s1, a5
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: slli a5, t4, 1
+; RV64-NEXT: sll a2, a2, s2
+; RV64-NEXT: sll s2, a5, s2
+; RV64-NEXT: srl s1, t4, a1
+; RV64-NEXT: or s1, s1, a2
+; RV64-NEXT: seqz a2, a3
+; RV64-NEXT: sub a2, a4, a2
+; RV64-NEXT: addi a5, t1, 1
+; RV64-NEXT: andi a5, a5, 1
+; RV64-NEXT: andi s3, s3, 1
+; RV64-NEXT: srl t1, t3, a1
+; RV64-NEXT: or s2, t1, s2
+; RV64-NEXT: addi t1, a3, -1
+; RV64-NEXT: j .LBB3_26
+; RV64-NEXT: .LBB3_24: # %udiv-do-while
+; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1
+; RV64-NEXT: sltu t3, a2, s4
+; RV64-NEXT: .LBB3_25: # %udiv-do-while
+; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1
+; RV64-NEXT: srli s1, s1, 63
+; RV64-NEXT: sub t4, a5, s1
+; RV64-NEXT: sub t3, t4, t3
+; RV64-NEXT: slli t3, t3, 63
+; RV64-NEXT: srai s1, t3, 63
+; RV64-NEXT: and s3, s1, a4
+; RV64-NEXT: li t3, 0
+; RV64-NEXT: li t4, 0
+; RV64-NEXT: srli s5, a6, 63
+; RV64-NEXT: sub s4, s4, s3
+; RV64-NEXT: slli s3, t0, 1
+; RV64-NEXT: or s3, s3, s5
+; RV64-NEXT: srli t0, t0, 63
+; RV64-NEXT: slli a6, a6, 1
+; RV64-NEXT: or a6, t2, a6
+; RV64-NEXT: seqz t2, a1
+; RV64-NEXT: or s0, s0, t0
+; RV64-NEXT: or s5, a1, t5
+; RV64-NEXT: sub t5, t5, t2
+; RV64-NEXT: and s6, s1, a3
+; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: andi t2, s1, 1
+; RV64-NEXT: or t0, t6, s3
+; RV64-NEXT: sltu t6, s2, s6
+; RV64-NEXT: snez s5, s5
+; RV64-NEXT: andi s3, s0, 1
+; RV64-NEXT: sub s1, s4, t6
+; RV64-NEXT: add a7, a7, s5
+; RV64-NEXT: addi a7, a7, 1
+; RV64-NEXT: andi a7, a7, 1
+; RV64-NEXT: or t6, a1, t5
+; RV64-NEXT: or s4, t6, a7
+; RV64-NEXT: sub s2, s2, s6
+; RV64-NEXT: li t6, 0
+; RV64-NEXT: li s0, 0
+; RV64-NEXT: beqz s4, .LBB3_29
+; RV64-NEXT: .LBB3_26: # %udiv-do-while
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: srli t3, s2, 63
+; RV64-NEXT: slli t4, s1, 1
+; RV64-NEXT: slli s2, s2, 1
+; RV64-NEXT: or s4, t4, t3
+; RV64-NEXT: andi t3, s3, 1
+; RV64-NEXT: or s2, s2, t3
+; RV64-NEXT: bne a2, s4, .LBB3_24
+; RV64-NEXT: # %bb.27: # in Loop: Header=BB3_26 Depth=1
+; RV64-NEXT: sltu t3, t1, s2
+; RV64-NEXT: j .LBB3_25
+; RV64-NEXT: .LBB3_28:
+; RV64-NEXT: li t3, 0
+; RV64-NEXT: li t4, 0
+; RV64-NEXT: .LBB3_29: # %udiv-loop-exit
+; RV64-NEXT: srli a2, a6, 63
+; RV64-NEXT: slli a3, t0, 1
+; RV64-NEXT: srli a4, t0, 63
+; RV64-NEXT: slli a6, a6, 1
+; RV64-NEXT: or a1, t2, a6
+; RV64-NEXT: or a2, t3, a2
+; RV64-NEXT: or a4, t4, a4
+; RV64-NEXT: or t0, a2, a3
+; RV64-NEXT: andi t2, a4, 1
+; RV64-NEXT: .LBB3_30: # %udiv-end
+; RV64-NEXT: andi a2, t2, 1
+; RV64-NEXT: sd a1, 0(a0)
+; RV64-NEXT: sd t0, 8(a0)
+; RV64-NEXT: sb a2, 16(a0)
+; RV64-NEXT: ld s0, 184(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 176(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 168(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 160(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 152(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 144(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 136(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 192
+; RV64-NEXT: ret
%res = udiv i129 %x, %y
ret i129 %res
}
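The CHECK bodies above repeat a pair of idioms worth decoding. Because the RUN lines do not enable a bit-manipulation extension, count-leading-zeros is expanded inline: smear the highest set bit downward with or/srli, invert, then popcount via the 0x55555555 / 0x33333333 / 0x0F0F0F0F masks (materialized by the lui 349525 / 209715 / 61681 constants). The udiv-do-while loop that follows is a one-bit-per-iteration shift-subtract divide over the i129 limbs. A minimal C sketch of both 32-bit building blocks, written only to make the assembly readable (clz32 and udiv_soft are illustrative names, not LLVM code):

#include <stdint.h>

/* Branch-free count-leading-zeros: the smear + popcount sequence
   repeated throughout the RV32 CHECK lines above. */
static uint32_t clz32(uint32_t x) {
  x |= x >> 1;                                      /* smear the top set bit down */
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  x = ~x;                                           /* leading zeros == popcount(~x) */
  x = x - ((x >> 1) & 0x55555555u);                 /* 2-bit sums */
  x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u); /* 4-bit sums */
  x = (x + (x >> 4)) & 0x0F0F0F0Fu;                 /* byte sums */
  x += x << 8;                                      /* accumulate bytes... */
  x += x << 16;                                     /* ...into the top byte */
  return x >> 24;
}

/* Shape of the udiv-do-while loop: shift-subtract division,
   one quotient bit per iteration (assumes d != 0). */
static uint32_t udiv_soft(uint32_t n, uint32_t d, uint32_t *rem) {
  uint32_t q = 0, r = 0;
  for (int i = 31; i >= 0; i--) {
    r = (r << 1) | ((n >> i) & 1u); /* bring in the next dividend bit */
    if (r >= d) {                   /* trial subtract */
      r -= d;
      q |= 1u << i;
    }
  }
  *rem = r;
  return q;
}

The i129 case runs the same loop across five 32-bit (or three 64-bit) limbs, which is why the special-case block and the spill/reload traffic above are so large.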
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll
new file mode 100644
index 0000000..9716cbe
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s
+
+target triple = "wasm32"
+; relaxed_dot stands for i16x8.relaxed_dot_i8x16_i7x16_s, as in the .td definitions
+; relaxed_dot_add stands for i32x4.relaxed_dot_i8x16_i7x16_add_s, as in the .td definitions
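+; The positive tests below exercise the pattern: sign-extend both i8 inputs to
+; i16, multiply, then add the even and odd lanes pairwise via two shuffles;
+; this selects to relaxed_dot. The zext and wrong-shuffle variants are
+; negative tests and must not match.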
+
+define <8 x i16> @relaxed_dot_sext_1(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_sext_1:
+; CHECK: .functype relaxed_dot_sext_1 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1
+; CHECK-NEXT: return $pop0
+ %sext1 = sext <16 x i8> %a to <16 x i16>
+ %sext2 = sext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %sext1, %sext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @relaxed_dot_sext_2(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_sext_2:
+; CHECK: .functype relaxed_dot_sext_2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1
+; CHECK-NEXT: return $pop0
+ %sext1 = sext <16 x i8> %a to <16 x i16>
+ %sext2 = sext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %sext1, %sext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle2, %shuffle1
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @relaxed_dot_sext_self(<16 x i8> %v) {
+; CHECK-LABEL: relaxed_dot_sext_self:
+; CHECK: .functype relaxed_dot_sext_self (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $0
+; CHECK-NEXT: return $pop0
+ %sext = sext <16 x i8> %v to <16 x i16>
+ %mul = mul <16 x i16> %sext, %sext
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
+
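+; relaxed_dot_add_from_relaxed_dot checks that an existing i16x8 relaxed_dot,
+; a pairwise widening add to i32x4, and the accumulator add fuse into a single
+; i32x4.relaxed_dot_i8x16_i7x16_add_s.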
+define <4 x i32> @relaxed_dot_add_from_relaxed_dot(<16 x i8> %a, <16 x i8> %b, <4 x i32> %c) {
+; CHECK-LABEL: relaxed_dot_add_from_relaxed_dot:
+; CHECK: .functype relaxed_dot_add_from_relaxed_dot (v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s $push0=, $0, $1, $2
+; CHECK-NEXT: return $pop0
+ %relaxed_dot_call = call <8 x i16> @llvm.wasm.relaxed.dot.i8x16.i7x16.signed(<16 x i8> %a, <16 x i8> %b)
+ %ext = call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> %relaxed_dot_call)
+ %res = add <4 x i32> %ext, %c
+ ret <4 x i32> %res
+}
+
+; INFO: Negative test: zero-extended inputs must not select relaxed_dot
+define <8 x i16> @relaxed_dot_zext(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_zext:
+; CHECK: .functype relaxed_dot_zext (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.extmul_low_i8x16_u $push6=, $0, $1
+; CHECK-NEXT: local.tee $push5=, $2=, $pop6
+; CHECK-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1
+; CHECK-NEXT: local.tee $push3=, $1=, $pop4
+; CHECK-NEXT: i8x16.shuffle $push1=, $pop5, $pop3, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+; CHECK-NEXT: i8x16.shuffle $push0=, $2, $1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0
+; CHECK-NEXT: return $pop2
+ %zext1 = zext <16 x i8> %a to <16 x i16>
+ %zext2 = zext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %zext1, %zext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
+
+; INFO: Negative test: splitting into low and high halves is not the even/odd pairwise pattern
+define <8 x i16> @relaxed_dot_wrong_shuffle(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_wrong_shuffle:
+; CHECK: .functype relaxed_dot_wrong_shuffle (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.extmul_low_i8x16_s $push1=, $0, $1
+; CHECK-NEXT: i16x8.extmul_high_i8x16_s $push0=, $0, $1
+; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0
+; CHECK-NEXT: return $pop2
+ %sext1 = sext <16 x i8> %a to <16 x i16>
+ %sext2 = sext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %sext1, %sext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
diff --git a/llvm/test/CodeGen/X86/cpus-intel.ll b/llvm/test/CodeGen/X86/cpus-intel.ll
index 71253c8..646629d 100644
--- a/llvm/test/CodeGen/X86/cpus-intel.ll
+++ b/llvm/test/CodeGen/X86/cpus-intel.ll
@@ -39,6 +39,7 @@
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
@@ -106,6 +107,7 @@
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll
index c2b7068..df04b67 100644
--- a/llvm/test/CodeGen/X86/isel-fpclass.ll
+++ b/llvm/test/CodeGen/X86/isel-fpclass.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86,X86-SDAGISEL
+; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL
; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL
; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL
-; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X86,X86-GISEL
-; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X64-GISEL
+; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X64,X64-GISEL
define i1 @isnone_f(float %x) nounwind {
; X86-LABEL: isnone_f:
@@ -23,11 +23,6 @@ define i1 @isnone_f(float %x) nounwind {
; X86-FASTISEL-NEXT: fstp %st(0)
; X86-FASTISEL-NEXT: xorl %eax, %eax
; X86-FASTISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isnone_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %eax, %eax
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0)
ret i1 %0
@@ -50,27 +45,22 @@ define i1 @isany_f(float %x) nounwind {
; X86-FASTISEL-NEXT: fstp %st(0)
; X86-FASTISEL-NEXT: movb $1, %al
; X86-FASTISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isany_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movb $1, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023)
ret i1 %0
}
define i1 @issignaling_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: issignaling_f:
-; X86-SDAGISEL: # %bb.0:
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-SDAGISEL-NEXT: setl %cl
-; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
-; X86-SDAGISEL-NEXT: setge %al
-; X86-SDAGISEL-NEXT: andb %cl, %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: issignaling_f:
+; X86: # %bb.0:
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT: setl %cl
+; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT: setge %al
+; X86-NEXT: andb %cl, %al
+; X86-NEXT: retl
;
; X64-LABEL: issignaling_f:
; X64: # %bb.0:
@@ -97,44 +87,18 @@ define i1 @issignaling_f(float %x) nounwind {
; X86-FASTISEL-NEXT: andb %cl, %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: issignaling_f:
-; X86-GISEL: # %bb.0:
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %dl
-; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-GISEL-NEXT: setb %al
-; X86-GISEL-NEXT: andb %dl, %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: issignaling_f:
-; X64-GISEL: # %bb.0:
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %dl
-; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X64-GISEL-NEXT: setb %al
-; X64-GISEL-NEXT: andb %dl, %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
%a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan"
ret i1 %a0
}
define i1 @isquiet_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: isquiet_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-SDAGISEL-NEXT: setge %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isquiet_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT: setge %al
+; X86-NEXT: retl
;
; X64-LABEL: isquiet_f:
; X64: # %bb.0: # %entry
@@ -155,39 +119,19 @@ define i1 @issignaling_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setge %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: isquiet_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-GISEL-NEXT: setae %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isquiet_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X64-GISEL-NEXT: setae %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan"
ret i1 %0
}
define i1 @not_isquiet_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: not_isquiet_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-SDAGISEL-NEXT: setl %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: not_isquiet_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT: setl %al
+; X86-NEXT: retl
;
; X64-LABEL: not_isquiet_f:
; X64: # %bb.0: # %entry
@@ -208,57 +152,19 @@ define i1 @not_isquiet_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setl %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: not_isquiet_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %dl
-; X86-GISEL-NEXT: orb %cl, %dl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %cl
-; X86-GISEL-NEXT: orb %dl, %cl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %dl
-; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X86-GISEL-NEXT: setb %al
-; X86-GISEL-NEXT: andb %dl, %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: not_isquiet_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %dl
-; X64-GISEL-NEXT: orb %cl, %dl
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %cl
-; X64-GISEL-NEXT: orb %dl, %cl
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %dl
-; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
-; X64-GISEL-NEXT: setb %al
-; X64-GISEL-NEXT: andb %dl, %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan"
ret i1 %0
}
define i1 @isinf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: isinf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: sete %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isinf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: sete %al
+; X86-NEXT: retl
;
; X64-LABEL: isinf_f:
; X64: # %bb.0: # %entry
@@ -279,39 +185,19 @@ define i1 @isinf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: sete %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: isinf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isinf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf"
ret i1 %0
}
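Each of these lowerings reduces llvm.is.fpclass to integer compares on the float's bit pattern: mask off the sign with 0x7FFFFFFF, then compare against the all-ones exponent 0x7F800000 (the inf boundary) or 0x7FC00000 (the quiet-NaN boundary). A rough C equivalent of the inf test checked just above (an illustration of the emitted pattern, not LLVM source):

#include <stdint.h>
#include <string.h>

/* Mirrors the isinf_f CHECK lines: andl $0x7FFFFFFF, then an
   equality compare against the exponent-all-ones pattern. */
static int is_inf_f32(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof bits); /* safe type-pun */
  return (bits & 0x7FFFFFFFu) == 0x7F800000u;
}

The qnan and snan variants in the neighboring functions reuse the same masked value with ordered compares against 0x7FC00000 instead of an equality test.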
define i1 @not_isinf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: not_isinf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setne %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: not_isinf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: setne %al
+; X86-NEXT: retl
;
; X64-LABEL: not_isinf_f:
; X64: # %bb.0: # %entry
@@ -332,43 +218,17 @@ define i1 @not_isinf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setne %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: not_isinf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %dl
-; X86-GISEL-NEXT: orb %cl, %dl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %al
-; X86-GISEL-NEXT: orb %dl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: not_isinf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %dl
-; X64-GISEL-NEXT: orb %cl, %dl
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %al
-; X64-GISEL-NEXT: orb %dl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf"
ret i1 %0
}
define i1 @is_plus_inf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: is_plus_inf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: sete %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: is_plus_inf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT: sete %al
+; X86-NEXT: retl
;
; X64-LABEL: is_plus_inf_f:
; X64: # %bb.0: # %entry
@@ -386,34 +246,17 @@ define i1 @is_plus_inf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: sete %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: is_plus_inf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: is_plus_inf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf"
ret i1 %0
}
define i1 @is_minus_inf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: is_minus_inf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; X86-SDAGISEL-NEXT: sete %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: is_minus_inf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-NEXT: sete %al
+; X86-NEXT: retl
;
; X64-LABEL: is_minus_inf_f:
; X64: # %bb.0: # %entry
@@ -431,34 +274,17 @@ define i1 @is_minus_inf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: sete %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: is_minus_inf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; X86-GISEL-NEXT: sete %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: is_minus_inf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000
-; X64-GISEL-NEXT: sete %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf"
ret i1 %0
}
define i1 @not_is_minus_inf_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: not_is_minus_inf_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; X86-SDAGISEL-NEXT: setne %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: not_is_minus_inf_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-NEXT: setne %al
+; X86-NEXT: retl
;
; X64-LABEL: not_is_minus_inf_f:
; X64: # %bb.0: # %entry
@@ -476,55 +302,19 @@ define i1 @not_is_minus_inf_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setne %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: not_is_minus_inf_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: pushl %ebx
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: movl %eax, %ecx
-; X86-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %edx, %edx
-; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %bl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %ah
-; X86-GISEL-NEXT: orb %dl, %ah
-; X86-GISEL-NEXT: orb %bl, %ah
-; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %al
-; X86-GISEL-NEXT: orb %ah, %al
-; X86-GISEL-NEXT: popl %ebx
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: not_is_minus_inf_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: movl %eax, %ecx
-; X64-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %edx, %edx
-; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %sil
-; X64-GISEL-NEXT: orb %dl, %sil
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %dl
-; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %al
-; X64-GISEL-NEXT: orb %dl, %al
-; X64-GISEL-NEXT: orb %sil, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf"
ret i1 %0
}
define i1 @isfinite_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: isfinite_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setl %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: isfinite_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: setl %al
+; X86-NEXT: retl
;
; X64-LABEL: isfinite_f:
; X64: # %bb.0: # %entry
@@ -545,39 +335,19 @@ define i1 @isfinite_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setl %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: isfinite_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isfinite_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite"
ret i1 %0
}
define i1 @not_isfinite_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: not_isfinite_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setge %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: not_isfinite_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT: setge %al
+; X86-NEXT: retl
;
; X64-LABEL: not_isfinite_f:
; X64: # %bb.0: # %entry
@@ -598,43 +368,17 @@ define i1 @not_isfinite_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setge %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: not_isfinite_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: sete %dl
-; X86-GISEL-NEXT: orb %cl, %dl
-; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X86-GISEL-NEXT: seta %al
-; X86-GISEL-NEXT: orb %dl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: not_isfinite_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: sete %dl
-; X64-GISEL-NEXT: orb %cl, %dl
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: seta %al
-; X64-GISEL-NEXT: orb %dl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite"
ret i1 %0
}
define i1 @is_plus_finite_f(float %x) nounwind {
-; X86-SDAGISEL-LABEL: is_plus_finite_f:
-; X86-SDAGISEL: # %bb.0: # %entry
-; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-SDAGISEL-NEXT: setb %al
-; X86-SDAGISEL-NEXT: retl
+; X86-LABEL: is_plus_finite_f:
+; X86: # %bb.0: # %entry
+; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT: setb %al
+; X86-NEXT: retl
;
; X64-LABEL: is_plus_finite_f:
; X64: # %bb.0: # %entry
@@ -652,23 +396,6 @@ define i1 @is_plus_finite_f(float %x) nounwind {
; X86-FASTISEL-NEXT: setb %al
; X86-FASTISEL-NEXT: popl %ecx
; X86-FASTISEL-NEXT: retl
-;
-; X86-GISEL-LABEL: is_plus_finite_f:
-; X86-GISEL: # %bb.0: # %entry
-; X86-GISEL-NEXT: xorl %ecx, %ecx
-; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; X86-GISEL-NEXT: setb %al
-; X86-GISEL-NEXT: orb %cl, %al
-; X86-GISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: is_plus_finite_f:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %ecx, %ecx
-; X64-GISEL-NEXT: movd %xmm0, %eax
-; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
-; X64-GISEL-NEXT: setb %al
-; X64-GISEL-NEXT: orb %cl, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite"
ret i1 %0
@@ -691,11 +418,6 @@ define i1 @isnone_d(double %x) nounwind {
; X86-FASTISEL-NEXT: fstp %st(0)
; X86-FASTISEL-NEXT: xorl %eax, %eax
; X86-FASTISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isnone_d:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: xorl %eax, %eax
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 0)
ret i1 %0
@@ -718,11 +440,6 @@ define i1 @isany_d(double %x) nounwind {
; X86-FASTISEL-NEXT: fstp %st(0)
; X86-FASTISEL-NEXT: movb $1, %al
; X86-FASTISEL-NEXT: retl
-;
-; X64-GISEL-LABEL: isany_d:
-; X64-GISEL: # %bb.0: # %entry
-; X64-GISEL-NEXT: movb $1, %al
-; X64-GISEL-NEXT: retq
entry:
%0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1023)
ret i1 %0
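
For reference, the retained X86/X64 checks encode llvm.is.fpclass as IEEE-754 bit tests on the float's integer payload; the boundary constants come straight from the assembly above:

; +inf   : bits == 0x7F800000
; -inf   : bits == 0xFF800000
; finite : (bits & 0x7FFFFFFF) < 0x7F800000   ; clear the sign, then compare
;                                             ; against the all-ones exponent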
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index d752659..04f0a65 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-NOVBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-VBMI
; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
@@ -883,6 +883,30 @@ define <16 x i16> @test_16f32tosb_512(ptr %ptr, <16 x i16> %passthru) "min-legal
}
define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="256" {
+; CHECK-SKX-NOVBMI-LABEL: mul256:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm3, %ymm5
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm1, %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
+; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm2, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm0, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4)
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx)
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-SKX-NOVBMI-NEXT: vzeroupper
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: mul256:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm0
@@ -960,6 +984,21 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
}
define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="512" {
+; CHECK-SKX-NOVBMI-LABEL: mul512:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
+; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
+; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %zmm0, %zmm0
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx)
+; CHECK-SKX-NOVBMI-NEXT: vzeroupper
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: mul512:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
@@ -1137,6 +1176,14 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(ptr %x) nounwind "min-legal-vector
}
define <32 x i8> @trunc_v32i16_v32i8_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" {
+; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_zeroes:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1
@@ -1192,6 +1239,14 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(ptr %x) nounwind "min-legal-vector-w
}
define <32 x i8> @trunc_v32i16_v32i8_sign(ptr %x) nounwind "min-legal-vector-width"="256" {
+; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_sign:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1
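
For context on the new CHECK-SKX-NOVBMI blocks: without VBMI the i8 multiplies are lowered through vpmaddubsw on masked halves. Roughly, per vector (a sketch reconstructed from the checks above, not from the commit message):

; even16 = pmaddubsw(a, b & 0x00FF)       ; even-byte products, one per word
; odd16  = pmaddubsw(a, b & 0xFF00) << 8  ; odd-byte products, shifted into
;                                         ; the high byte of each word
; result = odd16 | (even16 & 0x00FF)      ; merged by a single vpternlogq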
diff --git a/llvm/test/DebugInfo/AArch64/callsite.mir b/llvm/test/DebugInfo/AArch64/callsite.mir
new file mode 100644
index 0000000..e3bd764
--- /dev/null
+++ b/llvm/test/DebugInfo/AArch64/callsite.mir
@@ -0,0 +1,68 @@
+# This test should not crash when generating call-site information.
+# It was created to make sure that if isCopyLikeInstr in TargetInstrInfo.h
+# returns an undef Dest Reg or Src Reg, we don't try to get a SubReg for it.
+
+# RUN: llc --mtriple=arm64e-apple-ios -start-before=aarch64-asm-printer %s -filetype=obj -o /dev/null --emit-call-site-info
+--- |
+ %struct.rtyuio = type { i8 }
+ define noundef i32 @aserty(ptr noundef %0, ptr noundef %1) local_unnamed_addr #0 !dbg !23 {
+ ret i32 0
+ }
+ define void @asdfgh(ptr noundef %0, ptr noundef %1, i8 noundef zeroext %2) local_unnamed_addr #0 !dbg !53 {
+ %4 = alloca ptr
+ %5 = call ptr @llvm.stackguard()
+ %6 = alloca %struct.rtyuio
+ %7 = icmp eq ptr %1, null
+ br i1 %7, label %10, label %8
+ %9 = tail call i8 @polkiokl(ptr noundef %0) #6
+ br label %10
+ ret void
+ }
+ declare i8 @polkiokl(ptr noundef) local_unnamed_addr #2
+ !llvm.module.flags = !{!2, !8}
+ !llvm.dbg.cu = !{!9}
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
+ !8 = !{i32 7, !"frame-pointer", i32 1}
+ !9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_11, file: !10, emissionKind: FullDebug, sysroot: "/")
+ !10 = !DIFile(filename: "a.cpp", directory: "/")
+ !23 = distinct !DISubprogram(type: !27, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, unit: !9, retainedNodes: !46)
+ !24 = distinct !DICompositeType(tag: DW_TAG_class_type, identifier: "yshscbshhdvcm")
+ !27 = !DISubroutineType(types: !28)
+ !28 = !{}
+ !30 = !DIDerivedType(tag: DW_TAG_typedef, baseType: !33)
+ !33 = distinct !DICompositeType(tag: DW_TAG_structure_type, identifier: "tyruwyeuiwiybabd")
+ !36 = !DISubroutineType(types: !37)
+ !37 = !{}
+ !46 = !{}
+ !47 = !DILocalVariable(scope: !23, type: !48, flags: DIFlagArtificial | DIFlagObjectPointer)
+ !48 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !24, size: 64)
+ !49 = !DILocalVariable(scope: !23, type: !30)
+ !50 = !DILocation(scope: !23)
+ !51 = !DILocation(scope: !23)
+ !53 = distinct !DISubprogram(type: !36, unit: !9, retainedNodes: !54)
+ !54 = !{}
+name: aserty
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:
+ - { bb: 0, offset: 9, fwdArgRegs:
+ - { arg: 2, reg: '$w2' } }
+body: |
+ bb.0 (%ir-block.2):
+ DBG_VALUE $x0, $noreg, !47, !DIExpression(), debug-location !50
+ DBG_VALUE $x1, $noreg, !49, !DIExpression(), debug-location !50
+ frame-setup PACIBSP implicit-def $lr, implicit killed $lr, implicit $sp
+ early-clobber $sp = frame-setup STPXpre $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.1), (store (s64) into %stack.0)
+ $fp = frame-setup ADDXri $sp, 0, 0
+ frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+ frame-setup CFI_INSTRUCTION offset $w30, -8
+ frame-setup CFI_INSTRUCTION offset $w29, -16
+ $x2 = ORRXrs $xzr, undef $noreg, 0, implicit $wzr, debug-location !51
+ BL @asdfgh, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, implicit killed $w2, implicit-def $sp, debug-location !51
+...
+name: asdfgh
+body: |
+ bb.2 (%ir-block.10):
diff --git a/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll b/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll
index 3ed68e8..c3a75f6 100644
--- a/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll
+++ b/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll
@@ -13,14 +13,14 @@
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
-; DUMP: Callsite Context Graph:
+; RUN: -print-before=memprof-context-disambiguation \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR
-; RUN: llvm-dis %t.out.0.0.preopt.bc -o - | FileCheck %s --check-prefix=IR
; IR: !memprof {{.*}} !callsite
; IR: "memprof"="cold"
+; DUMP: Callsite Context Graph:
+
;; Next check without -supports-hot-cold-new, we should not perform
;; context disambiguation, and we should strip memprof metadata and
;; attributes before optimization.
@@ -28,13 +28,16 @@
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
+; RUN: -print-before=memprof-context-disambiguation \
; RUN: -o %t.out 2>&1 | FileCheck %s --allow-empty \
-; RUN: --implicit-check-not "Callsite Context Graph:"
+; RUN: --implicit-check-not "Callsite Context Graph:" \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
-; RUN: llvm-dis %t.out.0.0.preopt.bc -o - | FileCheck %s \
-; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
-; RUN: --implicit-check-not "memprof"="cold"
+;; Ensure the attributes and metadata are stripped when running a non-LTO pipeline.
+; RUN: opt -O3 %t.o -S | FileCheck %s \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
source_filename = "memprof-supports-hot-cold-new.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
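
This RUN-line refactor (repeated in the ThinLTO variant further below) follows one pattern: instead of saving bitcode with -save-temps and disassembling it in a second llvm-dis step, the IR is printed to stderr via -print-before and checked by the same FileCheck invocation:

; old: ... -save-temps -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
;      llvm-dis %t.out.<stage>.bc -o - | FileCheck %s --check-prefix=IR
; new: ... -print-before=memprof-context-disambiguation -o %t.out 2>&1 \
;        | FileCheck %s --check-prefix=DUMP --check-prefix=IR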
diff --git a/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt b/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt
index 4ec534f..fa40fe6 100644
--- a/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt
@@ -17,6 +17,10 @@
# INTEL: pushp r16
0xd5,0x18,0x50
+# ATT: pushq %r16
+# INTEL: push r16
+0xd5,0x10,0x50
+
# ATT: popp %rax
# INTEL: popp rax
0xd5,0x08,0x58
@@ -32,3 +36,7 @@
# ATT: popp %r16
# INTEL: popp r16
0xd5,0x18,0x58
+
+# ATT: popq %r16
+# INTEL: pop r16
+0xd5,0x10,0x58
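
The added encodings differ from the existing PPX forms only in the REX2 payload's W bit. Assuming the documented APX REX2 payload layout (bits 7..0 = M0 R4 X4 B4 W R3 X3 B3), the bytes decode as:

# 0xd5,0x08,0x50 -> W=1        : pushp %rax  (PPX-hinted push)
# 0xd5,0x18,0x50 -> W=1, B4=1  : pushp %r16
# 0xd5,0x10,0x50 ->       B4=1 : pushq %r16  (plain push of an extended GPR)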
diff --git a/llvm/test/MC/X86/apx/pushp-popp-att.s b/llvm/test/MC/X86/apx/pushp-popp-att.s
index a810744..d638034 100644
--- a/llvm/test/MC/X86/apx/pushp-popp-att.s
+++ b/llvm/test/MC/X86/apx/pushp-popp-att.s
@@ -1,7 +1,7 @@
# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
-# ERROR-COUNT-8: error:
+# ERROR-COUNT-10: error:
# ERROR-NOT: error:
# CHECK: pushp %rax
@@ -16,6 +16,9 @@
# CHECK: pushp %r16
# CHECK: encoding: [0xd5,0x18,0x50]
pushp %r16
+# CHECK: pushq %r16
+# CHECK: encoding: [0xd5,0x10,0x50]
+ pushq %r16
# CHECK: popp %rax
# CHECK: encoding: [0xd5,0x08,0x58]
@@ -29,3 +32,6 @@
# CHECK: popp %r16
# CHECK: encoding: [0xd5,0x18,0x58]
popp %r16
+# CHECK: popq %r16
+# CHECK: encoding: [0xd5,0x10,0x58]
+ popq %r16
diff --git a/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll b/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll
index 7a4d860..fe2a002 100644
--- a/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll
+++ b/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll
@@ -17,11 +17,12 @@
; RUN: -r=%t/foo.o,foo,plx \
; RUN: -r=%t/foo.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -print-before=memprof-context-disambiguation \
+; RUN: -thinlto-threads=1 \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR
+
; DUMP: Callsite Context Graph:
-; RUN: llvm-dis %t.out.1.3.import.bc -o - | FileCheck %s --check-prefix=IR
; IR: @main()
; IR: !memprof {{.*}} !callsite
; IR: @_Znam(i64 0) #[[ATTR:[0-9]+]]
@@ -41,13 +42,12 @@
; RUN: -r=%t/foo.o,foo,plx \
; RUN: -r=%t/foo.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
+; RUN: -print-before=memprof-context-disambiguation \
+; RUN: -thinlto-threads=1 \
; RUN: -o %t.out 2>&1 | FileCheck %s --allow-empty \
-; RUN: --implicit-check-not "Callsite Context Graph:"
-
-; RUN: llvm-dis %t.out.1.3.import.bc -o - | FileCheck %s \
-; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
-; RUN: --implicit-check-not "memprof"="cold"
+; RUN: --implicit-check-not "Callsite Context Graph:" \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
;--- main.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Transforms/GVN/PRE/pre-load.ll b/llvm/test/Transforms/GVN/PRE/pre-load.ll
index 5a07f9f..afa1354 100644
--- a/llvm/test/Transforms/GVN/PRE/pre-load.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-load.ll
@@ -1503,3 +1503,51 @@ wrong:
exit:
ret void
}
+
+; Allow the load to be made available on the (%entry, %if.end) edge as part of
+; PRE, but ensure `%identical.l` is not hoisted to its predecessor because of
+; its local dependency on the call (a distilled sketch follows the test).
+
+define i32 @test24(ptr noalias %p, ptr noalias %q, i1 %c) {
+; MDEP-LABEL: @test24(
+; MDEP-NEXT: entry:
+; MDEP-NEXT: br i1 [[C:%.*]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]], label [[IF_THEN:%.*]]
+; MDEP: entry.if.end_crit_edge:
+; MDEP-NEXT: [[VV_PRE:%.*]] = load i32, ptr [[X:%.*]], align 4
+; MDEP-NEXT: br label [[IF_END:%.*]]
+; MDEP: if.then:
+; MDEP-NEXT: call void @opaque(ptr [[X]])
+; MDEP-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4
+; MDEP-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4
+; MDEP-NEXT: br label [[IF_END]]
+; MDEP: if.end:
+; MDEP-NEXT: [[VV:%.*]] = phi i32 [ [[VV_PRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[UU]], [[IF_THEN]] ]
+; MDEP-NEXT: ret i32 [[VV]]
+;
+; MSSA-LABEL: @test24(
+; MSSA-NEXT: entry:
+; MSSA-NEXT: br i1 [[C:%.*]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; MSSA: if.then:
+; MSSA-NEXT: call void @opaque(ptr [[X:%.*]])
+; MSSA-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4
+; MSSA-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4
+; MSSA-NEXT: br label [[IF_END]]
+; MSSA: if.end:
+; MSSA-NEXT: [[VV:%.*]] = load i32, ptr [[X]], align 4
+; MSSA-NEXT: ret i32 [[VV]]
+;
+entry:
+ br i1 %c, label %if.end, label %if.then
+
+if.then:
+ call void @opaque(ptr %p)
+ %identical.l = load i32, ptr %p, align 4
+ store i32 %identical.l, ptr %q, align 4
+ br label %if.end
+
+if.end:
+ %l = load i32, ptr %p, align 4
+ ret i32 %l
+}
+
+declare void @opaque(ptr) nounwind willreturn
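
A distilled sketch of the shape the MDEP checks pin down, with illustrative names (@pre_sketch is not part of the test):

define i32 @pre_sketch(ptr %p, i1 %c) {
entry:
  br i1 %c, label %join, label %then
then:                                ; the load here may not move: in the
  %t = load i32, ptr %p, align 4     ; real test it is ordered after a
  br label %join                     ; clobbering call in the same block
join:
  %v = load i32, ptr %p, align 4     ; partially redundant load
  ret i32 %v
}
; PRE splits the (%entry, %join) critical edge, loads there, and replaces
; %v with a phi of the edge load and %t.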
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll
index cb4e07e..9b9bc68 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll
@@ -60,8 +60,7 @@ define void @f_sadd_overflow(ptr %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 2147483645, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 2147483647
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
@@ -150,8 +149,7 @@ define void @f_uadd_overflow(ptr %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -6, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], -1
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
@@ -243,10 +241,7 @@ define void @f_ssub_overflow(ptr nocapture %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -2147483642, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[TMP0:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[TMP0]], i32 1)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
-; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
@@ -339,10 +334,7 @@ define void @f_usub_overflow(ptr nocapture %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 15, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[TMP0]], i32 1)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
-; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
diff --git a/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll
new file mode 100644
index 0000000..b9c9228
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll
@@ -0,0 +1,738 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=indvars < %s | FileCheck %s
+
+define void @optimize_trap(i32 %block_size) {
+; CHECK-LABEL: define void @optimize_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_atomic(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_atomic(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store atomic i8 [[TMP4]], ptr [[ARRAYIDX7]] unordered, align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store atomic i8 %1, ptr %arrayidx7 unordered, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_volatile(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_volatile(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store volatile i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_call(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_call(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: call void @x(ptr null)
+; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ call void @x(ptr null)
+ store volatile i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @optimize_ubsan_trap(i32 %block_size) {
+; CHECK-LABEL: define void @optimize_ubsan_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.ubsantrap(i8 1)
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.ubsantrap(i8 1)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_arbitrary_call(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_arbitrary_call(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @noreturn_with_argmem(ptr [[FOO_ARR]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @noreturn_with_argmem(ptr %foo_arr)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_two_exits(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_two_exits(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[P:%.*]] = call i1 @pred()
+; CHECK-NEXT: br i1 [[P]], label %[[FOR_BODY_CONT:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: [[FOR_BODY_CONT]]:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body, %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %p = call i1 @pred()
+ br i1 %p, label %for.body.cont, label %for.cond.cleanup.loopexit
+
+for.body.cont: ; preds = %for.body
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body.cont
+ call void @noreturn(ptr %foo_arr)
+ unreachable
+
+if.end4: ; preds = %for.body.cont
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_two_exits2(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_two_exits2(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_BODY_CONT:.*]]
+; CHECK: [[FOR_BODY_CONT]]:
+; CHECK-NEXT: [[P:%.*]] = call i1 @pred()
+; CHECK-NEXT: br i1 [[P]], label %[[IF_END4]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body.cont, %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %for.body.cont
+
+for.body.cont: ; preds = %for.body
+ %p = call i1 @pred()
+ br i1 %p, label %if.end4, label %for.cond.cleanup.loopexit
+
+if.then: ; preds = %for.body
+ call void @noreturn(ptr %foo_arr)
+ unreachable
+
+if.end4: ; preds = %for.body.cont
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_dependent_ubsan_trap(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_dependent_ubsan_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[I_015_LCSSA:%.*]] = phi i32 [ [[I_015]], %[[FOR_BODY]] ]
+; CHECK-NEXT: call void @noreturn_with_i32(i32 [[I_015_LCSSA]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @noreturn_with_i32(i32 %i.015)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_dependent_load_trap(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_dependent_load_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[I_015_LCSSA:%.*]] = load i8, ptr [[FOO_ARR]], align 1
+; CHECK-NEXT: call void @noreturn_with_i8(i8 [[I_015_LCSSA]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ %r = load i8, ptr %foo_arr, align 1
+ call void @noreturn_with_i8(i8 %r)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+
+declare void @x(ptr noundef) local_unnamed_addr
+declare i1 @pred() local_unnamed_addr
+
+declare void @llvm.trap() #0
+declare void @noreturn(ptr) #0
+declare void @noreturn_with_i32(i32) #0
+declare void @noreturn_with_i8(i8) #0
+declare void @noreturn_with_argmem(ptr) #1
+
+attributes #0 = { cold noreturn nounwind memory(inaccessiblemem: write) }
+attributes #1 = { cold noreturn nounwind memory(argmem: read) }
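+; Note on the attributes: callees tagged #0 (memory(inaccessiblemem: write))
+; may only write memory not visible to the caller, so they cannot observe the
+; allocas; callees tagged #1 (memory(argmem: read)) may read the memory their
+; pointer argument points to, presumably to exercise the case where the
+; noreturn callee can inspect its argument.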
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index ed9fba3..22ab79d 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -289,6 +289,225 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l
ret void
}
+define void @deinterleave1_nxi64_factor3(ptr %ptr, <vscale x 4 x i64>* %s1, <vscale x 4 x i64>* %s2, <vscale x 4 x i64>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave1_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2)
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2)
+; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } poison, <vscale x 4 x i64> [[TMP10]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP15]], <vscale x 4 x i64> [[TMP12]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP16]], <vscale x 4 x i64> [[TMP14]], 2
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 1
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 2
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP18]], ptr [[S1]], align 32
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP19]], ptr [[S2]], align 32
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP20]], ptr [[S3]], align 32
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 12 x i64>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64> %wide.vec)
+
+ %3 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 0
+ %4 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 1
+ %5 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 2
+
+ store <vscale x 4 x i64> %3, <vscale x 4 x i64>* %s1
+ store <vscale x 4 x i64> %4, <vscale x 4 x i64>* %s2
+ store <vscale x 4 x i64> %5, <vscale x 4 x i64>* %s3
+ ret void
+}
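+; A minimal reading of the CHECK lines above: an nxv4i64 field is wider than
+; the legal SVE part type (nxv2i64), so the InterleavedAccess pass legalizes
+; the factor-3 deinterleave into two ld3 calls, at offsets 0 and 3 in units of
+; <vscale x 2 x i64>, and rebuilds each field with llvm.vector.insert at
+; element offsets 0 and 2.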
+
+define void @deinterleave2_nxi64_factor3(ptr %ptr, <vscale x 8 x i64>* %s1, <vscale x 8 x i64>* %s2, <vscale x 8 x i64>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave2_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2)
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2)
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[LDN3:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP16]], i64 4)
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP18]], i64 4)
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 2
+; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP20]], i64 4)
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9
+; CHECK-NEXT: [[LDN4:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP22]])
+; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP17]], <vscale x 2 x i64> [[TMP23]], i64 6)
+; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 1
+; CHECK-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP19]], <vscale x 2 x i64> [[TMP25]], i64 6)
+; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 2
+; CHECK-NEXT: [[TMP28:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP21]], <vscale x 2 x i64> [[TMP27]], i64 6)
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } poison, <vscale x 8 x i64> [[TMP24]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP29]], <vscale x 8 x i64> [[TMP26]], 1
+; CHECK-NEXT: [[TMP31:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP30]], <vscale x 8 x i64> [[TMP28]], 2
+; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 0
+; CHECK-NEXT: [[TMP33:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 1
+; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 2
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP32]], ptr [[S1]], align 64
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP33]], ptr [[S2]], align 64
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP34]], ptr [[S3]], align 64
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 24 x i64>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64> %wide.vec)
+
+ %3 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 0
+ %4 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 1
+ %5 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 2
+
+ store <vscale x 8 x i64> %3, <vscale x 8 x i64>* %s1
+ store <vscale x 8 x i64> %4, <vscale x 8 x i64>* %s2
+ store <vscale x 8 x i64> %5, <vscale x 8 x i64>* %s3
+ ret void
+}
+
+define void @deinterleave_neg1_nxi64_factor3(ptr %ptr, <vscale x 1 x i64>* %s1, <vscale x 1 x i64>* %s2, <vscale x 1 x i64>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave_neg1_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 3 x i64>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 2
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 3 x i64>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> %wide.vec)
+
+ %3 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 0
+ %4 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 1
+ %5 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 2
+
+ store <vscale x 1 x i64> %3, <vscale x 1 x i64>* %s1
+ store <vscale x 1 x i64> %4, <vscale x 1 x i64>* %s2
+ store <vscale x 1 x i64> %5, <vscale x 1 x i64>* %s3
+ ret void
+}
+
+define void @deinterleave_neg2_nxi8_factor3(ptr %ptr, <vscale x 8 x i8>* %s1, <vscale x 8 x i8>* %s2, <vscale x 8 x i8>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave_neg2_nxi8_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 24 x i8>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 2
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 24 x i8>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> %wide.vec)
+
+ %3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 0
+ %4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 1
+ %5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 2
+
+ store <vscale x 8 x i8> %3, <vscale x 8 x i8>* %s1
+ store <vscale x 8 x i8> %4, <vscale x 8 x i8>* %s2
+ store <vscale x 8 x i8> %5, <vscale x 8 x i8>* %s3
+ ret void
+}
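+; The two _neg tests above are expected to be left alone: nxv1i64 and nxv8i8
+; fields are narrower than a full SVE part, so (presumably for that reason)
+; no ld3 lowering applies and the CHECK lines match the input IR unchanged.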
+
+define void @interleave1_nxi64_factor3(ptr %ptr, <vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3) #0 {
+; CHECK-LABEL: define void @interleave1_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i64> [[S1:%.*]], <vscale x 8 x i64> [[S2:%.*]], <vscale x 8 x i64> [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 2)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 2)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]])
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 4)
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 4)
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 4)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP9]])
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 6)
+; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 6)
+; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 6)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP13]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3)
+
+ store <vscale x 24 x i64> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave2_nxi64_factor3(ptr %ptr, <vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3) #0 {
+; CHECK-LABEL: define void @interleave2_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x i64> [[S1:%.*]], <vscale x 4 x i64> [[S2:%.*]], <vscale x 4 x i64> [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 2)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 2)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3)
+
+ store <vscale x 12 x i64> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave_neg_nxi8_factor3(ptr %ptr, <vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3) #0 {
+; CHECK-LABEL: define void @interleave_neg_nxi8_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i8> [[S1:%.*]], <vscale x 8 x i8> [[S2:%.*]], <vscale x 8 x i8> [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INTERLEAVE:%.*]] = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> [[S1]], <vscale x 8 x i8> [[S2]], <vscale x 8 x i8> [[S3]])
+; CHECK-NEXT: store <vscale x 24 x i8> [[INTERLEAVE]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3)
+
+ store <vscale x 24 x i8> %interleave, ptr %ptr, align 4
+ ret void
+}
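+; The store side mirrors the loads: interleave3 with nxv8i64 fields splits
+; into four st3 calls and nxv4i64 into two, each over <vscale x 2 x i64>
+; chunks, while the nxv8i8 negative case keeps the generic interleave3 plus
+; a single wide store.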
+
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
@@ -312,4 +531,15 @@ declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <
; Larger interleaves to test 'legalization'
declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+; Deinterleaves with Factor=3
+declare { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64>)
+declare { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64>)
+declare { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8>)
+
+; Interleaves with Factor=3
+declare <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>)
+declare <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64>)
+declare <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64>)
+
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 6cf11be..6fe6883 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -660,16 +660,17 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]]
+; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT: br label %[[EXIT1]]
+; COMMON-NEXT: br label %[[EXIT]]
+; COMMON: [[EXIT]]:
+; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
+; COMMON: [[SCALAR_PH]]:
+; COMMON-NEXT: br label %[[EXIT1:.*]]
; COMMON: [[EXIT1]]:
-; COMMON-NEXT: br label %[[SCALAR_PH1:.*]]
-; COMMON: [[SCALAR_PH1]]:
-; COMMON-NEXT: br [[EXIT:label %.*]]
-; COMMON: [[SCALAR_PH:.*:]]
+; COMMON-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
index 93e71af..e3e4833 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -42,7 +42,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -80,7 +80,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]])
; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
@@ -104,7 +104,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
@@ -167,13 +167,13 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]])
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -211,7 +211,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]])
; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
@@ -235,7 +235,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
@@ -308,7 +308,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
; CHECK-NEXT: store i64 0, ptr [[L]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -332,7 +332,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
; CHECK-NEXT: store i64 0, ptr [[L]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2
; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index e424649..75b18ff 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -541,3 +541,22 @@ exit: ; preds = %for.body
; CHECK-VS1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-VS1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-VS1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VS1: [[PROF3]] = !{!"branch_weights", i32 8, i32 8}
+; CHECK-VS1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VS1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK-VS1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-VS1: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; CHECK-VS1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-VS1: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
+;.
+; CHECK-VS2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VS2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VS2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VS2: [[PROF3]] = !{!"branch_weights", i32 8, i32 8}
+; CHECK-VS2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VS2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK-VS2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-VS2: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; CHECK-VS2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-VS2: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
index a6e0f8a..300f5d9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
@@ -40,6 +40,7 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
; CHECK-ARMPL: [[ENTRY:.*:]]
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
@@ -53,6 +54,15 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
+; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
+; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
+; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
+; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
+; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY1:.*:]]
; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
@@ -262,6 +272,7 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias
; CHECK-ARMPL: [[ENTRY:.*:]]
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
@@ -275,6 +286,15 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.modf.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
+; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
+; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
+; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
+; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
+; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY1:.*:]]
; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.modf.f32(float [[IN_VAL:%.*]])
; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
@@ -412,6 +432,7 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
; CHECK-ARMPL: [[ENTRY:.*:]]
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
@@ -425,6 +446,15 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
+; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
+; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
+; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
+; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
+; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY1:.*:]]
; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[IN_VAL:%.*]])
; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
index 8830ce3..5f79d02 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
@@ -38,8 +38,9 @@ define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out
; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
@@ -96,8 +97,9 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out
; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
index d447517..f03f743 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -29,8 +29,9 @@ define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: [[COND:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true
; CHECK-NEXT: br i1 [[COND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index b8f4e84..753847f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -33,8 +33,9 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true
; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
@@ -87,8 +88,9 @@ define void @cond_uniform_load(ptr noalias nocapture %dst, ptr nocapture readonl
; CHECK-NEXT: [[TMP9:%.*]] = xor i1 [[TMP8]], true
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
index e046816..e84c0d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
@@ -67,7 +67,7 @@ define void @test_may_clobber(ptr %p) {
; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -111,7 +111,7 @@ define void @trivial_due_max_vscale(ptr %p) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -155,7 +155,7 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -207,7 +207,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 24)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -221,7 +221,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) {
; CHECK-NEXT: store i16 0, ptr [[GEP_OFF]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 2fbc73e..c66d8d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -133,7 +133,7 @@ define void @single_constant_stride_int_iv(ptr %p) {
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -237,7 +237,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP9]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -346,7 +346,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -360,7 +360,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -468,7 +468,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -483,7 +483,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]]
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -640,7 +640,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -656,7 +656,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -790,14 +790,14 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT1]]
; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META6:![0-9]+]]
+; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META5:![0-9]+]]
; STRIDED-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
+; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]]
; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; STRIDED: middle.block:
; STRIDED-NEXT: br label [[EXIT:%.*]]
; STRIDED: scalar.ph:
@@ -813,7 +813,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4
; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; STRIDED: exit:
; STRIDED-NEXT: ret void
;
@@ -965,7 +965,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -981,7 +981,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]]
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -1145,16 +1145,16 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], <vscale x 4 x i64> [[TMP18]]
; STRIDED-NEXT: [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP18]]
; STRIDED-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META13:![0-9]+]]
+; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META12:![0-9]+]]
; STRIDED-NEXT: [[TMP30:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
-; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META16:![0-9]+]], !noalias [[META13]]
+; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META15:![0-9]+]], !noalias [[META12]]
; STRIDED-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP16]]
; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]]
; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP25]]
; STRIDED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; STRIDED: middle.block:
; STRIDED-NEXT: br label [[EXIT:%.*]]
; STRIDED: scalar.ph:
@@ -1170,7 +1170,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[PTR2_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 [[STRIDE]]
; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]]
; STRIDED: exit:
; STRIDED-NEXT: ret void
;
@@ -1318,7 +1318,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[LOOP:%.*]]
; NOSTRIDED: exit:
@@ -1402,7 +1402,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; STRIDED: middle.block:
; STRIDED-NEXT: br label [[LOOP:%.*]]
; STRIDED: exit:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index 0c22a9e..46daee4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -142,7 +142,7 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
@@ -267,7 +267,7 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]]
; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: exit:
@@ -382,7 +382,7 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
@@ -508,7 +508,7 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]]
; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: exit:
@@ -621,7 +621,7 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index bae97e5..c34417b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -129,7 +129,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
@@ -143,7 +143,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; SCALABLE: [[FOR_END]]:
; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; SCALABLE-NEXT: ret i64 [[V_LCSSA]]
@@ -204,7 +204,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
@@ -218,7 +218,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ]
; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]]
@@ -269,7 +269,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -350,7 +350,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -399,7 +399,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -457,7 +457,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -499,7 +499,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -557,7 +557,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -608,7 +608,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -679,7 +679,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -731,7 +731,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -812,7 +812,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -860,7 +860,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -918,7 +918,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index e11b1ad..27d5e64 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -166,7 +166,6 @@ attributes #0 = { "target-cpu"="knl" }
; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0
; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, ptr {{%.*}}, align 1
; CHECK: LV: Found not uniform due to requiring predication: {{%.*}} = load i32, ptr {{%.*}}, align 1
-; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], ptr @a, i32 0, i32 {{%.*}}
;
;
@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
@@ -215,8 +214,9 @@ define void @PR40816() #1 {
; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
; FORCE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; FORCE: [[MIDDLE_BLOCK]]:
-; FORCE-NEXT: br [[RETURN:label %.*]]
-; FORCE: [[SCALAR_PH:.*:]]
+; FORCE-NEXT: br label %[[RETURN:.*]]
+; FORCE: [[RETURN]]:
+; FORCE-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 9453ad7..725fa49 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -540,6 +540,8 @@ define i64 @cost_assume(ptr %end, i64 %N) {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[N:%.*]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -551,14 +553,6 @@ define i64 @cost_assume(ptr %end, i64 %N) {
; CHECK-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI2]], splat (i64 1)
; CHECK-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI3]], splat (i64 1)
; CHECK-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI4]], splat (i64 1)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 28de5c7..56f0b85 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -58,7 +58,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK: middle.block:
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
@@ -72,7 +72,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 4
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 20
-; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
@@ -88,7 +88,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -132,14 +132,14 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl
; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 19)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
@@ -180,14 +180,14 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll
index 65c12a1..224ec4a6 100644
--- a/llvm/test/Transforms/LoopVectorize/assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/assume.ll
@@ -34,8 +34,9 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
@@ -73,29 +74,28 @@ define void @test2(ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 2
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 2
; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP7]], align 4
; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
%ptrint = ptrtoint ptr %a to i64
@@ -163,7 +163,7 @@ define void @predicated_assume(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_CLEANUP_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/operand-bundles.ll b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll
index 131e41a..ce07364 100644
--- a/llvm/test/Transforms/LoopVectorize/operand-bundles.ll
+++ b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll
@@ -189,12 +189,12 @@ define void @assume_cold_operand_bundle(ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "cold"() ]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
-; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "cold"() ]
; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
index ad8cd42..667df3a 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -88,11 +88,11 @@ define void @test2(ptr %a, ptr noalias %b) {
; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP7]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1
@@ -101,8 +101,6 @@ define void @test2(ptr %a, ptr noalias %b) {
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x float>, ptr [[TMP13]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP15:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD3]], splat (float 1.000000e+00)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vrgather-vcompress.s
index 4ec1683..4ec1683 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vrgather-vcompress.s
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vrgather-vcompress.s
index 5ebed10..5ebed10 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vrgather-vcompress.s
diff --git a/llvm/unittests/Analysis/DXILResourceTest.cpp b/llvm/unittests/Analysis/DXILResourceTest.cpp
index ee37fad..8c3a213 100644
--- a/llvm/unittests/Analysis/DXILResourceTest.cpp
+++ b/llvm/unittests/Analysis/DXILResourceTest.cpp
@@ -369,10 +369,8 @@ TEST(DXILResource, AnnotationsAndMetadata) {
{
StructType *CBufStruct =
StructType::create(Context, {Floatx4Ty, Floatx4Ty}, "cb0");
- TargetExtType *CBufLayoutType =
- llvm::TargetExtType::get(Context, "dx.Layout", CBufStruct, {32, 0, 16});
ResourceTypeInfo RTI(
- llvm::TargetExtType::get(Context, "dx.CBuffer", CBufLayoutType));
+ llvm::TargetExtType::get(Context, "dx.CBuffer", CBufStruct));
EXPECT_EQ(RTI.getResourceClass(), ResourceClass::CBuffer);
EXPECT_EQ(RTI.getCBufferSize(DL), 32u);
EXPECT_EQ(RTI.getResourceKind(), ResourceKind::CBuffer);
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index ed7a4fe..3414190 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -99,6 +99,7 @@ static inline bool inheritsFrom(InstructionContext child,
(noPrefix && inheritsFrom(child, IC_XS, noPrefix)));
case IC_64BIT:
return (inheritsFrom(child, IC_64BIT_REXW) ||
+ inheritsFrom(child, IC_64BIT_REX2) ||
(noPrefix && inheritsFrom(child, IC_64BIT_OPSIZE, noPrefix)) ||
(!AdSize64 && inheritsFrom(child, IC_64BIT_ADSIZE)) ||
(noPrefix && inheritsFrom(child, IC_64BIT_XD, noPrefix)) ||
@@ -151,8 +152,10 @@ static inline bool inheritsFrom(InstructionContext child,
case IC_64BIT_REXW_XS:
case IC_64BIT_REXW_OPSIZE:
case IC_64BIT_REXW_ADSIZE:
- case IC_64BIT_REX2:
+ case IC_64BIT_REX2_REXW:
return false;
+ case IC_64BIT_REX2:
+ return inheritsFrom(child, IC_64BIT_REX2_REXW);
case IC_VEX:
return (VEX_LIG && WIG && inheritsFrom(child, IC_VEX_L_W)) ||
(WIG && inheritsFrom(child, IC_VEX_W)) ||
@@ -980,9 +983,11 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
if ((index & ATTR_EVEXB) && (index & ATTR_EVEXU))
o << "_U";
}
- } else if ((index & ATTR_64BIT) && (index & ATTR_REX2))
+ } else if ((index & ATTR_64BIT) && (index & ATTR_REX2)) {
o << "IC_64BIT_REX2";
- else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS))
+ if (index & ATTR_REXW)
+ o << "_REXW";
+ } else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS))
o << "IC_64BIT_REXW_XS";
else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XD))
o << "IC_64BIT_REXW_XD";
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index e87a1c9..a006888 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -365,6 +365,8 @@ InstructionContext RecognizableInstr::insnContext() const {
insnContext = IC_64BIT_XD;
else if (OpPrefix == X86Local::XS)
insnContext = IC_64BIT_XS;
+ else if (HasREX_W && ExplicitREX2Prefix)
+ insnContext = IC_64BIT_REX2_REXW;
else if (ExplicitREX2Prefix)
insnContext = IC_64BIT_REX2;
else if (HasREX_W)
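The two X86 emitter hunks above add a dedicated decode context for instructions that carry both the REX2 prefix and REX.W, and make plain IC_64BIT_REX2 a parent of it in inheritsFrom(). A minimal sketch of the resulting selection order, using the context names from the diff (the enum values and helper are illustrative, not LLVM code):

// Decode contexts named in this diff; the enum and helper are
// illustrative only.
enum InstructionContext {
  IC_64BIT,
  IC_64BIT_REXW,
  IC_64BIT_REX2,
  IC_64BIT_REX2_REXW
};

// The combined REX2+REX.W case must be tested before the plain REX2
// case, mirroring the new branch order in insnContext().
static InstructionContext pickContext(bool hasREX_W, bool explicitREX2) {
  if (hasREX_W && explicitREX2)
    return IC_64BIT_REX2_REXW;
  if (explicitREX2)
    return IC_64BIT_REX2;
  if (hasREX_W)
    return IC_64BIT_REXW;
  return IC_64BIT;
}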
diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
index 9b59af7..830c394 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -61,7 +61,7 @@ LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);
/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
/// in it from outermost to innermost.
-bool LLVM_ATTRIBUTE_UNUSED isPerfectlyNested(ArrayRef<AffineForOp> loops);
+[[maybe_unused]] bool isPerfectlyNested(ArrayRef<AffineForOp> loops);
/// Get perfectly nested sequence of loops starting at root of loop nest
/// (the first op being another AffineFor, and the second op - a terminator).
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
index 8f87235..b8aa497 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -183,6 +183,10 @@ static constexpr StringLiteral getRoutineInfoAttrName() {
return StringLiteral("acc.routine_info");
}
+static constexpr StringLiteral getVarNameAttrName() {
+ return VarNameAttr::name;
+}
+
static constexpr StringLiteral getCombinedConstructsAttrName() {
return CombinedConstructsTypeAttr::name;
}
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index fecf81b..1eaa21b4 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -415,6 +415,13 @@ def OpenACC_ConstructResource : Resource<"::mlir::acc::ConstructResource">;
// Define a resource for the OpenACC current device setting.
def OpenACC_CurrentDeviceIdResource : Resource<"::mlir::acc::CurrentDeviceIdResource">;
+// Attribute for saving variable names; it can be attached to non-acc-dialect
+// operations to ensure the name is preserved.
+def OpenACC_VarNameAttr : OpenACC_Attr<"VarName", "var_name"> {
+ let parameters = (ins StringRefParameter<"">:$name);
+ let assemblyFormat = "`<` $name `>`";
+}
+
// Used for data specification in data clauses (2.7.1).
// Either (or both) extent and upperbound must be specified.
def OpenACC_DataBoundsOp : OpenACC_Op<"bounds",
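Because the new VarName attribute is discardable, any dialect's operations can carry it. A hedged sketch of how a lowering might tag an allocation with it, using the accessor added to OpenACC.h above (the helper and the variable name are hypothetical):

#include "mlir/Dialect/OpenACC/OpenACC.h"

// Hypothetical helper: preserve the source variable name on a
// non-acc-dialect allocation via the discardable acc.var_name attribute.
static void tagWithVarName(mlir::Operation *allocOp, llvm::StringRef name) {
  auto attr = mlir::acc::VarNameAttr::get(allocOp->getContext(), name);
  allocOp->setAttr(mlir::acc::getVarNameAttrName(), attr);
}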
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td
index 6736bc8..93e9e3d 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td
@@ -73,12 +73,18 @@ def OpenACC_PointerLikeTypeInterface : TypeInterface<"PointerLikeType"> {
InterfaceMethod<
/*description=*/[{
Generates allocation operations for the pointer-like type. It will create
- an allocate that produces memory space for an instance of the current type.
+ an allocate operation that produces memory space for an instance of the
+ current type.
The `varName` parameter is optional and can be used to provide a name
- for the allocated variable. If the current type is represented
- in a way that it does not capture the pointee type, `varType` must be
- passed in to provide the necessary type information.
+ for the allocated variable. When provided, it must be used by the
+ implementation; if the implementing dialect does not have its own
+ way to save it, the discardable `acc.var_name` attribute from the acc
+ dialect will be used.
+
+ If the current type is represented in a way that does not capture
+ the pointee type, `varType` must be passed in to provide the necessary
+ type information.
The `originalVar` parameter is optional but enables support for dynamic
types (e.g., dynamic memrefs). When provided, implementations can extract
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 5695d5d..19a5231 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -712,10 +712,14 @@ def XeGPU_MemLayoutAttr : XeGPUAttr<"MemLayout", "mem_layout"> {
return getAttrs().contains(name);
}
- ArrayAttr getStrides() {
+ ArrayAttr getStrideAttr() {
return getAttrs().getAs<ArrayAttr>("stride");
}
+ ArrayAttr getBlockAttr() {
+ return getAttrs().getAs<ArrayAttr>("block");
+ }
+
}];
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 73f9061..426377f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1298,14 +1298,14 @@ def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure,
}
def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
- AllElementTypesMatch<["mem_desc", "res"]>,
- AllRanksMatch<["mem_desc", "res"]>]> {
+ AllElementTypesMatch<["mem_desc", "res"]>]> {
let arguments = (ins XeGPU_MemDesc:$mem_desc,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
+ OptionalAttr<UnitAttr>:$subgroup_block_io,
OptionalAttr<DistributeLayoutAttr>:$layout
);
- let results = (outs XeGPU_ValueType:$res);
+ let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
let assemblyFormat = [{
$mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
prop-dict attr-dict `` `:` type(operands) `->` type(results)
@@ -1319,6 +1319,9 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
Arguments:
- `mem_desc`: the memory descriptor identifying the SLM region.
- `offsets`: the coordinates within the matrix to read from.
+ - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
+ lowered to a subgroup block load. When this attribute is present,
+ the offsets are subgroup-uniform across all lanes.
- `layout`: [optional] An attribute for guiding distributions among
subgroups and/or work-items. It currently can accept either
LayoutAttr or SliceAttr.
@@ -1336,7 +1339,10 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
}
ArrayRef<int64_t> getDataShape() {
- return getRes().getType().getShape();
+ auto resTy = getRes().getType();
+ if (auto vecTy = llvm::dyn_cast<VectorType>(resTy))
+ return vecTy.getShape();
+ return {};
}
}];
@@ -1344,13 +1350,13 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
}
def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- AllElementTypesMatch<["mem_desc", "data"]>,
- AllRanksMatch<["mem_desc", "data"]>]> {
+ AllElementTypesMatch<["mem_desc", "data"]>]> {
let arguments = (ins
- XeGPU_ValueType:$data,
+ AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data,
XeGPU_MemDesc:$mem_desc,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
+ OptionalAttr<UnitAttr>:$subgroup_block_io,
OptionalAttr<DistributeLayoutAttr>:$layout
);
let assemblyFormat = [{ $data `,` $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
@@ -1364,6 +1370,9 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- `mem_desc`: the memory descriptor specifying the SLM region.
- `offsets`: the coordinates within the matrix where the data will be written.
- `data`: the values to be stored in the matrix.
+ - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
+ lowered to a subgroup block store. When this attribute is present,
+ the offsets are subgroup-uniform across all lanes.
- `layout`: [optional] An attribute for guiding distributions among
subgroups and/or work-items. It currently can accept either
LayoutAttr or SliceAttr.
@@ -1378,7 +1387,10 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
}
ArrayRef<int64_t> getDataShape() {
- return getData().getType().getShape();
+ auto DataTy = getData().getType();
+ if (auto vecTy = llvm::dyn_cast<VectorType>(DataTy))
+ return vecTy.getShape();
+ return {};
}
}];
@@ -1386,41 +1398,4 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
let hasVerifier = 1;
}
-def XeGPU_MemDescSubviewOp: XeGPU_Op<"mem_desc_subview",
- [Pure, ViewLikeOpInterface, AllElementTypesMatch<["src", "res"]>]> {
- let description = [{
- Creates a subview of a memory descriptor. The resulting memory descriptor can have
- a lower rank than the source; in this case, the result dimensions correspond to the
- higher-order dimensions of the source memory descriptor.
-
- Arguments:
- - `src` : a memory descriptor.
- - `offsets` : the coordinates within the matrix the subview will be created from.
-
- Results:
- - `res` : a memory descriptor with smaller size.
-
- }];
- let arguments = (ins XeGPU_MemDesc:$src,
- Variadic<Index>:$offsets,
- DenseI64ArrayAttr:$const_offsets);
- let results = (outs XeGPU_MemDesc:$res);
- let assemblyFormat = [{$src `` custom<DynamicIndexList>($offsets, $const_offsets) prop-dict
- attr-dict `` `:` qualified(type($src)) `->` qualified(type($res))}];
- let builders = [
- OpBuilder<(ins "Type": $res, "Value":$src, "llvm::ArrayRef<OpFoldResult>": $offsets)>
- ];
-
- let extraClassDeclaration = [{
- mlir::Value getViewSource() { return getSrc(); }
-
- SmallVector<OpFoldResult> getMixedOffsets() {
- return getMixedValues(getConstOffsets(), getOffsets(), getContext());
- }
- }];
-
- let hasVerifier = 1;
-}
-
-
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
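Since load_matrix and store_matrix now also accept scalar values, getDataShape() can no longer assume the value is a vector. A self-contained restatement of the new behavior (dataShapeOf is an illustrative name, not an API from the diff):

#include "mlir/IR/BuiltinTypes.h"

// Vector values report their static shape; scalar values now report an
// empty shape instead of asserting on a missing VectorType.
static llvm::ArrayRef<int64_t> dataShapeOf(mlir::Type valueTy) {
  if (auto vecTy = llvm::dyn_cast<mlir::VectorType>(valueTy))
    return vecTy.getShape();
  return {};
}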
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 84902b2..b1196fb 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -237,12 +237,11 @@ def XeGPU_MemDesc: XeGPUTypeDef<"MemDesc", "mem_desc", [ShapedTypeInterface], "m
return MemDescType::get(getContext(), shape.value_or(getShape()), elementType, getMemLayout());
}
- ArrayAttr getStrides() {
+ ArrayAttr getStrideAttr() {
auto layout = getMemLayout();
if (layout && layout.hasAttr("stride")) {
- return layout.getStrides();
+ return layout.getStrideAttr();
}
-
// derive and return default strides
SmallVector<int64_t> defaultStrides;
llvm::append_range(defaultStrides, getShape().drop_front());
@@ -250,6 +249,63 @@ def XeGPU_MemDesc: XeGPUTypeDef<"MemDesc", "mem_desc", [ShapedTypeInterface], "m
Builder builder(getContext());
return builder.getI64ArrayAttr(defaultStrides);
}
+
+ ArrayAttr getBlockAttr() {
+ auto layout = getMemLayout();
+ if (layout && layout.hasAttr("block")) {
+ return layout.getBlockAttr();
+ }
+ Builder builder(getContext());
+ return builder.getI64ArrayAttr({});
+ }
+
+ /// Heuristic to determine if the MemDesc uses column-major layout,
+ /// based on the rank and the value of the first stride dimension.
+ bool isColMajor() {
+ auto dim0 = dyn_cast<IntegerAttr>(getStrideAttr()[0]);
+ return getRank() == 2 && dim0.getInt() == 1;
+ }
+
+ // Get the blocking shape for a MemDescType, which is represented
+ // as an attribute in MemDescType. By default it is the shape
+ // of the MemDescType.
+ SmallVector<int64_t> getBlockShape() {
+ SmallVector<int64_t> size(getShape());
+ ArrayAttr blockAttr = getBlockAttr();
+ if (!blockAttr.empty()) {
+ size.clear();
+ for (auto attr : blockAttr.getValue()) {
+ size.push_back(cast<IntegerAttr>(attr).getInt());
+ }
+ }
+ return size;
+ }
+
+ // Get the strides as a vector of integers.
+ // If the layout contains a block attribute, the strides are blocked strides.
+ //
+ // The blocking is applied to the base matrix shape derived from the
+ // memory descriptor's stride information. If the matrix described by
+ // the memory descriptor is not contiguous, it is assumed that the base
+ // matrix is contiguous and follows the same memory layout.
+ //
+ // It first computes the original matrix shape using the stride info,
+ // then computes the number of blocks in each dimension of that shape,
+ // then computes the outer block shape and stride,
+ // then combines the inner and outer block shapes and strides.
+ // e.g. for `mem_desc<32x256xf16, @block=[16, 8], @strides=[1, 32]>`
+ // its memory layout tuple is ([2,32,16,8],[128,256,1,16])
+ // for `mem_desc<256x32xf16, @block=[8, 16]>` with default @stride[32, 1]
+ // its memory layout tuple is ([32,2,8,16],[256,128,16,1])
+ SmallVector<int64_t> getStrideShape();
+
+ /// Generates instructions to compute the linearized offset.
+ /// If the memory descriptor is blocked, the returned offset is based on
+ /// the blocked layout. The strides of the memory descriptor are always
+ /// taken into account, whether the layout is blocked or not.
+ Value getLinearOffsets(OpBuilder &builder,
+ Location loc, ArrayRef<OpFoldResult> offsets);
+
+
}];
let hasCustomAssemblyFormat = true;
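The layout-tuple examples in the comment above can be reproduced with a small standalone computation. This is a sketch under the assumption that the block evenly divides the shape and that majorness follows isColMajor() (stride[0] == 1); it is not the actual getStrideShape() implementation:

#include <cassert>
#include <cstdint>
#include <vector>

struct LayoutTuple {
  std::vector<int64_t> shape;
  std::vector<int64_t> strides;
};

// Combine outer (block-count) and inner (within-block) dims for a 2D
// blocked mem_desc. Whole blocks are laid out contiguously along the
// fastest-varying dimension of the base matrix.
static LayoutTuple blockedLayout(std::vector<int64_t> shape,
                                 std::vector<int64_t> block, bool colMajor) {
  int64_t blocksD0 = shape[0] / block[0];
  int64_t blocksD1 = shape[1] / block[1];
  int64_t blockElems = block[0] * block[1];
  int64_t inner0 = colMajor ? 1 : block[1]; // within-block strides
  int64_t inner1 = colMajor ? block[0] : 1;
  int64_t outer0 = colMajor ? blockElems : blocksD1 * blockElems;
  int64_t outer1 = colMajor ? blocksD0 * blockElems : blockElems;
  return {{blocksD0, blocksD1, block[0], block[1]},
          {outer0, outer1, inner0, inner1}};
}

int main() {
  // mem_desc<32x256xf16, @block=[16, 8], @strides=[1, 32]>:
  LayoutTuple t = blockedLayout({32, 256}, {16, 8}, /*colMajor=*/true);
  assert((t.shape == std::vector<int64_t>{2, 32, 16, 8}));
  assert((t.strides == std::vector<int64_t>{128, 256, 1, 16}));
  // mem_desc<256x32xf16, @block=[8, 16]> with default @strides=[32, 1]:
  t = blockedLayout({256, 32}, {8, 16}, /*colMajor=*/false);
  assert((t.shape == std::vector<int64_t>{32, 2, 8, 16}));
  assert((t.strides == std::vector<int64_t>{256, 128, 16, 1}));
  return 0;
}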
diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
index 30ce1fb..6588b53 100644
--- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
+++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
@@ -1244,8 +1244,9 @@ bool FlatLinearValueConstraints::areVarsAlignedWithOther(
/// Checks if the SSA values associated with `cst`'s variables in range
/// [start, end) are unique.
-static bool LLVM_ATTRIBUTE_UNUSED areVarsUnique(
- const FlatLinearValueConstraints &cst, unsigned start, unsigned end) {
+[[maybe_unused]] static bool
+areVarsUnique(const FlatLinearValueConstraints &cst, unsigned start,
+ unsigned end) {
assert(start <= cst.getNumDimAndSymbolVars() &&
"Start position out of bounds");
@@ -1267,14 +1268,14 @@ static bool LLVM_ATTRIBUTE_UNUSED areVarsUnique(
}
/// Checks if the SSA values associated with `cst`'s variables are unique.
-static bool LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]] static bool
areVarsUnique(const FlatLinearValueConstraints &cst) {
return areVarsUnique(cst, 0, cst.getNumDimAndSymbolVars());
}
/// Checks if the SSA values associated with `cst`'s variables of kind `kind`
/// are unique.
-static bool LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]] static bool
areVarsUnique(const FlatLinearValueConstraints &cst, VarKind kind) {
if (kind == VarKind::SetDim)
diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp
index a1cbe29..547a4c2 100644
--- a/mlir/lib/Analysis/Presburger/Simplex.cpp
+++ b/mlir/lib/Analysis/Presburger/Simplex.cpp
@@ -34,7 +34,7 @@ using Direction = Simplex::Direction;
const int nullIndex = std::numeric_limits<int>::max();
// Return a + scale*b;
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
static SmallVector<DynamicAPInt, 8>
scaleAndAddForAssert(ArrayRef<DynamicAPInt> a, const DynamicAPInt &scale,
ArrayRef<DynamicAPInt> b) {
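The LLVM_ATTRIBUTE_UNUSED to [[maybe_unused]] conversions in this diff all move the attribute to the front of the declaration. That reordering is deliberate: the GNU __attribute__((unused)) behind the macro tolerates mid-declaration placement, while the standard C++17 attribute only appertains to the declared entity in certain positions. A minimal illustration:

// The GNU spelling tolerated "static bool LLVM_ATTRIBUTE_UNUSED f();".
// The standard attribute goes at the start of the declaration instead.
[[maybe_unused]] static bool alwaysTrue() { return true; }

int main() {} // alwaysTrue() is unused; no -Wunused-function warning.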
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 7b17106..06d0256 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -2730,6 +2730,17 @@ public:
operation->get(), toMlirStringRef(name)));
}
+ static void
+ forEachAttr(MlirOperation op,
+ llvm::function_ref<void(MlirStringRef, MlirAttribute)> fn) {
+ intptr_t n = mlirOperationGetNumAttributes(op);
+ for (intptr_t i = 0; i < n; ++i) {
+ MlirNamedAttribute na = mlirOperationGetAttribute(op, i);
+ MlirStringRef name = mlirIdentifierStr(na.name);
+ fn(name, na.attribute);
+ }
+ }
+
static void bind(nb::module_ &m) {
nb::class_<PyOpAttributeMap>(m, "OpAttributeMap")
.def("__contains__", &PyOpAttributeMap::dunderContains)
@@ -2737,7 +2748,50 @@ public:
.def("__getitem__", &PyOpAttributeMap::dunderGetItemNamed)
.def("__getitem__", &PyOpAttributeMap::dunderGetItemIndexed)
.def("__setitem__", &PyOpAttributeMap::dunderSetItem)
- .def("__delitem__", &PyOpAttributeMap::dunderDelItem);
+ .def("__delitem__", &PyOpAttributeMap::dunderDelItem)
+ .def("__iter__",
+ [](PyOpAttributeMap &self) {
+ nb::list keys;
+ PyOpAttributeMap::forEachAttr(
+ self.operation->get(),
+ [&](MlirStringRef name, MlirAttribute) {
+ keys.append(nb::str(name.data, name.length));
+ });
+ return nb::iter(keys);
+ })
+ .def("keys",
+ [](PyOpAttributeMap &self) {
+ nb::list out;
+ PyOpAttributeMap::forEachAttr(
+ self.operation->get(),
+ [&](MlirStringRef name, MlirAttribute) {
+ out.append(nb::str(name.data, name.length));
+ });
+ return out;
+ })
+ .def("values",
+ [](PyOpAttributeMap &self) {
+ nb::list out;
+ PyOpAttributeMap::forEachAttr(
+ self.operation->get(),
+ [&](MlirStringRef, MlirAttribute attr) {
+ out.append(PyAttribute(self.operation->getContext(), attr)
+ .maybeDownCast());
+ });
+ return out;
+ })
+ .def("items", [](PyOpAttributeMap &self) {
+ nb::list out;
+ PyOpAttributeMap::forEachAttr(
+ self.operation->get(),
+ [&](MlirStringRef name, MlirAttribute attr) {
+ out.append(nb::make_tuple(
+ nb::str(name.data, name.length),
+ PyAttribute(self.operation->getContext(), attr)
+ .maybeDownCast()));
+ });
+ return out;
+ });
}
private:
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt
index 84b2580..dd9edc4 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt
@@ -21,6 +21,7 @@ add_mlir_conversion_library(MLIRXeGPUToXeVM
MLIRIndexDialect
MLIRSCFDialect
MLIRXeGPUDialect
+ MLIRXeGPUUtils
MLIRPass
MLIRTransforms
MLIRSCFTransforms
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index ddcbc44..fcbf66d 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -22,6 +22,7 @@
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
@@ -63,6 +64,7 @@ static int32_t getNumericXeVMAddrSpace(xegpu::MemorySpace xeGpuMemspace) {
case xegpu::MemorySpace::SLM:
return static_cast<int>(xevm::AddrSpace::SHARED);
}
+ llvm_unreachable("Unknown XeGPU memory space");
}
// Get same bitwidth flat vector type of new element type.
@@ -186,6 +188,7 @@ class CreateNdDescToXeVMPattern
int64_t rank = mixedSizes.size();
if (rank != 2)
return rewriter.notifyMatchFailure(op, "Expected 2D shape.");
+
auto sourceTy = source.getType();
auto sourceMemrefTy = dyn_cast<MemRefType>(sourceTy);
// If source is a memref, we need to extract the aligned pointer as index.
@@ -364,10 +367,11 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
// Add a builder that creates
// offset * elemByteSize + baseAddr
-static Value addOffset(ConversionPatternRewriter &rewriter, Location loc,
- Value baseAddr, Value offset, int64_t elemByteSize) {
+static Value addOffsetToBaseAddr(ConversionPatternRewriter &rewriter,
+ Location loc, Value baseAddr, Value offset,
+ int64_t elemByteSize) {
Value byteSize = arith::ConstantIntOp::create(
- rewriter, loc, rewriter.getI64Type(), elemByteSize);
+ rewriter, loc, baseAddr.getType(), elemByteSize);
Value byteOffset = arith::MulIOp::create(rewriter, loc, offset, byteSize);
Value newAddr = arith::AddIOp::create(rewriter, loc, baseAddr, byteOffset);
return newAddr;
@@ -443,7 +447,8 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
// If offset is provided, we add them to the base pointer.
// Offset is in number of elements, we need to multiply by
// element byte size.
- basePtrI64 = addOffset(rewriter, loc, basePtrI64, offset, elemByteSize);
+ basePtrI64 =
+ addOffsetToBaseAddr(rewriter, loc, basePtrI64, offset, elemByteSize);
}
// Convert base pointer (i64) to LLVM pointer type.
Value basePtrLLVM =
@@ -506,6 +511,147 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
}
};
+// Lower xegpu::CreateMemDescOp to memref::ViewOp. Since SLM access instructions
+// on Xe2 and Xe3 operate on 32-bit or 64-bit units, all data types smaller than
+// 32 bits will be converted to 32 bits.
+class CreateMemDescOpPattern final
+ : public OpConversionPattern<xegpu::CreateMemDescOp> {
+public:
+ using OpConversionPattern<xegpu::CreateMemDescOp>::OpConversionPattern;
+ LogicalResult
+ matchAndRewrite(xegpu::CreateMemDescOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+
+ auto resTy = op.getMemDesc();
+
+ // Create the result MemRefType with the same shape, element type, and
+ // memory space
+ auto newResTy = getTypeConverter()->convertType<MemRefType>(resTy);
+
+ Value zero = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 0);
+ auto viewOp = memref::ViewOp::create(rewriter, op.getLoc(), newResTy,
+ op.getSource(), zero, ValueRange());
+ rewriter.replaceOp(op, viewOp);
+ return success();
+ }
+};
+
+template <typename OpType,
+ typename = std::enable_if_t<llvm::is_one_of<
+ OpType, xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>::value>>
+class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> {
+ using OpConversionPattern<OpType>::OpConversionPattern;
+ LogicalResult
+ matchAndRewrite(OpType op, typename OpType::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+
+ SmallVector<OpFoldResult> offsets = op.getMixedOffsets();
+ if (offsets.empty())
+ return rewriter.notifyMatchFailure(op, "Expected offset to be provided.");
+
+ auto loc = op.getLoc();
+ auto ctxt = rewriter.getContext();
+ Value basePtrStruct = adaptor.getMemDesc();
+ Value mdescVal = op.getMemDesc();
+ // Load result or Store value Type can be vector or scalar.
+ Value data;
+ if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>)
+ data = op.getResult();
+ else
+ data = adaptor.getData();
+ VectorType valOrResVecTy = dyn_cast<VectorType>(data.getType());
+ if (!valOrResVecTy)
+ valOrResVecTy = VectorType::get(1, data.getType());
+
+ int64_t elemBitWidth =
+ valOrResVecTy.getElementType().getIntOrFloatBitWidth();
+ // Element type must be multiple of 8 bits.
+ if (elemBitWidth % 8 != 0)
+ return rewriter.notifyMatchFailure(
+ op, "Expected element type bit width to be multiple of 8.");
+ int64_t elemByteSize = elemBitWidth / 8;
+
+ // Default memory space is SLM.
+ LLVM::LLVMPointerType ptrTypeLLVM = LLVM::LLVMPointerType::get(
+ ctxt, getNumericXeVMAddrSpace(xegpu::MemorySpace::SLM));
+
+ auto mdescTy = cast<xegpu::MemDescType>(mdescVal.getType());
+
+ Value basePtrLLVM = memref::ExtractAlignedPointerAsIndexOp::create(
+ rewriter, loc, basePtrStruct);
+
+ // Convert base pointer (ptr) to i32
+ Value basePtrI32 = arith::IndexCastUIOp::create(
+ rewriter, loc, rewriter.getI32Type(), basePtrLLVM);
+
+ Value linearOffset = mdescTy.getLinearOffsets(rewriter, loc, offsets);
+ linearOffset = arith::IndexCastUIOp::create(
+ rewriter, loc, rewriter.getI32Type(), linearOffset);
+ basePtrI32 = addOffsetToBaseAddr(rewriter, loc, basePtrI32, linearOffset,
+ elemByteSize);
+
+ // Convert base pointer (i32) to LLVM pointer type.
+ basePtrLLVM =
+ LLVM::IntToPtrOp::create(rewriter, loc, ptrTypeLLVM, basePtrI32);
+
+ if (op.getSubgroupBlockIoAttr()) {
+ // If the 'subgroup_block_io' attribute is set, the op lowers to
+ // xevm.blockload / xevm.blockstore.
+
+ Type intElemTy = rewriter.getIntegerType(elemBitWidth);
+ VectorType intVecTy =
+ VectorType::get(valOrResVecTy.getShape(), intElemTy);
+
+ if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
+ Value loadOp =
+ xevm::BlockLoadOp::create(rewriter, loc, intVecTy, basePtrLLVM);
+ if (intVecTy != valOrResVecTy) {
+ loadOp =
+ vector::BitCastOp::create(rewriter, loc, valOrResVecTy, loadOp);
+ }
+ rewriter.replaceOp(op, loadOp);
+ } else {
+ Value dataToStore = adaptor.getData();
+ if (valOrResVecTy != intVecTy) {
+ dataToStore =
+ vector::BitCastOp::create(rewriter, loc, intVecTy, dataToStore);
+ }
+ xevm::BlockStoreOp::create(rewriter, loc, basePtrLLVM, dataToStore,
+ nullptr);
+ rewriter.eraseOp(op);
+ }
+ return success();
+ }
+
+ if (valOrResVecTy.getNumElements() >= 1) {
+ auto chipOpt = xegpu::getChipStr(op);
+ if (!chipOpt || (*chipOpt != "pvc" && *chipOpt != "bmg")) {
+ // The lowering for chunk load only works for pvc and bmg.
+ return rewriter.notifyMatchFailure(
+ op, "The lowering is specific to pvc or bmg.");
+ }
+ }
+
+ if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
+ // If the size of valOrResVecTy is 1, it lowers to a scalar load/store
+ // operation. LLVM load/store does not support vectors of size 1, so we
+ // need to handle this case separately.
+ auto scalarTy = valOrResVecTy.getElementType();
+ LLVM::LoadOp loadOp;
+ if (valOrResVecTy.getNumElements() == 1)
+ loadOp = LLVM::LoadOp::create(rewriter, loc, scalarTy, basePtrLLVM);
+ else
+ loadOp =
+ LLVM::LoadOp::create(rewriter, loc, valOrResVecTy, basePtrLLVM);
+ rewriter.replaceOp(op, loadOp);
+ } else {
+ LLVM::StoreOp::create(rewriter, loc, adaptor.getData(), basePtrLLVM);
+ rewriter.eraseOp(op);
+ }
+ return success();
+ }
+};
+
class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> {
using OpConversionPattern::OpConversionPattern;
LogicalResult
@@ -548,8 +694,8 @@ class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> {
op, "Expected element type bit width to be multiple of 8.");
elemByteSize = elemBitWidth / 8;
}
- basePtrI64 =
- addOffset(rewriter, loc, basePtrI64, offsets, elemByteSize);
+ basePtrI64 = addOffsetToBaseAddr(rewriter, loc, basePtrI64, offsets,
+ elemByteSize);
}
}
// Default memory space is global.
@@ -786,6 +932,13 @@ struct ConvertXeGPUToXeVMPass
auto i32Type = IntegerType::get(&getContext(), 32);
return VectorType::get(8, i32Type);
});
+ // Convert MemDescType into flattened MemRefType for SLM
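+ // e.g. !xegpu.mem_desc<32x64xf16, ...> becomes memref<2048xf16, 3>, where
+ // 3 is the SLM address space.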
+ typeConverter.addConversion([&](xegpu::MemDescType type) -> Type {
+ Type elemTy = type.getElementType();
+ int numElems = type.getNumElements();
+ return MemRefType::get(numElems, elemTy, AffineMap(), 3);
+ });
+
typeConverter.addConversion([&](MemRefType type) -> Type {
// Convert MemRefType to i64 type.
return IntegerType::get(&getContext(), 64);
@@ -940,6 +1093,9 @@ void mlir::populateXeGPUToXeVMConversionPatterns(
LoadStoreToXeVMPattern<xegpu::LoadGatherOp>,
LoadStoreToXeVMPattern<xegpu::StoreScatterOp>>(
typeConverter, patterns.getContext());
+ patterns.add<LoadStoreMatrixToXeVMPattern<xegpu::LoadMatrixOp>,
+ LoadStoreMatrixToXeVMPattern<xegpu::StoreMatrixOp>,
+ CreateMemDescOpPattern>(typeConverter, patterns.getContext());
patterns.add<FenceToXeVMPattern, DpasToXeVMPattern>(typeConverter,
patterns.getContext());
}
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index f405d0c..c798adb 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -757,13 +757,13 @@ struct PackScales final : OpRewritePattern<ScaledMFMAOp> {
offset = numElements - 4l;
}
Type scaleSrcElemType = scaleSrcType.getElementType();
- auto newSrcType = VectorType::get(SmallVector<int64_t>({numElements}),
- scaleSrcElemType);
+ auto newSrcType =
+ VectorType::get(ArrayRef{numElements}, scaleSrcElemType);
Value newScaleSrc =
vector::ShapeCastOp::create(rewriter, loc, newSrcType, scaleSrc);
auto extract = vector::ExtractStridedSliceOp::create(
- rewriter, loc, newScaleSrc, ArrayRef<int64_t>{offset},
- ArrayRef<int64_t>{size}, ArrayRef<int64_t>{1});
+ rewriter, loc, newScaleSrc, ArrayRef{offset}, ArrayRef{size},
+ ArrayRef{int64_t(1)});
rewriter.modifyOpInPlace(op, [&] {
op->setOperand(opIdx, extract);
setOpsel(opIdx, opsel);
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 7e5ce26..749e2ba 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -125,9 +125,9 @@ static bool remainsLegalAfterInline(OpTy op, Region *src, Region *dest,
// Use "unused attribute" marker to silence clang-tidy warning stemming from
// the inability to see through "llvm::TypeSwitch".
template <>
-bool LLVM_ATTRIBUTE_UNUSED remainsLegalAfterInline(AffineApplyOp op,
- Region *src, Region *dest,
- const IRMapping &mapping) {
+[[maybe_unused]] bool remainsLegalAfterInline(AffineApplyOp op, Region *src,
+ Region *dest,
+ const IRMapping &mapping) {
// If it's a valid dimension, we need to check that it remains so.
if (isValidDim(op.getResult(), src))
return remainsLegalAfterInline(
@@ -1032,8 +1032,8 @@ static void simplifyMinOrMaxExprWithOperands(AffineMap &map,
/// Simplify the map while exploiting information on the values in `operands`.
// Use "unused attribute" marker to silence warning stemming from the inability
// to see through the template expansion.
-static void LLVM_ATTRIBUTE_UNUSED
-simplifyMapWithOperands(AffineMap &map, ArrayRef<Value> operands) {
+[[maybe_unused]] static void simplifyMapWithOperands(AffineMap &map,
+ ArrayRef<Value> operands) {
assert(map.getNumInputs() == operands.size() && "invalid operands for map");
SmallVector<AffineExpr> newResults;
newResults.reserve(map.getNumResults());
@@ -1125,6 +1125,141 @@ static LogicalResult replaceAffineMinBoundingBoxExpression(AffineMinOp minOp,
return success(*map != initialMap);
}
+/// Recursively traverse `e`. If `e` or one of its sub-expressions has the form
+/// e1 + e2 + ... + eK, where the e_i are a super(multi)set of `exprsToRemove`,
+/// record a mapping from e to `newVal` + sum({e1, e2, ..., eK} - exprsToRemove)
+/// in `replacementsMap`. If no entries were added to `replacementsMap`,
+/// nothing was found.
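+///
+/// For example (illustrative values): with e = d0 + d1 * 5 + d2 * 20 + s0 and
+/// exprsToRemove = {d0, d1 * 5, d2 * 20}, the entry e -> `newVal` + s0 is
+/// recorded in `replacementsMap`.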
+static void shortenAddChainsContainingAll(
+ AffineExpr e, const llvm::SmallDenseSet<AffineExpr, 4> &exprsToRemove,
+ AffineExpr newVal, DenseMap<AffineExpr, AffineExpr> &replacementsMap) {
+ auto binOp = dyn_cast<AffineBinaryOpExpr>(e);
+ if (!binOp)
+ return;
+ AffineExpr lhs = binOp.getLHS();
+ AffineExpr rhs = binOp.getRHS();
+ if (binOp.getKind() != AffineExprKind::Add) {
+ shortenAddChainsContainingAll(lhs, exprsToRemove, newVal, replacementsMap);
+ shortenAddChainsContainingAll(rhs, exprsToRemove, newVal, replacementsMap);
+ return;
+ }
+ SmallVector<AffineExpr> toPreserve;
+ llvm::SmallDenseSet<AffineExpr, 4> ourTracker(exprsToRemove);
+ AffineExpr thisTerm = rhs;
+ AffineExpr nextTerm = lhs;
+
+ while (thisTerm) {
+ if (!ourTracker.erase(thisTerm)) {
+ toPreserve.push_back(thisTerm);
+ shortenAddChainsContainingAll(thisTerm, exprsToRemove, newVal,
+ replacementsMap);
+ }
+ auto nextBinOp = dyn_cast_if_present<AffineBinaryOpExpr>(nextTerm);
+ if (!nextBinOp || nextBinOp.getKind() != AffineExprKind::Add) {
+ thisTerm = nextTerm;
+ nextTerm = AffineExpr();
+ } else {
+ thisTerm = nextBinOp.getRHS();
+ nextTerm = nextBinOp.getLHS();
+ }
+ }
+ if (!ourTracker.empty())
+ return;
+ // We reverse the terms to be preserved here in order to preserve
+ // associativity between them.
+ AffineExpr newExpr = newVal;
+ for (AffineExpr preserved : llvm::reverse(toPreserve))
+ newExpr = newExpr + preserved;
+ replacementsMap.insert({e, newExpr});
+}
+
+/// If this map contains the expression `x_n + x_{n-1} * C_{n-1} + ... + x_1 *
+/// C_1 + ...` (not necessarily in order), where the set of the `x_i` is the
+/// set of outputs of an `affine.delinearize_index` whose inverse is that
+/// expression, replace that expression with the input of that
+/// delinearize_index op.
+///
+/// `resultToReplace` is the value that was detected as the potential start of
+/// this replacement chain - if it isn't the rightmost result of the
+/// delinearization, this method fails. (This is intended to ensure we don't
+/// have redundant scans over the same expression.)
+///
+/// While this currently only handles delinearizations with a constant basis,
+/// that isn't a fundamental limitation.
+///
+/// This is a utility function for `replaceDimOrSym` below.
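+///
+/// For example (mirroring the delin_apply_cancel_exact test added below): with
+/// %a:3 = affine.delinearize_index %lin into (3, 4, 5), a map result
+/// s0 + s1 * 5 + s2 * 20 over (%a#2, %a#1, %a#0) is replaced by %lin.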
+static LogicalResult replaceAffineDelinearizeIndexInverseExpression(
+ AffineDelinearizeIndexOp delinOp, Value resultToReplace, AffineMap *map,
+ SmallVectorImpl<Value> &dims, SmallVectorImpl<Value> &syms) {
+ if (!delinOp.getDynamicBasis().empty())
+ return failure();
+ if (resultToReplace != delinOp.getMultiIndex().back())
+ return failure();
+
+ MLIRContext *ctx = delinOp.getContext();
+ SmallVector<AffineExpr> resToExpr(delinOp.getNumResults(), AffineExpr());
+ for (auto [pos, dim] : llvm::enumerate(dims)) {
+ auto asResult = dyn_cast_if_present<OpResult>(dim);
+ if (!asResult)
+ continue;
+ if (asResult.getOwner() == delinOp.getOperation())
+ resToExpr[asResult.getResultNumber()] = getAffineDimExpr(pos, ctx);
+ }
+ for (auto [pos, sym] : llvm::enumerate(syms)) {
+ auto asResult = dyn_cast_if_present<OpResult>(sym);
+ if (!asResult)
+ continue;
+ if (asResult.getOwner() == delinOp.getOperation())
+ resToExpr[asResult.getResultNumber()] = getAffineSymbolExpr(pos, ctx);
+ }
+ if (llvm::is_contained(resToExpr, AffineExpr()))
+ return failure();
+
+ bool isDimReplacement = llvm::all_of(resToExpr, llvm::IsaPred<AffineDimExpr>);
+ int64_t stride = 1;
+ llvm::SmallDenseSet<AffineExpr, 4> expectedExprs;
+ // This isn't zip_equal since sometimes the delinearize basis is missing a
+ // size for the first result.
+ for (auto [binding, size] : llvm::zip(
+ llvm::reverse(resToExpr), llvm::reverse(delinOp.getStaticBasis()))) {
+ expectedExprs.insert(binding * getAffineConstantExpr(stride, ctx));
+ stride *= size;
+ }
+ if (resToExpr.size() != delinOp.getStaticBasis().size())
+ expectedExprs.insert(resToExpr[0] * stride);
+
+ DenseMap<AffineExpr, AffineExpr> replacements;
+ AffineExpr delinInExpr = isDimReplacement
+ ? getAffineDimExpr(dims.size(), ctx)
+ : getAffineSymbolExpr(syms.size(), ctx);
+
+ for (AffineExpr e : map->getResults())
+ shortenAddChainsContainingAll(e, expectedExprs, delinInExpr, replacements);
+ if (replacements.empty())
+ return failure();
+
+ AffineMap origMap = *map;
+ if (isDimReplacement)
+ dims.push_back(delinOp.getLinearIndex());
+ else
+ syms.push_back(delinOp.getLinearIndex());
+ *map = origMap.replace(replacements, dims.size(), syms.size());
+
+ // Blank out dead dimensions and symbols
+ for (AffineExpr e : resToExpr) {
+ if (auto d = dyn_cast<AffineDimExpr>(e)) {
+ unsigned pos = d.getPosition();
+ if (!map->isFunctionOfDim(pos))
+ dims[pos] = nullptr;
+ }
+ if (auto s = dyn_cast<AffineSymbolExpr>(e)) {
+ unsigned pos = s.getPosition();
+ if (!map->isFunctionOfSymbol(pos))
+ syms[pos] = nullptr;
+ }
+ }
+ return success();
+}
+
/// Replace all occurrences of AffineExpr at position `pos` in `map` by the
/// defining AffineApplyOp expression and operands.
/// When `dimOrSymbolPosition < dims.size()`, AffineDimExpr@[pos] is replaced.
@@ -1157,6 +1292,11 @@ static LogicalResult replaceDimOrSym(AffineMap *map,
syms);
}
+ if (auto delinOp = v.getDefiningOp<affine::AffineDelinearizeIndexOp>()) {
+ return replaceAffineDelinearizeIndexInverseExpression(delinOp, v, map, dims,
+ syms);
+ }
+
auto affineApply = v.getDefiningOp<AffineApplyOp>();
if (!affineApply)
return failure();
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index cd216ef..4743941 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -1357,7 +1357,7 @@ bool mlir::affine::isValidLoopInterchangePermutation(
/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
/// in it from outermost to innermost.
-bool LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]] bool
mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
assert(!loops.empty() && "no loops provided");
@@ -1920,8 +1920,7 @@ generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
return copyNestRoot;
}
-static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
-emitRemarkForBlock(Block &block) {
+[[maybe_unused]] static InFlightDiagnostic emitRemarkForBlock(Block &block) {
return block.getParentOp()->emitRemark();
}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index 01a16ce..ac35eea 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -134,10 +134,10 @@ static void printExtTypeParams(AsmPrinter &p, ArrayRef<Type> typeParams,
/// These are unused for now.
/// TODO: Move over to these once more types have been migrated to TypeDef.
-LLVM_ATTRIBUTE_UNUSED static OptionalParseResult
+[[maybe_unused]] static OptionalParseResult
generatedTypeParser(AsmParser &parser, StringRef *mnemonic, Type &value);
-LLVM_ATTRIBUTE_UNUSED static LogicalResult
-generatedTypePrinter(Type def, AsmPrinter &printer);
+[[maybe_unused]] static LogicalResult generatedTypePrinter(Type def,
+ AsmPrinter &printer);
#include "mlir/Dialect/LLVMIR/LLVMTypeInterfaces.cpp.inc"
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 642ced9..90cbbd8 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -40,6 +40,16 @@ static bool isScalarLikeType(Type type) {
return type.isIntOrIndexOrFloat() || isa<ComplexType>(type);
}
+/// Helper function to attach the `VarName` attribute to an operation
+/// if a variable name is provided.
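+/// e.g. the attached attribute prints as
+/// {acc.var_name = #acc.var_name<"scalar">}.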
+static void attachVarNameAttr(Operation *op, OpBuilder &builder,
+ StringRef varName) {
+ if (!varName.empty()) {
+ auto varNameAttr = acc::VarNameAttr::get(builder.getContext(), varName);
+ op->setAttr(acc::getVarNameAttrName(), varNameAttr);
+ }
+}
+
struct MemRefPointerLikeModel
: public PointerLikeType::ExternalModel<MemRefPointerLikeModel,
MemRefType> {
@@ -83,7 +93,9 @@ struct MemRefPointerLikeModel
// then we can generate an alloca operation.
if (memrefTy.hasStaticShape()) {
needsFree = false; // alloca doesn't need deallocation
- return memref::AllocaOp::create(builder, loc, memrefTy).getResult();
+ auto allocaOp = memref::AllocaOp::create(builder, loc, memrefTy);
+ attachVarNameAttr(allocaOp, builder, varName);
+ return allocaOp.getResult();
}
// For dynamic memrefs, extract sizes from the original variable if
@@ -103,8 +115,10 @@ struct MemRefPointerLikeModel
// Static dimensions are handled automatically by AllocOp
}
needsFree = true; // alloc needs deallocation
- return memref::AllocOp::create(builder, loc, memrefTy, dynamicSizes)
- .getResult();
+ auto allocOp =
+ memref::AllocOp::create(builder, loc, memrefTy, dynamicSizes);
+ attachVarNameAttr(allocOp, builder, varName);
+ return allocOp.getResult();
}
// TODO: Unranked not yet supported.
diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp
index a1711a6..069191c 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp
@@ -143,8 +143,8 @@ void VarInfo::setNum(Var::Num n) {
/// Helper function for `assertUsageConsistency` to better handle SMLoc
/// mismatches.
-LLVM_ATTRIBUTE_UNUSED static llvm::SMLoc
-minSMLoc(AsmParser &parser, llvm::SMLoc sm1, llvm::SMLoc sm2) {
+[[maybe_unused]] static llvm::SMLoc minSMLoc(AsmParser &parser, llvm::SMLoc sm1,
+ llvm::SMLoc sm2) {
const auto loc1 = dyn_cast<FileLineColLoc>(parser.getEncodedSourceLoc(sm1));
assert(loc1 && "Could not get `FileLineColLoc` for first `SMLoc`");
const auto loc2 = dyn_cast<FileLineColLoc>(parser.getEncodedSourceLoc(sm2));
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
index f539502..684c088 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
@@ -43,8 +43,8 @@ using namespace mlir::sparse_tensor;
//===----------------------------------------------------------------------===//
#ifndef NDEBUG
-LLVM_ATTRIBUTE_UNUSED static void dumpIndexMemRef(OpBuilder &builder,
- Location loc, Value memref) {
+[[maybe_unused]] static void dumpIndexMemRef(OpBuilder &builder, Location loc,
+ Value memref) {
memref = memref::CastOp::create(
builder, loc, UnrankedMemRefType::get(builder.getIndexType(), 0), memref);
createFuncCall(builder, loc, "printMemrefInd", TypeRange{},
diff --git a/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp b/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp
index 9a24c2b..a2cff6a 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp
@@ -21,10 +21,10 @@ using namespace mlir;
// These are automatically generated by ODS but are not used as the Transform
// dialect uses a different dispatch mechanism to support dialect extensions.
-LLVM_ATTRIBUTE_UNUSED static OptionalParseResult
+[[maybe_unused]] static OptionalParseResult
generatedTypeParser(AsmParser &parser, StringRef *mnemonic, Type &value);
-LLVM_ATTRIBUTE_UNUSED static LogicalResult
-generatedTypePrinter(Type def, AsmPrinter &printer);
+[[maybe_unused]] static LogicalResult generatedTypePrinter(Type def,
+ AsmPrinter &printer);
#define GET_TYPEDEF_CLASSES
#include "mlir/Dialect/Transform/IR/TransformTypes.cpp.inc"
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 9beb22d..1599ae9 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -727,6 +727,152 @@ void MemLayoutAttr::print(AsmPrinter &printer) const {
}
printer << ">";
}
+// A helper utility to perform a binary operation on two OpFoldResults:
+// constant ops are materialized for attribute operands, and the
+// corresponding arith op is generated on the resulting values.
+template <typename ArithOp>
+OpFoldResult genBinOp(OpFoldResult a, OpFoldResult b, Location loc,
+ OpBuilder &builder) {
+ auto aVal = getValueOrCreateConstantIndexOp(builder, loc, a);
+ auto bVal = getValueOrCreateConstantIndexOp(builder, loc, b);
+ return builder.create<ArithOp>(loc, aVal, bVal).getResult();
+}
+
+// A helper macro to perform signed division on an OpFoldResult and an int64_t.
+#define div(a, b) \
+ genBinOp<arith::DivSIOp>(a, builder.getIndexAttr(b), loc, builder)
+
+// A helper macro to perform signed remainder on an OpFoldResult and an int64_t.
+#define rem(a, b) \
+ genBinOp<arith::RemSIOp>(a, builder.getIndexAttr(b), loc, builder)
+
+// A helper macro to perform multiplication on an OpFoldResult and an int64_t.
+#define mul(a, b) \
+ genBinOp<arith::MulIOp>(a, builder.getIndexAttr(b), loc, builder)
+
+// A helper macro to perform addition on two OpFoldResults.
+#define add(a, b) genBinOp<arith::AddIOp>(a, b, loc, builder)
+
+// Block the given offsets according to the block shape: if the original
+// offset is [y, x] and the block shape is [By, Bx], the blocked offset is
+// [y/By, x/Bx, y%By, x%Bx].
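+// e.g. (illustrative values) offsets [19, 35] with block shape [16, 16]
+// become [1, 2, 3, 3].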
+SmallVector<OpFoldResult> getBlockedOffsets(OpBuilder &builder, Location loc,
+ ArrayRef<OpFoldResult> offsets,
+ ArrayRef<int64_t> blockShape) {
+
+ assert(offsets.size() == blockShape.size() &&
+ "offsets and blockShape must have the same size");
+ SmallVector<OpFoldResult> blockedOffsets;
+ SmallVector<OpFoldResult> divs, rems;
+
+ for (auto [offset, block] : llvm::zip(offsets, blockShape)) {
+ divs.push_back(div(offset, block));
+ rems.push_back(rem(offset, block));
+ }
+ blockedOffsets.append(divs.begin(), divs.end());
+ blockedOffsets.append(rems.begin(), rems.end());
+
+ return blockedOffsets;
+}
+
+// Get the strides of a MemDesc as a vector of integers.
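+// e.g. for mem_desc<32x64xf16, @block=[16, 16]>, the blocked shape is
+// [2, 4, 16, 16] and the returned strides are [1024, 256, 16, 1]; with
+// @strides=[1, 32] they are [256, 512, 1, 16].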
+SmallVector<int64_t> MemDescType::getStrideShape() {
+
+ SmallVector<int64_t> matrixShape(getShape().begin(), getShape().end());
+
+ ArrayAttr strideAttr = getStrideAttr();
+ SmallVector<int64_t> strides;
+ for (Attribute attr : strideAttr.getValue()) {
+ strides.push_back(cast<IntegerAttr>(attr).getInt());
+ }
+
+ SmallVector<int64_t> innerBlkShape = getBlockShape();
+
+ // Get the permutation from the fastest-changing dim (FCD) to the
+ // slowest-changing dim (LCD): perm[i] is the dim with the i-th smallest
+ // stride.
+ SmallVector<int, 4> perm =
+ llvm::to_vector<4>(llvm::seq<int>(0, strides.size()));
+ llvm::sort(perm, [&](int a, int b) { return strides[a] < strides[b]; });
+
+ assert(strides[perm[0]] == 1 && "innermost dim must have stride 1");
+
+ SmallVector<int64_t> innerBlkStride(innerBlkShape.size());
+ innerBlkStride[perm[0]] = 1;
+ for (size_t i = 1; i < perm.size(); ++i)
+ innerBlkStride[perm[i]] =
+ innerBlkStride[perm[i - 1]] * innerBlkShape[perm[i - 1]];
+
+ // Compute the original matrix shape using the stride info, and the number
+ // of blocks in each dimension. The shape of the highest dim can't be
+ // derived from the stride info, but that doesn't impact the stride
+ // computation for a blocked layout.
+ SmallVector<int64_t> matrixShapeOrig(matrixShape.size());
+ SmallVector<int64_t> BlkShapeOrig(matrixShape.size());
+ for (size_t i = 0; i < perm.size() - 1; ++i) {
+ matrixShapeOrig[perm[i]] = strides[perm[i + 1]] / strides[perm[i]];
+ BlkShapeOrig[perm[i]] = matrixShapeOrig[perm[i]] / innerBlkShape[perm[i]];
+ }
+
+ int64_t innerBlkSize = 1;
+ for (auto s : innerBlkShape)
+ innerBlkSize *= s;
+
+ SmallVector<int64_t> outerBlkStride(matrixShape.size());
+ outerBlkStride[perm[0]] = innerBlkSize;
+ for (size_t i = 0; i < perm.size() - 1; ++i) {
+ outerBlkStride[perm[i + 1]] =
+ outerBlkStride[perm[i]] * BlkShapeOrig[perm[i]];
+ }
+
+ // combine the inner and outer strides
+ SmallVector<int64_t> blockedStrides;
+ blockedStrides.append(outerBlkStride.begin(), outerBlkStride.end());
+ blockedStrides.append(innerBlkStride.begin(), innerBlkStride.end());
+
+ return blockedStrides;
+}
+
+// Calculate the linear offset using the blocked offsets and strides.
+Value MemDescType::getLinearOffsets(OpBuilder &builder, Location loc,
+ ArrayRef<OpFoldResult> offsets) {
+
+ SmallVector<int64_t> matrixShape(getShape().begin(), getShape().end());
+ SmallVector<int64_t> blockShape = getBlockShape();
+ SmallVector<int64_t> strides = getStrideShape();
+ SmallVector<OpFoldResult> blockedOffsets;
+
+ // A block shape equal to the matrix shape means no blocking.
+ if (llvm::equal(blockShape, matrixShape)) {
+ // remove the outer dims from strides
+ strides.erase(strides.begin(), strides.begin() + matrixShape.size());
+ } else {
+ assert(offsets.size() == blockShape.size() &&
+ "offsets and blockShape must have the same size");
+ // say the original offset is [y, x], and the block shape is [By, Bx],
+ // then the blocked offset is [y/By, x/Bx, y%By, x%Bx]
+
+ SmallVector<OpFoldResult> divs, rems;
+
+ for (auto [offset, block] : llvm::zip(offsets, blockShape)) {
+ divs.push_back(div(offset, block));
+ rems.push_back(rem(offset, block));
+ }
+ blockedOffsets.append(divs.begin(), divs.end());
+ blockedOffsets.append(rems.begin(), rems.end());
+ offsets = blockedOffsets;
+ }
+
+ // Start from the matrix descriptor's base offset, which is 0 here.
+ Value linearOffset = arith::ConstantIndexOp::create(builder, loc, 0);
+ for (size_t i = 0; i < offsets.size(); ++i) {
+ OpFoldResult mulResult = mul(offsets[i], strides[i]);
+ Value mulVal = getValueOrCreateConstantIndexOp(builder, loc, mulResult);
+ linearOffset = arith::AddIOp::create(builder, loc, mulVal, linearOffset);
+ }
+
+ return linearOffset;
+}
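+
+// Keep the div/rem/mul/add helpers local to the utilities above (assuming no
+// further uses of these macro names in this file).
+#undef div
+#undef rem
+#undef mul
+#undef add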
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index e0a8ac4..abd12e2 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -173,6 +173,49 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy,
return success();
}
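+/// Validates the data/result type of a load_matrix or store_matrix op against
+/// its mem_desc type and the optional subgroup_block_io unit attribute.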
+LogicalResult
+IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy,
+ UnitAttr subgroup_block_io,
+ function_ref<InFlightDiagnostic()> emitError) {
+
+ if (!dataTy) {
+ if (subgroup_block_io)
+ return emitError() << "subgroup_block_io "
+ "is only allowed when the result is a 1D VectorType.";
+ else
+ return success();
+ }
+
+ if (mdescTy.getRank() != 2)
+ return emitError() << "mem_desc must be 2D.";
+
+ ArrayRef<int64_t> dataShape = dataTy.getShape();
+ ArrayRef<int64_t> mdescShape = mdescTy.getShape();
+
+ if (dataShape.size() == 2) {
+ if (subgroup_block_io)
+ return emitError() << "subgroup_block_io "
+ "is only allowed when the result is a 1D VectorType.";
+ if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape),
+ [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+ return emitError() << "data shape must not exceed mem_desc shape.";
+ } else {
+ SmallVector<int64_t> blockShape = mdescTy.getBlockShape();
+ // If the subgroup_block_io attribute is set, mdescTy must have a block
+ // attribute.
+ if (subgroup_block_io && blockShape.empty())
+ return emitError() << "mem_desc must have block attribute when "
+ "subgroup_block_io is set.";
+ // If the subgroup_block_io attribute is set, the memdesc should be row
+ // major.
+ if (subgroup_block_io && mdescTy.isColMajor())
+ return emitError() << "mem_desc should be row major when "
+ "subgroup_block_io is set.";
+ }
+
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_CreateNdDescOp
//===----------------------------------------------------------------------===//
@@ -1049,23 +1092,20 @@ void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res,
llvm::SmallVector<int64_t> staticOffsets;
dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+ // Call the generated builder with all parameters (including optional ones as
+ // nullptr/empty)
build(builder, state, res, memDesc, dynamicOffsets, staticOffsetsAttr,
- layout);
+ /*subgroup_block_io=*/nullptr, layout);
}
LogicalResult LoadMatrixOp::verify() {
- VectorType resTy = getRes().getType();
- MemDescType mdescTy = getMemDesc().getType();
- if (mdescTy.getRank() != 2)
- return emitOpError("mem_desc must be 2D.");
+ auto resTy = dyn_cast<VectorType>(getRes().getType());
+ UnitAttr subgroup_block_io = getSubgroupBlockIoAttr();
+ MemDescType mdescTy = getMemDesc().getType();
- ArrayRef<int64_t> valueShape = resTy.getShape();
- ArrayRef<int64_t> mdescShape = mdescTy.getShape();
- if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape),
- [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
- return emitOpError("result shape must not exceed mem_desc shape.");
- return success();
+ return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io,
+ [&]() { return emitError(); });
}
//===----------------------------------------------------------------------===//
@@ -1080,57 +1120,16 @@ void StoreMatrixOp::build(OpBuilder &builder, OperationState &state, Value data,
dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
build(builder, state, data, memDesc, dynamicOffsets, staticOffsetsAttr,
- layout);
+ /*subgroup_block_io=*/nullptr, layout);
}
LogicalResult StoreMatrixOp::verify() {
- VectorType dataTy = getData().getType();
- MemDescType mdescTy = getMemDesc().getType();
-
- if (mdescTy.getRank() != 2)
- return emitOpError("mem_desc must be 2D.");
-
- ArrayRef<int64_t> dataShape = dataTy.getShape();
- ArrayRef<int64_t> mdescShape = mdescTy.getShape();
- if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape),
- [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
- return emitOpError("data shape must not exceed mem_desc shape.");
-
- return success();
-}
-
-//===----------------------------------------------------------------------===//
-// XeGPU_MemDescSubviewOp
-//===----------------------------------------------------------------------===//
-
-void MemDescSubviewOp::build(OpBuilder &builder, OperationState &state,
- Type resTy, Value src,
- llvm::ArrayRef<OpFoldResult> offsets) {
- llvm::SmallVector<Value> dynamicOffsets;
- llvm::SmallVector<int64_t> staticOffsets;
- dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
- auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
- build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr);
-}
-
-LogicalResult MemDescSubviewOp::verify() {
- MemDescType srcTy = getSrc().getType();
- MemDescType resTy = getRes().getType();
- ArrayRef<int64_t> srcShape = srcTy.getShape();
- ArrayRef<int64_t> resShape = resTy.getShape();
- if (srcTy.getRank() < resTy.getRank())
- return emitOpError("result rank must not exceed source rank.");
-
- if (llvm::any_of(
- llvm::zip_equal(resShape, srcShape.take_back(resShape.size())),
- [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
- return emitOpError("result shape must not exceed source shape.");
-
- if (srcTy.getStrides() != resTy.getStrides())
- return emitOpError("result must inherit the source strides.");
-
- return success();
+ auto dataTy = dyn_cast<VectorType>(getData().getType());
+ UnitAttr subgroup_block_io = getSubgroupBlockIoAttr();
+ MemDescType mdescTy = getMemDesc().getType();
+ return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io,
+ [&]() { return emitError(); });
}
namespace mlir {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index a178d0f..aafa1b7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -941,7 +941,9 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
PatternRewriter &rewriter) const override {
Location loc = op.getLoc();
- VectorType valueTy = op.getType();
+ VectorType valueTy = llvm::dyn_cast<VectorType>(op.getType());
+ assert(valueTy && "the value type must be vector type!");
+
std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
if (!targetShape || targetShape->size() != (size_t)valueTy.getRank())
return failure();
@@ -984,7 +986,8 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
return failure();
Location loc = op.getLoc();
- VectorType valueTy = op.getData().getType();
+ VectorType valueTy = llvm::dyn_cast<VectorType>(op.getData().getType());
+ assert(valueTy && "the value type must be vector type!");
ArrayRef<int64_t> shape = valueTy.getShape();
auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index c28d2fc..31a967d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -991,7 +991,8 @@ struct WgToSgLoadMatrixOp : public OpConversionPattern<xegpu::LoadMatrixOp> {
return failure();
ArrayRef<int64_t> wgShape = op.getDataShape();
- VectorType valueTy = op.getRes().getType();
+ VectorType valueTy = llvm::dyn_cast<VectorType>(op.getRes().getType());
+ assert(valueTy && "the value type must be vector type!");
Type elemTy = valueTy.getElementType();
xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index 89b81cf..5f63fe6 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -1204,7 +1204,7 @@ AffineMap AffineMap::getImpl(unsigned dimCount, unsigned symbolCount,
/// present in result expressions is less than `dimCount` and the highest index
/// of symbolic identifier present in result expressions is less than
/// `symbolCount`.
-LLVM_ATTRIBUTE_UNUSED static bool
+[[maybe_unused]] static bool
willBeValidAffineMap(unsigned dimCount, unsigned symbolCount,
ArrayRef<AffineExpr> results) {
int64_t maxDimPosition = -1;
diff --git a/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp b/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp
index 9670285..3fda5a7 100644
--- a/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp
+++ b/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp
@@ -93,7 +93,7 @@ void CodeGen::generate(const ast::Module &astModule, ModuleOp module) {
// Emit function to add the generated matchers to the pattern list.
os << "template <typename... ConfigsT>\n"
- "static void LLVM_ATTRIBUTE_UNUSED populateGeneratedPDLLPatterns("
+ "[[maybe_unused]] static void populateGeneratedPDLLPatterns("
"::mlir::RewritePatternSet &patterns, ConfigsT &&...configs) {\n";
for (const auto &name : patternNames)
os << " patterns.add<" << name
diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt
index 9f5246d..ffa96ad 100644
--- a/mlir/python/CMakeLists.txt
+++ b/mlir/python/CMakeLists.txt
@@ -137,6 +137,16 @@ declare_mlir_dialect_python_bindings(
declare_mlir_dialect_python_bindings(
ADD_TO_PARENT MLIRPythonSources.Dialects
ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
+ TD_FILE dialects/OpenACCOps.td
+ SOURCES
+ dialects/openacc.py
+ DIALECT_NAME acc
+ DEPENDS acc_common_td
+ )
+
+declare_mlir_dialect_python_bindings(
+ ADD_TO_PARENT MLIRPythonSources.Dialects
+ ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
TD_FILE dialects/GPUOps.td
SOURCES_GLOB dialects/gpu/*.py
DIALECT_NAME gpu
diff --git a/mlir/python/mlir/dialects/OpenACCOps.td b/mlir/python/mlir/dialects/OpenACCOps.td
new file mode 100644
index 0000000..69a3002
--- /dev/null
+++ b/mlir/python/mlir/dialects/OpenACCOps.td
@@ -0,0 +1,14 @@
+//===-- OpenACCOps.td - Entry point for OpenACCOps bindings -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PYTHON_BINDINGS_OPENACC_OPS
+#define PYTHON_BINDINGS_OPENACC_OPS
+
+include "mlir/Dialect/OpenACC/OpenACCOps.td"
+
+#endif // PYTHON_BINDINGS_OPENACC_OPS
diff --git a/mlir/python/mlir/dialects/gpu/__init__.py b/mlir/python/mlir/dialects/gpu/__init__.py
index 4cd80aa..b14ea68 100644
--- a/mlir/python/mlir/dialects/gpu/__init__.py
+++ b/mlir/python/mlir/dialects/gpu/__init__.py
@@ -3,5 +3,151 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
from .._gpu_ops_gen import *
+from .._gpu_ops_gen import _Dialect
from .._gpu_enum_gen import *
from ..._mlir_libs._mlirDialectsGPU import *
+from typing import Callable, Sequence, Union, Optional
+
+try:
+ from ...ir import (
+ FunctionType,
+ TypeAttr,
+ StringAttr,
+ UnitAttr,
+ Block,
+ InsertionPoint,
+ Type,
+ DenseI32ArrayAttr,
+ )
+ from .._ods_common import (
+ get_default_loc_context as _get_default_loc_context,
+ _cext as _ods_cext,
+ )
+except ImportError as e:
+ raise RuntimeError("Error loading imports from extension module") from e
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
+class GPUFuncOp(GPUFuncOp):
+ __doc__ = GPUFuncOp.__doc__
+
+ KERNEL_ATTR_NAME = "gpu.kernel"
+ KNOWN_BLOCK_SIZE_ATTR_NAME = "known_block_size"
+ KNOWN_GRID_SIZE_ATTR_NAME = "known_grid_size"
+
+ FUNCTION_TYPE_ATTR_NAME = "function_type"
+ SYM_NAME_ATTR_NAME = "sym_name"
+ ARGUMENT_ATTR_NAME = "arg_attrs"
+ RESULT_ATTR_NAME = "res_attrs"
+
+ def __init__(
+ self,
+ function_type: Union[FunctionType, TypeAttr],
+ sym_name: Optional[Union[str, StringAttr]] = None,
+ kernel: Optional[bool] = None,
+ workgroup_attrib_attrs: Optional[Sequence[dict]] = None,
+ private_attrib_attrs: Optional[Sequence[dict]] = None,
+ known_block_size: Optional[Union[Sequence[int], DenseI32ArrayAttr]] = None,
+ known_grid_size: Optional[Union[Sequence[int], DenseI32ArrayAttr]] = None,
+ loc=None,
+ ip=None,
+ body_builder: Optional[Callable[[GPUFuncOp], None]] = None,
+ ):
+ """
+ Create a GPUFuncOp with the provided `function_type`, `sym_name`,
+ `kernel`, `workgroup_attrib_attrs`, `private_attrib_attrs`, `known_block_size`,
+ `known_grid_size`, and `body_builder`.
+ - `function_type` is a FunctionType or a TypeAttr.
+ - `sym_name` is a string or a StringAttr representing the function name.
+ - `kernel` is a boolean representing whether the function is a kernel.
+ - `workgroup_attrib_attrs` is an optional list of dictionaries.
+ - `private_attrib_attrs` is an optional list of dictionaries.
+ - `known_block_size` is an optional list of integers or a DenseI32ArrayAttr representing the known block size.
+ - `known_grid_size` is an optional list of integers or a DenseI32ArrayAttr representing the known grid size.
+ - `body_builder` is an optional callback. When provided, a new entry block
+ is created and the callback is invoked with the new op as argument within
+ an InsertionPoint context already set for the block. The callback is
+ expected to insert a terminator in the block.
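+
+ Example (illustrative sketch; assumes creation inside a gpu.module and
+ that this module is imported as `gpu`):
+
+ func_type = FunctionType.get([], [])
+ gpu.GPUFuncOp(func_type, sym_name="kernel", kernel=True,
+ body_builder=lambda func: gpu.ReturnOp([]))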
+ """
+ function_type = (
+ TypeAttr.get(function_type)
+ if not isinstance(function_type, TypeAttr)
+ else function_type
+ )
+ super().__init__(
+ function_type,
+ workgroup_attrib_attrs=workgroup_attrib_attrs,
+ private_attrib_attrs=private_attrib_attrs,
+ loc=loc,
+ ip=ip,
+ )
+
+ if isinstance(sym_name, str):
+ self.attributes[self.SYM_NAME_ATTR_NAME] = StringAttr.get(sym_name)
+ elif isinstance(sym_name, StringAttr):
+ self.attributes[self.SYM_NAME_ATTR_NAME] = sym_name
+ else:
+ raise ValueError("sym_name must be a string or a StringAttr")
+
+ if kernel:
+ self.attributes[self.KERNEL_ATTR_NAME] = UnitAttr.get()
+
+ if known_block_size is not None:
+ if isinstance(known_block_size, Sequence):
+ block_size = DenseI32ArrayAttr.get(known_block_size)
+ self.attributes[self.KNOWN_BLOCK_SIZE_ATTR_NAME] = block_size
+ elif isinstance(known_block_size, DenseI32ArrayAttr):
+ self.attributes[self.KNOWN_BLOCK_SIZE_ATTR_NAME] = known_block_size
+ else:
+ raise ValueError(
+ "known_block_size must be a list of integers or a DenseI32ArrayAttr"
+ )
+
+ if known_grid_size is not None:
+ if isinstance(known_grid_size, Sequence):
+ grid_size = DenseI32ArrayAttr.get(known_grid_size)
+ self.attributes[self.KNOWN_GRID_SIZE_ATTR_NAME] = grid_size
+ elif isinstance(known_grid_size, DenseI32ArrayAttr):
+ self.attributes[self.KNOWN_GRID_SIZE_ATTR_NAME] = known_grid_size
+ else:
+ raise ValueError(
+ "known_grid_size must be a list of integers or a DenseI32ArrayAttr"
+ )
+
+ if body_builder is not None:
+ with InsertionPoint(self.add_entry_block()):
+ body_builder(self)
+
+ @property
+ def name(self) -> StringAttr:
+ return StringAttr(self.attributes[self.SYM_NAME_ATTR_NAME])
+
+ @property
+ def is_kernel(self) -> bool:
+ return self.KERNEL_ATTR_NAME in self.attributes
+
+ def add_entry_block(self) -> Block:
+ if len(self.body.blocks) > 0:
+ raise RuntimeError(f"Entry block already exists for {self.name.value}")
+
+ function_type = self.function_type.value
+ return self.body.blocks.append(
+ *function_type.inputs,
+ arg_locs=[self.location for _ in function_type.inputs],
+ )
+
+ @property
+ def entry_block(self) -> Block:
+ if len(self.body.blocks) == 0:
+ raise RuntimeError(
+ f"Entry block does not exist for {self.name.value}."
+ + " Do you need to call the add_entry_block() method on this GPUFuncOp?"
+ )
+ return self.body.blocks[0]
+
+ @property
+ def arguments(self) -> Sequence[Type]:
+ return self.function_type.value.inputs
diff --git a/mlir/python/mlir/dialects/openacc.py b/mlir/python/mlir/dialects/openacc.py
new file mode 100644
index 0000000..057f71a
--- /dev/null
+++ b/mlir/python/mlir/dialects/openacc.py
@@ -0,0 +1,5 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from ._acc_ops_gen import *
diff --git a/mlir/test/Conversion/XeGPUToXeVM/dpas.mlir b/mlir/test/Conversion/XeGPUToXeVM/dpas.mlir
index e6f22f0..a9ab0be 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/dpas.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/dpas.mlir
@@ -1,17 +1,13 @@
// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
-#sg_map_a_f16 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-#sg_map_b_f16 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>
-#sg_map_c_f32 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-
-gpu.module @load_store_check {
+gpu.module @test_kernel {
// CHECK-LABEL: func.func @dpas(
// CHECK-SAME: %[[ARG0:.*]]: vector<8xf16>, %[[ARG1:.*]]: vector<16xf16>, %[[ARG2:.*]]: vector<8xf32>
func.func @dpas(%a_loaded: vector<8xf16>, %b_loaded: vector<16xf16>, %c_loaded: vector<8xf32>) -> vector<8xf32> {
// Loads are checked in a separate test.
// CHECK: %[[D:.*]] = xevm.mma %[[ARG0]], %[[ARG1]], %[[ARG2]] {shape = <m = 8, n = 16, k = 16>, types = <d = f32, a = f16, b = f16, c = f32>}
// CHECK-SAME: : (vector<8xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32>
- %d = xegpu.dpas %a_loaded, %b_loaded, %c_loaded {a_layout = #sg_map_a_f16, b_layout = #sg_map_b_f16, c_layout = #sg_map_c_f32}
+ %d = xegpu.dpas %a_loaded, %b_loaded, %c_loaded
: vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
return %d : vector<8xf32>
}
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir
new file mode 100644
index 0000000..d4cb493
--- /dev/null
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir
@@ -0,0 +1,201 @@
+// RUN: mlir-opt -split-input-file -convert-xegpu-to-xevm -cse %s | FileCheck %s
+
+gpu.module @test_kernel [#xevm.target<chip = "pvc">] {
+
+ // e.g. for mem_desc<32x32xf32> with default (row-major) strides,
+ // its memory layout tuple is (blocked shape = [1,1,32,32], strides = [1024,1024,32,1])
+ //CHECK-LABEL: load_store_matrix_1
+ gpu.func @load_store_matrix_1(%arg0: memref<4096xi8, 3>) -> f32 {
+ %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
+
+ //CHECK: %[[TID:.*]] = gpu.thread_id x
+ //CHECK: %[[C1:.*]] = arith.constant 1 : index
+ //CHECK: %[[MUL1:.*]] = arith.muli %[[TID]], %[[C1]] : index
+ //CHECK: %[[C4:.*]] = arith.constant 4 : i32
+ //CHECK: %[[MUL2:.*]] = arith.muli {{.*}}, %[[C4]] : i32
+ //CHECK: llvm.load {{.*}} : !llvm.ptr<3> -> f32
+
+ %tid_x = gpu.thread_id x
+ %c0 = arith.constant 0 : index
+ %1 = xegpu.load_matrix %0[%c0, %tid_x]: !xegpu.mem_desc<32x32xf32>, index, index -> f32
+
+ //CHECK: llvm.store {{.*}}, {{.*}} : f32, !llvm.ptr<3>
+
+ xegpu.store_matrix %1, %0[%c0, %tid_x]: f32, !xegpu.mem_desc<32x32xf32>, index, index
+
+ gpu.return %1: f32
+ }
+
+ // e.g. for mem_desc<32x64xf16, @block=[16, 16], @strides=[1, 32]>,
+ // its memory layout tuple is ([2,4,16,16],[256,512,1,16])
+ //CHECK-LABEL: load_store_matrix_2
+ gpu.func @load_store_matrix_2(%arg0: memref<4096xi8, 3>) -> f16 {
+ %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>
+ //CHECK: %[[c0:.*]] = arith.constant 0 : index
+ //CHECK: %[[tid_x:.*]] = gpu.thread_id x
+ //CHECK: %[[c13:.*]] = arith.constant 13 : index
+ //CHECK: %[[c16:.*]] = arith.constant 16 : index
+ //CHECK: %[[offsetx_0:.*]] = arith.divsi %[[c13]], %[[c16]] : index
+ //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c13]], %[[c16]] : index
+ //CHECK: %[[offsety_0:.*]] = arith.divsi %[[tid_x]], %[[c16]] : index
+ //CHECK: %[[offsety_1:.*]] = arith.remsi %[[tid_x]], %[[c16]] : index
+
+ //CHECK: %[[c256:.*]] = arith.constant 256 : index
+ //CHECK: %[[mul0:.*]] = arith.muli %[[offsetx_0]], %[[c256]] : index
+ //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index
+ //CHECK: %[[c512:.*]] = arith.constant 512 : index
+ //CHECK: %[[mul1:.*]] = arith.muli %[[offsety_0]], %[[c512]] : index
+ //CHECK: %[[add1:.*]] = arith.addi %[[mul1]], %[[add0]] : index
+ //CHECK: %[[c1:.*]] = arith.constant 1 : index
+ //CHECK: %[[mul2:.*]] = arith.muli %[[offsetx_1]], %[[c1]] : index
+ //CHECK: %[[add2:.*]] = arith.addi %[[mul2]], %[[add1]] : index
+ //CHECK: %[[mul3:.*]] = arith.muli %[[offsety_1]], %[[c16]] : index
+ //CHECK: %[[add3:.*]] = arith.addi %[[mul3]], %[[add2]] : index
+
+ //CHECK: %[[loaded:.*]] = llvm.load {{.*}}: !llvm.ptr<3> -> f16
+
+
+ %tid_x = gpu.thread_id x
+ %c13 = arith.constant 13 : index
+ %1 = xegpu.load_matrix %0[%c13, %tid_x]: !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>, index, index -> f16
+
+ //CHECK: llvm.store %[[loaded]], {{.*}} : f16, !llvm.ptr<3>
+
+ xegpu.store_matrix %1, %0[%c13, %tid_x]: f16, !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>, index, index
+ gpu.return %1: f16
+ }
+
+
+ // e.g. for mem_desc<32x64xf16, @block=[16, 16]>
+ // its memory layout tuple is ([2,4,16,16],[1024,256,16,1])
+ //CHECK-LABEL: load_store_matrix_3
+ gpu.func @load_store_matrix_3(%arg0: memref<4096xi8, 3>) -> f16 {
+ //CHECK: %[[c0:.*]] = arith.constant 0 : index
+ //CHECK: %[[view:.*]] = memref.view %arg0[%[[c0]]][] : memref<4096xi8, 3> to memref<2048xf16, 3>
+ %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>
+
+ //CHECK: %[[tid_x:.*]] = gpu.thread_id x
+ //CHECK: %[[c19:.*]] = arith.constant 19 : index
+ %tid_x = gpu.thread_id x
+ %c19 = arith.constant 19: index
+
+ //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[view]] : memref<2048xf16, 3> -> index
+ //CHECK: %[[basePtrI32:.*]] = arith.index_castui %[[intptr]] : index to i32
+ //CHECK: %[[c16:.*]] = arith.constant 16 : index
+ //CHECK: %[[offsetx_0:.*]] = arith.divsi %[[c19]], %[[c16]] : index
+ //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c19]], %[[c16]] : index
+ //CHECK: %[[offsety_0:.*]] = arith.divsi %[[tid_x]], %[[c16]] : index
+ //CHECK: %[[offsety_1:.*]] = arith.remsi %[[tid_x]], %[[c16]] : index
+ //CHECK: %[[c1024:.*]] = arith.constant 1024 : index
+ //CHECK: %[[mul0:.*]] = arith.muli %[[offsetx_0]], %[[c1024]] : index
+ //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index
+ //CHECK: %[[c256:.*]] = arith.constant 256 : index
+ //CHECK: %[[mul1:.*]] = arith.muli %[[offsety_0]], %[[c256]] : index
+ //CHECK: %[[add1:.*]] = arith.addi %[[mul1]], %[[add0]] : index
+ //CHECK: %[[mul2:.*]] = arith.muli %[[offsetx_1]], %[[c16]] : index
+ //CHECK: %[[add2:.*]] = arith.addi %[[mul2]], %[[add1]] : index
+ //CHECK: %[[c1:.*]] = arith.constant 1 : index
+ //CHECK: %[[mul3:.*]] = arith.muli %[[offsety_1]], %[[c1]] : index
+ //CHECK: %[[add3:.*]] = arith.addi %[[mul3]], %[[add2]] : index
+
+ //CHECK: %[[loaded:.*]] = llvm.load {{.*}} : !llvm.ptr<3> -> f16
+ %1 = xegpu.load_matrix %0[%c19, %tid_x]: !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>, index, index -> f16
+
+ //CHECK: llvm.store %[[loaded]], {{.*}} : f16, !llvm.ptr<3>
+ xegpu.store_matrix %1, %0[%c19, %tid_x]: f16, !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>, index, index
+
+ //CHECK: gpu.return %[[loaded]] : f16
+ gpu.return %1: f16
+ }
+
+ // e.g. for mem_desc<32x64xf16, @block=[16, 16], @strides=[1, 32]>,
+ // its memory layout tuple is ([2,4,16,16],[256,512,1,16])
+ //CHECK-LABEL: load_store_matrix_4
+ gpu.func @load_store_matrix_4(%arg0: memref<4096xi8, 3>) -> vector<8xf16> {
+ %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>
+
+ //CHECK: %[[c0:.*]] = arith.constant 0 : index
+ //CHECK: %[[tid_x:.*]] = gpu.thread_id x
+
+ //CHECK: %[[c16:.*]] = arith.constant 16 : index
+ //CHECK: %[[offsetx_0:.*]] = arith.divsi %[[c16]], %[[c16]] : index
+ //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c16]], %[[c16]] : index
+ //CHECK: %[[offsety_0:.*]] = arith.divsi %[[tid_x]], %[[c16]] : index
+ //CHECK: %[[offsety_1:.*]] = arith.remsi %[[tid_x]], %[[c16]] : index
+
+ //CHECK: %[[c256:.*]] = arith.constant 256 : index
+ //CHECK: %[[mul0:.*]] = arith.muli %[[offsetx_0]], %[[c256]] : index
+ //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index
+ //CHECK: %[[c512:.*]] = arith.constant 512 : index
+ //CHECK: %[[mul1:.*]] = arith.muli %[[offsety_0]], %[[c512]] : index
+ //CHECK: %[[add1:.*]] = arith.addi %[[mul1]], %[[add0]] : index
+ //CHECK: %[[c1:.*]] = arith.constant 1 : index
+ //CHECK: %[[mul2:.*]] = arith.muli %[[offsetx_1]], %[[c1]] : index
+ //CHECK: %[[add2:.*]] = arith.addi %[[mul2]], %[[add1]] : index
+ //CHECK: %[[mul3:.*]] = arith.muli %[[offsety_1]], %[[c16]] : index
+ //CHECK: %[[add3:.*]] = arith.addi %[[mul3]], %[[add2]] : index
+
+ //CHECK: %[[loaded:.*]] = llvm.load {{.*}}: !llvm.ptr<3> -> vector<8xf16>
+
+ %tid_x = gpu.thread_id x
+ %c16 = arith.constant 16 : index
+ %1 = xegpu.load_matrix %0[%c16, %tid_x] : !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>, index, index -> vector<8xf16>
+
+ //CHECK: llvm.store %[[loaded]], {{.*}} : vector<8xf16>, !llvm.ptr<3>
+ xegpu.store_matrix %1, %0[%c16, %tid_x] : vector<8xf16>, !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<stride = [1, 32], block = [16, 16]>>, index, index
+
+ gpu.return %1: vector<8xf16>
+ }
+
+
+ // e.g. for mem_desc<32x64xf16, @block=[16, 16]>
+ // its memory layout tuple is ([2,4,16,16],[1024,256,16,1])
+ //CHECK-LABEL: load_store_matrix_5
+ gpu.func @load_store_matrix_5(%arg0: memref<4096xi8, 3>) -> vector<8xf16> {
+ //CHECK: %[[c0:.*]] = arith.constant 0 : index
+ //CHECK: %[[view:.*]] = memref.view %arg0[%[[c0]]][] : memref<4096xi8, 3> to memref<2048xf16, 3>
+
+ %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>
+
+ //CHECK: %[[c16:.*]] = arith.constant 16 : index
+ //CHECK: %[[c48:.*]] = arith.constant 48 : index
+
+ %c16 = arith.constant 16 : index
+ %c48 = arith.constant 48 : index
+
+ //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[view]] : memref<2048xf16, 3> -> index
+ //CHECK: %[[basePtrI32:.*]] = arith.index_castui %[[intptr]] : index to i32
+ //CHECK: %[[offset0:.*]] = arith.divsi %[[c16]], %[[c16]] : index
+ //CHECK: %[[offset1:.*]] = arith.remsi %[[c16]], %[[c16]] : index
+ //CHECK: %[[offset2:.*]] = arith.divsi %[[c48]], %[[c16]] : index
+ //CHECK: %[[offset3:.*]] = arith.remsi %[[c48]], %[[c16]] : index
+ //CHECK: %[[c1024:.*]] = arith.constant 1024 : index
+ //CHECK: %[[mul0:.*]] = arith.muli %[[offset0]], %[[c1024]] : index
+ //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index
+ //CHECK: %[[c256:.*]] = arith.constant 256 : index
+ //CHECK: %[[mul1:.*]] = arith.muli %[[offset2]], %[[c256]] : index
+ //CHECK: %[[add1:.*]] = arith.addi %[[mul1]], %[[add0]] : index
+ //CHECK: %[[mul2:.*]] = arith.muli %[[offset1]], %[[c16]] : index
+ //CHECK: %[[add2:.*]] = arith.addi %[[mul2]], %[[add1]] : index
+ //CHECK: %[[c1:.*]] = arith.constant 1 : index
+ //CHECK: %[[mul3:.*]] = arith.muli %[[offset3]], %[[c1]] : index
+ //CHECK: %[[linearOffset:.*]] = arith.addi %[[mul3]], %[[add2]] : index
+ //CHECK: %[[linearOffsetI64:.*]] = arith.index_castui %[[linearOffset]] : index to i32
+ //CHECK: %[[c2:.*]] = arith.constant 2 : i32
+ //CHECK: %[[byteOffset:.*]] = arith.muli %[[linearOffsetI64]], %[[c2]] : i32
+ //CHECK: %[[finalPtr:.*]] = arith.addi %[[basePtrI32]], %[[byteOffset]] : i32
+ //CHECK: %[[ptr:.*]] = llvm.inttoptr %[[finalPtr]] : i32 to !llvm.ptr<3>
+ //CHECK: %[[loadedI16:.*]] = xevm.blockload %[[ptr]] : (!llvm.ptr<3>) -> vector<8xi16>
+ //CHECK: %[[loaded:.*]] = vector.bitcast %[[loadedI16]] : vector<8xi16> to vector<8xf16>
+
+ %1 = xegpu.load_matrix %0[%c16, %c48] {subgroup_block_io}: !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>, index, index -> vector<8xf16>
+
+ //CHECK: %[[storeDataI16:.*]] = vector.bitcast %[[loaded]] : vector<8xf16> to vector<8xi16>
+ //CHECK: xevm.blockstore %[[ptr]], %[[storeDataI16]] : (!llvm.ptr<3>, vector<8xi16>)
+
+ xegpu.store_matrix %1, %0[%c16, %c48] {subgroup_block_io}: vector<8xf16>, !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout<block = [16, 16]>>, index, index
+
+ gpu.return %1: vector<8xf16>
+ }
+
+}
diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir
index e56079c..1169cd1 100644
--- a/mlir/test/Dialect/Affine/canonicalize.mlir
+++ b/mlir/test/Dialect/Affine/canonicalize.mlir
@@ -2235,6 +2235,136 @@ func.func @affine_leading_zero_no_outer_bound(%arg0: index, %arg1: index) -> ind
// -----
+// CHECK-LABEL: func @delin_apply_cancel_exact
+// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>)
+// CHECK-COUNT-6: memref.store %[[ARG0]], %[[ARG1]][%[[ARG0]]]
+// CHECK-NOT: memref.store
+// CHECK: return
+func.func @delin_apply_cancel_exact(%arg0: index, %arg1: memref<?xindex>) {
+ %a:3 = affine.delinearize_index %arg0 into (4, 5) : index, index, index
+ %b:3 = affine.delinearize_index %arg0 into (3, 4, 5) : index, index, index
+ %c:2 = affine.delinearize_index %arg0 into (20) : index, index
+
+ %t1 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 5 + s2 * 20)>()[%a#2, %a#1, %a#0]
+ memref.store %t1, %arg1[%t1] : memref<?xindex>
+
+ %t2 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s2 * 20 + s1 * 5)>()[%a#2, %a#1, %a#0]
+ memref.store %t2, %arg1[%t2] : memref<?xindex>
+
+ %t3 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 20 + s2 * 5 + s0)>()[%a#2, %a#0, %a#1]
+ memref.store %t3, %arg1[%t3] : memref<?xindex>
+
+ %t4 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 5 + s2 * 20)>()[%b#2, %b#1, %b#0]
+ memref.store %t4, %arg1[%t4] : memref<?xindex>
+
+ %t5 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 20)>()[%c#1, %c#0]
+ memref.store %t5, %arg1[%t5] : memref<?xindex>
+
+ %t6 = affine.apply affine_map<()[s0, s1] -> (s1 * 20 + s0)>()[%c#1, %c#0]
+ memref.store %t6, %arg1[%t5] : memref<?xindex>
+
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @delin_apply_cancel_exact_dim
+// CHECK: affine.for %[[arg1:.+]] = 0 to 256
+// CHECK: memref.store %[[arg1]]
+// CHECK: return
+func.func @delin_apply_cancel_exact_dim(%arg0: memref<?xindex>) {
+ affine.for %arg1 = 0 to 256 {
+ %a:3 = affine.delinearize_index %arg1 into (2, 2, 64) : index, index, index
+ %i = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 128 + d2 * 64)>(%a#2, %a#0, %a#1)
+ memref.store %i, %arg0[%i] : memref<?xindex>
+ }
+ return
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 + 512)>
+// CHECK-LABEL: func @delin_apply_cancel_const_term
+// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>)
+// CHECK: affine.apply #[[$MAP]]()[%[[ARG0]]]
+// CHECK: return
+func.func @delin_apply_cancel_const_term(%arg0: index, %arg1: memref<?xindex>) {
+ %a:3 = affine.delinearize_index %arg0 into (2, 2, 64) : index, index, index
+
+ %t1 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 128 + s2 * 64 + 512)>()[%a#2, %a#0, %a#1]
+ memref.store %t1, %arg1[%t1] : memref<?xindex>
+
+ return
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 + 512)>
+// CHECK-LABEL: func @delin_apply_cancel_var_term
+// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>, %[[ARG2:.+]]: index)
+// CHECK: affine.apply #[[$MAP]]()[%[[ARG2]], %[[ARG0]]]
+// CHECK: return
+func.func @delin_apply_cancel_var_term(%arg0: index, %arg1: memref<?xindex>, %arg2: index) {
+ %a:3 = affine.delinearize_index %arg0 into (2, 2, 64) : index, index, index
+
+ %t1 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s1 * 128 + s2 * 64 + s3 + 512)>()[%a#2, %a#0, %a#1, %arg2]
+ memref.store %t1, %arg1[%t1] : memref<?xindex>
+
+ return
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2 + s0 ceildiv 4)>
+// CHECK-LABEL: func @delin_apply_cancel_nested_exprs
+// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>)
+// CHECK: affine.apply #[[$MAP]]()[%[[ARG0]]]
+// CHECK: return
+func.func @delin_apply_cancel_nested_exprs(%arg0: index, %arg1: memref<?xindex>) {
+ %a:2 = affine.delinearize_index %arg0 into (20) : index, index
+
+ %t1 = affine.apply affine_map<()[s0, s1] -> ((s0 + s1 * 20) ceildiv 4 + (s1 * 20 + s0) * 2)>()[%a#1, %a#0]
+ memref.store %t1, %arg1[%t1] : memref<?xindex>
+
+ return
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK-LABEL: func @delin_apply_cancel_preserve_rotation
+// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>)
+// CHECK: %[[A:.+]]:2 = affine.delinearize_index %[[ARG0]] into (20)
+// CHECK: affine.apply #[[$MAP]]()[%[[A]]#1, %[[ARG0]]]
+// CHECK: return
+func.func @delin_apply_cancel_preserve_rotation(%arg0: index, %arg1: memref<?xindex>) {
+ %a:2 = affine.delinearize_index %arg0 into (20) : index, index
+
+ %t1 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 20 + s0)>()[%a#1, %a#0]
+ memref.store %t1, %arg1[%t1] : memref<?xindex>
+
+ return
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 5)>
+// CHECK-LABEL: func @delin_apply_dont_cancel_partial
+// CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: memref<?xindex>)
+// CHECK: %[[A:.+]]:3 = affine.delinearize_index %[[ARG0]] into (3, 4, 5)
+// CHECK: affine.apply #[[$MAP]]()[%[[A]]#2, %[[A]]#1]
+// CHECK: return
+func.func @delin_apply_dont_cancel_partial(%arg0: index, %arg1: memref<?xindex>) {
+ %a:3 = affine.delinearize_index %arg0 into (3, 4, 5) : index, index, index
+
+ %t1 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 5)>()[%a#2, %a#1]
+ memref.store %t1, %arg1[%t1] : memref<?xindex>
+
+ return
+}
+
+// -----
+
// CHECK-LABEL: @cst_value_to_cst_attr_basis_delinearize_index
// CHECK-SAME: (%[[ARG0:.*]]: index)
// CHECK: %[[RET:.*]]:3 = affine.delinearize_index %[[ARG0]] into (3, 4, 2) : index, index
diff --git a/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir b/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir
index 603ace8..3d4bec7 100644
--- a/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir
+++ b/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir
@@ -3,7 +3,7 @@
func.func @test_static_memref_alloc() {
%0 = memref.alloca() {test.ptr} : memref<10x20xf32>
// CHECK: Successfully generated alloc for operation: %[[ORIG:.*]] = memref.alloca() {test.ptr} : memref<10x20xf32>
- // CHECK: Generated: %{{.*}} = memref.alloca() : memref<10x20xf32>
+ // CHECK: Generated: %{{.*}} = memref.alloca() {acc.var_name = #acc.var_name<"test_alloc">} : memref<10x20xf32>
return
}
@@ -19,6 +19,6 @@ func.func @test_dynamic_memref_alloc() {
// CHECK: Generated: %[[DIM0:.*]] = memref.dim %[[ORIG]], %[[C0]] : memref<?x?xf32>
// CHECK: Generated: %[[C1:.*]] = arith.constant 1 : index
// CHECK: Generated: %[[DIM1:.*]] = memref.dim %[[ORIG]], %[[C1]] : memref<?x?xf32>
- // CHECK: Generated: %{{.*}} = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref<?x?xf32>
+ // CHECK: Generated: %{{.*}} = memref.alloc(%[[DIM0]], %[[DIM1]]) {acc.var_name = #acc.var_name<"test_alloc">} : memref<?x?xf32>
return
}
diff --git a/mlir/test/Dialect/OpenACC/recipe-populate-firstprivate.mlir b/mlir/test/Dialect/OpenACC/recipe-populate-firstprivate.mlir
index 35355c6..8846c9e 100644
--- a/mlir/test/Dialect/OpenACC/recipe-populate-firstprivate.mlir
+++ b/mlir/test/Dialect/OpenACC/recipe-populate-firstprivate.mlir
@@ -2,7 +2,7 @@
// CHECK: acc.firstprivate.recipe @firstprivate_scalar : memref<f32> init {
// CHECK: ^bb0(%{{.*}}: memref<f32>):
-// CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<f32>
+// CHECK: %[[ALLOC:.*]] = memref.alloca() {acc.var_name = #acc.var_name<"scalar">} : memref<f32>
// CHECK: acc.yield %[[ALLOC]] : memref<f32>
// CHECK: } copy {
// CHECK: ^bb0(%[[SRC:.*]]: memref<f32>, %[[DST:.*]]: memref<f32>):
@@ -20,7 +20,7 @@ func.func @test_scalar() {
// CHECK: acc.firstprivate.recipe @firstprivate_static_2d : memref<10x20xf32> init {
// CHECK: ^bb0(%{{.*}}: memref<10x20xf32>):
-// CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<10x20xf32>
+// CHECK: %[[ALLOC:.*]] = memref.alloca() {acc.var_name = #acc.var_name<"static_2d">} : memref<10x20xf32>
// CHECK: acc.yield %[[ALLOC]] : memref<10x20xf32>
// CHECK: } copy {
// CHECK: ^bb0(%[[SRC:.*]]: memref<10x20xf32>, %[[DST:.*]]: memref<10x20xf32>):
@@ -42,7 +42,7 @@ func.func @test_static_2d() {
// CHECK: %[[DIM0:.*]] = memref.dim %[[ARG]], %[[C0]] : memref<?x?xf32>
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[DIM1:.*]] = memref.dim %[[ARG]], %[[C1]] : memref<?x?xf32>
-// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref<?x?xf32>
+// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {acc.var_name = #acc.var_name<"dynamic_2d">} : memref<?x?xf32>
// CHECK: acc.yield %[[ALLOC]] : memref<?x?xf32>
// CHECK: } copy {
// CHECK: ^bb0(%[[SRC:.*]]: memref<?x?xf32>, %[[DST:.*]]: memref<?x?xf32>):
@@ -65,7 +65,7 @@ func.func @test_dynamic_2d(%arg0: index, %arg1: index) {
// CHECK: ^bb0(%[[ARG:.*]]: memref<10x?xf32>):
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[DIM1:.*]] = memref.dim %[[ARG]], %[[C1]] : memref<10x?xf32>
-// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM1]]) : memref<10x?xf32>
+// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM1]]) {acc.var_name = #acc.var_name<"mixed_dims">} : memref<10x?xf32>
// CHECK: acc.yield %[[ALLOC]] : memref<10x?xf32>
// CHECK: } copy {
// CHECK: ^bb0(%[[SRC:.*]]: memref<10x?xf32>, %[[DST:.*]]: memref<10x?xf32>):
@@ -86,7 +86,7 @@ func.func @test_mixed_dims(%arg0: index) {
// CHECK: acc.firstprivate.recipe @firstprivate_scalar_int : memref<i32> init {
// CHECK: ^bb0(%{{.*}}: memref<i32>):
-// CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<i32>
+// CHECK: %[[ALLOC:.*]] = memref.alloca() {acc.var_name = #acc.var_name<"scalar_int">} : memref<i32>
// CHECK: acc.yield %[[ALLOC]] : memref<i32>
// CHECK: } copy {
// CHECK: ^bb0(%[[SRC:.*]]: memref<i32>, %[[DST:.*]]: memref<i32>):
diff --git a/mlir/test/Dialect/OpenACC/recipe-populate-private.mlir b/mlir/test/Dialect/OpenACC/recipe-populate-private.mlir
index 8403ee8..3d5a918 100644
--- a/mlir/test/Dialect/OpenACC/recipe-populate-private.mlir
+++ b/mlir/test/Dialect/OpenACC/recipe-populate-private.mlir
@@ -2,7 +2,7 @@
// CHECK: acc.private.recipe @private_scalar : memref<f32> init {
// CHECK: ^bb0(%{{.*}}: memref<f32>):
-// CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<f32>
+// CHECK: %[[ALLOC:.*]] = memref.alloca() {acc.var_name = #acc.var_name<"scalar">} : memref<f32>
// CHECK: acc.yield %[[ALLOC]] : memref<f32>
// CHECK: }
// CHECK-NOT: destroy
@@ -16,7 +16,7 @@ func.func @test_scalar() {
// CHECK: acc.private.recipe @private_static_2d : memref<10x20xf32> init {
// CHECK: ^bb0(%{{.*}}: memref<10x20xf32>):
-// CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<10x20xf32>
+// CHECK: %[[ALLOC:.*]] = memref.alloca() {acc.var_name = #acc.var_name<"static_2d">} : memref<10x20xf32>
// CHECK: acc.yield %[[ALLOC]] : memref<10x20xf32>
// CHECK: }
// CHECK-NOT: destroy
@@ -34,7 +34,7 @@ func.func @test_static_2d() {
// CHECK: %[[DIM0:.*]] = memref.dim %[[ARG]], %[[C0]] : memref<?x?xf32>
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[DIM1:.*]] = memref.dim %[[ARG]], %[[C1]] : memref<?x?xf32>
-// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref<?x?xf32>
+// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {acc.var_name = #acc.var_name<"dynamic_2d">} : memref<?x?xf32>
// CHECK: acc.yield %[[ALLOC]] : memref<?x?xf32>
// CHECK: } destroy {
// CHECK: ^bb0(%{{.*}}: memref<?x?xf32>, %[[VAL:.*]]: memref<?x?xf32>):
@@ -53,7 +53,7 @@ func.func @test_dynamic_2d(%arg0: index, %arg1: index) {
// CHECK: ^bb0(%[[ARG:.*]]: memref<10x?xf32>):
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[DIM1:.*]] = memref.dim %[[ARG]], %[[C1]] : memref<10x?xf32>
-// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM1]]) : memref<10x?xf32>
+// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM1]]) {acc.var_name = #acc.var_name<"mixed_dims">} : memref<10x?xf32>
// CHECK: acc.yield %[[ALLOC]] : memref<10x?xf32>
// CHECK: } destroy {
// CHECK: ^bb0(%{{.*}}: memref<10x?xf32>, %[[VAL:.*]]: memref<10x?xf32>):
@@ -70,7 +70,7 @@ func.func @test_mixed_dims(%arg0: index) {
// CHECK: acc.private.recipe @private_scalar_int : memref<i32> init {
// CHECK: ^bb0(%{{.*}}: memref<i32>):
-// CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<i32>
+// CHECK: %[[ALLOC:.*]] = memref.alloca() {acc.var_name = #acc.var_name<"scalar_int">} : memref<i32>
// CHECK: acc.yield %[[ALLOC]] : memref<i32>
// CHECK: }
// CHECK-NOT: destroy
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 228ef69d..ebbe3ce 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -858,7 +858,7 @@ func.func @load_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>
// -----
func.func @load_mem_desc_invalid_result_size(%arg0: !xegpu.mem_desc<16x64xf16>) {
- // expected-error@+1 {{result shape must not exceed mem_desc shape}}
+ // expected-error@+1 {{data shape must not exceed mem_desc shape}}
%data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> vector<32x16xf16>
return
}
@@ -871,6 +871,13 @@ func.func @load_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>) {
}
// -----
+func.func @load_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>) {
+ // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}}
+ %data2 = xegpu.load_matrix %arg0[8, 8] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16> -> vector<16x16xf16>
+ return
+}
+
+// -----
func.func @store_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf32>) {
// expected-error@+1 {{failed to verify that all of {mem_desc, data} have same element type}}
xegpu.store_matrix %arg1, %arg0[8, 8] : vector<16x16xf32>, !xegpu.mem_desc<16x64xf16>
@@ -892,30 +900,16 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve
}
// -----
-func.func @mem_desc_subview_size_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) {
- // expected-error@+1 {{result shape must not exceed source shape}}
- %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<32x16xf16>
- return
-}
-
-// -----
-func.func @mem_desc_subview_layout_mismatch(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>>) {
- // expected-error@+1 {{result must inherit the source strides}}
- %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride =[1, 16]>> -> !xegpu.mem_desc<8x16xf16>
- return
-}
-
-// -----
-func.func @mem_desc_subview_element_type_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) {
- // expected-error@+1 {{failed to verify that all of {src, res} have same element type}}
- %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf32, #xegpu.mem_layout<stride =[64, 1]>>
+func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) {
+ // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}}
+ xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
return
}
// -----
-func.func @mem_desc_subview_rank_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) {
- // expected-error@+1 {{result rank must not exceed source rank}}
- %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<4x8x16xf16>
+func.func @store_mem_desc_invalid_attr3(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) {
+ // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}}
+ xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
return
}
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index bb37902..0a10f68 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -825,53 +825,73 @@ gpu.func @create_mem_desc_with_stride() {
gpu.return
}
-// CHECK: gpu.func @load_mem_desc([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>)
-gpu.func @load_mem_desc(%arg0: !xegpu.mem_desc<16x64xf16>) {
+// CHECK: gpu.func @load_matrix([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>)
+gpu.func @load_matrix(%arg0: !xegpu.mem_desc<16x64xf16>) {
// CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> vector<8x16xf16>
%data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> vector<8x16xf16>
gpu.return
}
-// CHECK: gpu.func @load_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
-gpu.func @load_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
+// CHECK: gpu.func @load_matrix_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
+gpu.func @load_matrix_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
// CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16>
%data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8x16xf16>
gpu.return
}
+// CHECK: gpu.func @simt_load_matrix(%arg0: !xegpu.mem_desc<16x64xf16>)
+gpu.func @simt_load_matrix(%arg0: !xegpu.mem_desc<16x64xf16>) {
+ // CHECK: xegpu.load_matrix [[ARG0]][8, 16] : !xegpu.mem_desc<16x64xf16> -> vector<1xf16>
+ %data = xegpu.load_matrix %arg0[8, 16]: !xegpu.mem_desc<16x64xf16> -> vector<1xf16>
+ gpu.return
+}
+
+// CHECK: gpu.func @simt_load_matrix_subgroup_block_io(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>)
+gpu.func @simt_load_matrix_subgroup_block_io(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>) {
+ // CHECK: xegpu.load_matrix [[ARG0]][8, 16] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>> -> vector<8xf16>
+ %data = xegpu.load_matrix %arg0[8, 16] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>> -> vector<8xf16>
+ gpu.return
+}
+
+// CHECK: gpu.func @simt_load_matrix_vector(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
+gpu.func @simt_load_matrix_vector(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
+ // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8xf16>
+ %data = xegpu.load_matrix %arg0[8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> vector<8xf16>
+ gpu.return
+}
-// CHECK: gpu.func @store_mem_desc([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
-gpu.func @store_mem_desc(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
+// CHECK: gpu.func @store_matrix([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_matrix(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
// CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
gpu.return
}
-// CHECK: gpu.func @store_mem_desc_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
-gpu.func @store_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<16x16xf16>) {
+// CHECK: gpu.func @store_matrix_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_matrix_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<16x16xf16>) {
// CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][0, 8] : vector<16x16xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
xegpu.store_matrix %arg1, %arg0[0, 8]: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
gpu.return
}
-// CHECK: gpu.func @mem_desc_subview([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>)
-gpu.func @mem_desc_subview(%arg0: !xegpu.mem_desc<16x64xf16>) {
- //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
- %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+// CHECK: gpu.func @simt_store_matrix(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<1xf16>) {
+gpu.func @simt_store_matrix(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<1xf16>) {
+ // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 16] : vector<1xf16>, !xegpu.mem_desc<16x64xf16>
+ xegpu.store_matrix %arg1, %arg0[8, 16]: vector<1xf16>, !xegpu.mem_desc<16x64xf16>
gpu.return
}
-// CHECK: gpu.func @mem_desc_subview_lower_rank([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>)
-gpu.func @mem_desc_subview_lower_rank(%arg0: !xegpu.mem_desc<16x64xf16>) {
- //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
- %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<16xf16, #xegpu.mem_layout<stride = [64, 1]>>
+// CHECK: gpu.func @simt_store_matrix_subgroup_block_io(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>, %arg1: vector<8xf16>)
+gpu.func @simt_store_matrix_subgroup_block_io(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>, %arg1: vector<8xf16>) {
+ // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 16] <{subgroup_block_io}>: vector<8xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>
+ xegpu.store_matrix %arg1, %arg0[8, 16] <{subgroup_block_io}>: vector<8xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<block = [16, 16]>>
gpu.return
}
-// CHECK: gpu.func @mem_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>)
-gpu.func @mem_desc_subview_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>) {
- //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
- %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout<stride = [1, 16]>>
+// CHECK: gpu.func @simt_store_matrix_vector(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<8xf16>) {
+gpu.func @simt_store_matrix_vector(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>, %arg1: vector<8xf16>) {
+ // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<8xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
+ xegpu.store_matrix %arg1, %arg0[8, 8] : vector<8xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout<stride = [1, 16]>>
gpu.return
}
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
index 97fc699..496f18b 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
@@ -938,10 +938,10 @@ public:
// These are automatically generated by ODS but are not used as the Transform
// dialect uses a different dispatch mechanism to support dialect extensions.
-LLVM_ATTRIBUTE_UNUSED static OptionalParseResult
+[[maybe_unused]] static OptionalParseResult
generatedTypeParser(AsmParser &parser, StringRef *mnemonic, Type &value);
-LLVM_ATTRIBUTE_UNUSED static LogicalResult
-generatedTypePrinter(Type def, AsmPrinter &printer);
+[[maybe_unused]] static LogicalResult generatedTypePrinter(Type def,
+ AsmPrinter &printer);
#define GET_TYPEDEF_CLASSES
#include "TestTransformDialectExtensionTypes.cpp.inc"
diff --git a/mlir/test/mlir-pdll/CodeGen/CPP/general.pdll b/mlir/test/mlir-pdll/CodeGen/CPP/general.pdll
index 4e869e5..4be30d8 100644
--- a/mlir/test/mlir-pdll/CodeGen/CPP/general.pdll
+++ b/mlir/test/mlir-pdll/CodeGen/CPP/general.pdll
@@ -28,7 +28,7 @@
// CHECK: operation "test.op3"
// CHECK: )mlir", context), std::forward<ConfigsT>(configs)...)
-// CHECK: static void LLVM_ATTRIBUTE_UNUSED populateGeneratedPDLLPatterns(::mlir::RewritePatternSet &patterns, ConfigsT &&...configs) {
+// CHECK{LITERAL}: [[maybe_unused]] static void populateGeneratedPDLLPatterns(::mlir::RewritePatternSet &patterns, ConfigsT &&...configs) {
// CHECK-NEXT: patterns.add<GeneratedPDLLPattern0>(patterns.getContext(), configs...);
// CHECK-NEXT: patterns.add<NamedPattern>(patterns.getContext(), configs...);
// CHECK-NEXT: patterns.add<GeneratedPDLLPattern1>(patterns.getContext(), configs...);
diff --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py
index 26ee9f3..66c4018 100644
--- a/mlir/test/python/dialects/gpu/dialect.py
+++ b/mlir/test/python/dialects/gpu/dialect.py
@@ -1,6 +1,7 @@
# RUN: %PYTHON %s | FileCheck %s
from mlir.ir import *
+import mlir.ir as ir
import mlir.dialects.gpu as gpu
import mlir.dialects.gpu.passes
from mlir.passmanager import *
@@ -64,3 +65,95 @@ def testObjectAttr():
# CHECK: #gpu.object<#nvvm.target, kernels = <[#gpu.kernel_metadata<"kernel", () -> ()>]>, "BC\C0\DE5\14\00\00\05\00\00\00b\0C0$MY\BEf">
print(o)
assert o.kernels == kernelTable
+
+
+# CHECK-LABEL: testGPUFuncOp
+@run
+def testGPUFuncOp():
+ assert gpu.GPUFuncOp.__doc__ is not None
+ module = Module.create()
+ with InsertionPoint(module.body):
+ gpu_module_name = StringAttr.get("gpu_module")
+ gpumodule = gpu.GPUModuleOp(gpu_module_name)
+ block = gpumodule.bodyRegion.blocks.append()
+
+ def builder(func: gpu.GPUFuncOp) -> None:
+ gpu.GlobalIdOp(gpu.Dimension.x)
+ gpu.ReturnOp([])
+
+ with InsertionPoint(block):
+ name = StringAttr.get("kernel0")
+ func_type = ir.FunctionType.get(inputs=[], results=[])
+ type_attr = TypeAttr.get(func_type)
+ func = gpu.GPUFuncOp(type_attr, name)
+ func.attributes["sym_name"] = name
+ func.attributes["gpu.kernel"] = UnitAttr.get()
+
+ try:
+ func.entry_block
+ assert False, "Expected RuntimeError"
+ except RuntimeError as e:
+ assert (
+ str(e)
+ == "Entry block does not exist for kernel0. Do you need to call the add_entry_block() method on this GPUFuncOp?"
+ )
+
+ block = func.add_entry_block()
+ with InsertionPoint(block):
+ builder(func)
+
+ try:
+ func.add_entry_block()
+ assert False, "Expected RuntimeError"
+ except RuntimeError as e:
+ assert str(e) == "Entry block already exists for kernel0"
+
+ func = gpu.GPUFuncOp(
+ func_type,
+ sym_name="kernel1",
+ kernel=True,
+ body_builder=builder,
+ known_block_size=[1, 2, 3],
+ known_grid_size=DenseI32ArrayAttr.get([4, 5, 6]),
+ )
+
+ assert func.name.value == "kernel1"
+ assert func.function_type.value == func_type
+ assert func.arg_attrs is None
+ assert func.res_attrs is None
+ assert func.arguments == []
+ assert func.entry_block == func.body.blocks[0]
+ assert func.is_kernel
+ assert func.known_block_size == DenseI32ArrayAttr.get(
+ [1, 2, 3]
+ ), func.known_block_size
+ assert func.known_grid_size == DenseI32ArrayAttr.get(
+ [4, 5, 6]
+ ), func.known_grid_size
+
+ func = gpu.GPUFuncOp(
+ func_type,
+ sym_name="non_kernel_func",
+ body_builder=builder,
+ )
+ assert not func.is_kernel
+ assert func.known_block_size is None
+ assert func.known_grid_size is None
+
+ print(module)
+
+ # CHECK: gpu.module @gpu_module
+ # CHECK: gpu.func @kernel0() kernel {
+ # CHECK: %[[VAL_0:.*]] = gpu.global_id x
+ # CHECK: gpu.return
+ # CHECK: }
+ # CHECK: gpu.func @kernel1() kernel attributes
+ # CHECK-SAME: known_block_size = array<i32: 1, 2, 3>
+ # CHECK-SAME: known_grid_size = array<i32: 4, 5, 6>
+ # CHECK: %[[VAL_0:.*]] = gpu.global_id x
+ # CHECK: gpu.return
+ # CHECK: }
+ # CHECK: gpu.func @non_kernel_func() {
+ # CHECK: %[[VAL_0:.*]] = gpu.global_id x
+ # CHECK: gpu.return
+ # CHECK: }
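
For readers skimming the test above, a minimal sketch of the builder path it exercises. This is an illustration distilled from the test itself, not additional API surface; every name used here (GPUModuleOp, GPUFuncOp, body_builder, known_block_size) appears in the test, and "demo_module"/"demo_kernel" are placeholder names.

    # Minimal sketch, assuming the in-tree mlir Python bindings with the gpu
    # dialect available; mirrors the happy path of testGPUFuncOp above.
    from mlir.ir import (Context, Location, Module, InsertionPoint,
                         StringAttr, FunctionType)
    import mlir.dialects.gpu as gpu

    with Context(), Location.unknown():
        module = Module.create()
        with InsertionPoint(module.body):
            gpu_module = gpu.GPUModuleOp(StringAttr.get("demo_module"))
            block = gpu_module.bodyRegion.blocks.append()
            with InsertionPoint(block):
                # body_builder runs with the insertion point already set
                # inside the freshly created entry block.
                func = gpu.GPUFuncOp(
                    FunctionType.get(inputs=[], results=[]),
                    sym_name="demo_kernel",
                    kernel=True,
                    body_builder=lambda f: gpu.ReturnOp([]),
                    known_block_size=[1, 2, 3],
                )
        print(module)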
diff --git a/mlir/test/python/dialects/openacc.py b/mlir/test/python/dialects/openacc.py
new file mode 100644
index 0000000..8f2142a
--- /dev/null
+++ b/mlir/test/python/dialects/openacc.py
@@ -0,0 +1,170 @@
+# RUN: %PYTHON %s | FileCheck %s
+from mlir.ir import (
+ Context,
+ FunctionType,
+ Location,
+ Module,
+ InsertionPoint,
+ IntegerType,
+ IndexType,
+ MemRefType,
+ F32Type,
+ Block,
+ ArrayAttr,
+ Attribute,
+ UnitAttr,
+ StringAttr,
+ DenseI32ArrayAttr,
+ ShapedType,
+)
+from mlir.dialects import openacc, func, arith, memref
+from mlir.extras import types
+
+
+def run(f):
+ print("\n// TEST:", f.__name__)
+ with Context(), Location.unknown():
+ f()
+ return f
+
+
+@run
+def testParallelMemcpy():
+ module = Module.create()
+
+ dynamic = ShapedType.get_dynamic_size()
+ memref_f32_1d_any = MemRefType.get([dynamic], types.f32())
+
+ with InsertionPoint(module.body):
+ function_type = FunctionType.get(
+ [memref_f32_1d_any, memref_f32_1d_any, types.i64()], []
+ )
+ f = func.FuncOp(
+ type=function_type,
+ name="memcpy_idiom",
+ )
+ f.attributes["sym_visibility"] = StringAttr.get("public")
+
+ with InsertionPoint(f.add_entry_block()):
+ c1024 = arith.ConstantOp(types.i32(), 1024)
+ c128 = arith.ConstantOp(types.i32(), 128)
+
+ arg0, arg1, arg2 = f.arguments
+
+ copied = openacc.copyin(
+ acc_var=arg0.type,
+ var=arg0,
+ var_type=types.f32(),
+ bounds=[],
+ async_operands=[],
+ implicit=False,
+ structured=True,
+ )
+ created = openacc.create_(
+ acc_var=arg1.type,
+ var=arg1,
+ var_type=types.f32(),
+ bounds=[],
+ async_operands=[],
+ implicit=False,
+ structured=True,
+ )
+
+ parallel_op = openacc.ParallelOp(
+ asyncOperands=[],
+ waitOperands=[],
+ numGangs=[c1024],
+ numWorkers=[],
+ vectorLength=[c128],
+ reductionOperands=[],
+ privateOperands=[],
+ firstprivateOperands=[],
+ dataClauseOperands=[],
+ )
+
+ # Set the required device_type and segment attributes to satisfy the verifier
+ acc_device_none = ArrayAttr.get([Attribute.parse("#acc.device_type<none>")])
+ parallel_op.numGangsDeviceType = acc_device_none
+ parallel_op.numGangsSegments = DenseI32ArrayAttr.get([1])
+ parallel_op.vectorLengthDeviceType = acc_device_none
+
+ parallel_block = Block.create_at_start(parent=parallel_op.region, arg_types=[])
+
+ with InsertionPoint(parallel_block):
+ c0 = arith.ConstantOp(types.i64(), 0)
+ c1 = arith.ConstantOp(types.i64(), 1)
+
+ loop_op = openacc.LoopOp(
+ results_=[],
+ lowerbound=[c0],
+ upperbound=[f.arguments[2]],
+ step=[c1],
+ gangOperands=[],
+ workerNumOperands=[],
+ vectorOperands=[],
+ tileOperands=[],
+ cacheOperands=[],
+ privateOperands=[],
+ reductionOperands=[],
+ firstprivateOperands=[],
+ )
+
+ # Set loop attributes: gang and independent on device_type<none>
+ acc_device_none = ArrayAttr.get([Attribute.parse("#acc.device_type<none>")])
+ loop_op.gang = acc_device_none
+ loop_op.independent = acc_device_none
+
+ loop_block = Block.create_at_start(
+ parent=loop_op.region, arg_types=[types.i64()]
+ )
+
+ with InsertionPoint(loop_block):
+ idx = arith.index_cast(out=IndexType.get(), in_=loop_block.arguments[0])
+ val = memref.load(memref=copied, indices=[idx])
+ memref.store(value=val, memref=created, indices=[idx])
+ openacc.YieldOp([])
+
+ openacc.YieldOp([])
+
+ openacc.delete(
+ acc_var=copied,
+ bounds=[],
+ async_operands=[],
+ implicit=False,
+ structured=True,
+ )
+ openacc.copyout(
+ acc_var=created,
+ var=arg1,
+ var_type=types.f32(),
+ bounds=[],
+ async_operands=[],
+ implicit=False,
+ structured=True,
+ )
+ func.ReturnOp([])
+
+ print(module)
+
+ # CHECK: TEST: testParallelMemcpy
+ # CHECK-LABEL: func.func public @memcpy_idiom(
+ # CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>, %[[ARG2:.*]]: i64) {
+ # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1024 : i32
+ # CHECK: %[[CONSTANT_1:.*]] = arith.constant 128 : i32
+ # CHECK: %[[COPYIN_0:.*]] = acc.copyin varPtr(%[[ARG0]] : memref<?xf32>) -> memref<?xf32>
+ # CHECK: %[[CREATE_0:.*]] = acc.create varPtr(%[[ARG1]] : memref<?xf32>) -> memref<?xf32>
+ # CHECK: acc.parallel num_gangs({%[[CONSTANT_0]] : i32}) vector_length(%[[CONSTANT_1]] : i32) {
+ # CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : i64
+ # CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : i64
+ # CHECK: acc.loop gang control(%[[VAL_0:.*]] : i64) = (%[[CONSTANT_2]] : i64) to (%[[ARG2]] : i64) step (%[[CONSTANT_3]] : i64) {
+ # CHECK: %[[INDEX_CAST_0:.*]] = arith.index_cast %[[VAL_0]] : i64 to index
+ # CHECK: %[[LOAD_0:.*]] = memref.load %[[COPYIN_0]]{{\[}}%[[INDEX_CAST_0]]] : memref<?xf32>
+ # CHECK: memref.store %[[LOAD_0]], %[[CREATE_0]]{{\[}}%[[INDEX_CAST_0]]] : memref<?xf32>
+ # CHECK: acc.yield
+ # CHECK: } attributes {independent = [#acc.device_type<none>]}
+ # CHECK: acc.yield
+ # CHECK: }
+ # CHECK: acc.delete accPtr(%[[COPYIN_0]] : memref<?xf32>)
+ # CHECK: acc.copyout accPtr(%[[CREATE_0]] : memref<?xf32>) to varPtr(%[[ARG1]] : memref<?xf32>)
+ # CHECK: return
+ # CHECK: }
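
For orientation, the structured data-movement idiom that testParallelMemcpy builds reduces to the pairing sketched below. It is a sketch using only call signatures that appear in the test above (note that copyin/create_ take the result *type* as acc_var, while delete takes the SSA value); structured_copy_clauses is a hypothetical helper name.

    # Sketch of the acc.copyin / acc.create ... acc.delete / acc.copyout
    # pairing from testParallelMemcpy; src/dst are memref-typed SSA values
    # and elem_ty is their element type.
    from mlir.dialects import openacc

    def structured_copy_clauses(src, dst, elem_ty):
        # Structured-region entry: stage the input and the output buffer.
        copied = openacc.copyin(acc_var=src.type, var=src, var_type=elem_ty,
                                bounds=[], async_operands=[],
                                implicit=False, structured=True)
        created = openacc.create_(acc_var=dst.type, var=dst, var_type=elem_ty,
                                  bounds=[], async_operands=[],
                                  implicit=False, structured=True)
        # ... compute region consuming `copied` / `created` goes here ...
        # Structured-region exit: release the input, publish the output.
        openacc.delete(acc_var=copied, bounds=[], async_operands=[],
                       implicit=False, structured=True)
        openacc.copyout(acc_var=created, var=dst, var_type=elem_ty,
                        bounds=[], async_operands=[],
                        implicit=False, structured=True)
        return copied, created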
diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py
index cb4cfc8c..1d4ede1 100644
--- a/mlir/test/python/ir/operation.py
+++ b/mlir/test/python/ir/operation.py
@@ -569,12 +569,30 @@ def testOperationAttributes():
# CHECK: Attribute value b'text'
print(f"Attribute value {sattr.value_bytes}")
+ # Python dict-style iteration
# We don't know in which order the attributes are stored.
- # CHECK-DAG: NamedAttribute(dependent="text")
- # CHECK-DAG: NamedAttribute(other.attribute=3.000000e+00 : f64)
- # CHECK-DAG: NamedAttribute(some.attribute=1 : i8)
- for attr in op.attributes:
- print(str(attr))
+ # CHECK-DAG: dependent
+ # CHECK-DAG: other.attribute
+ # CHECK-DAG: some.attribute
+ for name in op.attributes:
+ print(name)
+
+ # Basic dict-like introspection
+ # CHECK: True
+ print("some.attribute" in op.attributes)
+ # CHECK: False
+ print("missing" in op.attributes)
+ # CHECK: Keys: ['dependent', 'other.attribute', 'some.attribute']
+ print("Keys:", sorted(op.attributes.keys()))
+ # CHECK: Values count 3
+ print("Values count", len(op.attributes.values()))
+ # CHECK: Items count 3
+ print("Items count", len(op.attributes.items()))
+
+ # Dict() conversion test
+ d = {k: v.value for k, v in dict(op.attributes).items()}
+ # CHECK: Dict mapping {'dependent': 'text', 'other.attribute': 3.0, 'some.attribute': 1}
+ print("Dict mapping", d)
# Check that exceptions are raised as expected.
try:
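
The mapping-style behaviors exercised above amount to the access pattern sketched here; a sketch assuming any operation with named attributes, with summarize_attributes as a hypothetical helper name.

    # Sketch: op.attributes supports the read-only mapping protocol shown in
    # the test -- iteration yields attribute *names*, not NamedAttributes.
    def summarize_attributes(op):
        attrs = op.attributes
        names = sorted(attrs.keys())   # also: len(attrs), "name" in attrs
        as_dict = dict(attrs)          # name -> Attribute
        return {name: as_dict[name] for name in names}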
diff --git a/mlir/tools/mlir-tblgen/LLVMIRConversionGen.cpp b/mlir/tools/mlir-tblgen/LLVMIRConversionGen.cpp
index 96af14d..11a2db4 100644
--- a/mlir/tools/mlir-tblgen/LLVMIRConversionGen.cpp
+++ b/mlir/tools/mlir-tblgen/LLVMIRConversionGen.cpp
@@ -416,7 +416,7 @@ static void emitOneEnumToConversion(const Record *record, raw_ostream &os) {
// Emit the function converting the enum attribute to its LLVM counterpart.
os << formatv(
- "static LLVM_ATTRIBUTE_UNUSED {0} convert{1}ToLLVM({2}::{1} value) {{\n",
+ "[[maybe_unused]] static {0} convert{1}ToLLVM({2}::{1} value) {{\n",
llvmClass, cppClassName, cppNamespace);
os << " switch (value) {\n";
@@ -444,7 +444,7 @@ static void emitOneCEnumToConversion(const Record *record, raw_ostream &os) {
StringRef cppNamespace = enumAttr.getCppNamespace();
// Emit the function converting the enum attribute to its LLVM counterpart.
- os << formatv("static LLVM_ATTRIBUTE_UNUSED int64_t "
+ os << formatv("[[maybe_unused]] static int64_t "
"convert{0}ToLLVM({1}::{0} value) {{\n",
cppClassName, cppNamespace);
os << " switch (value) {\n";
@@ -474,7 +474,7 @@ static void emitOneEnumFromConversion(const Record *record, raw_ostream &os) {
StringRef cppNamespace = enumInfo.getCppNamespace();
// Emit the function converting the enum attribute from its LLVM counterpart.
- os << formatv("inline LLVM_ATTRIBUTE_UNUSED {0}::{1} convert{1}FromLLVM({2} "
+ os << formatv("[[maybe_unused]] inline {0}::{1} convert{1}FromLLVM({2} "
"value) {{\n",
cppNamespace, cppClassName, llvmClass);
os << " switch (value) {\n";
@@ -509,10 +509,9 @@ static void emitOneCEnumFromConversion(const Record *record, raw_ostream &os) {
StringRef cppNamespace = enumInfo.getCppNamespace();
// Emit the function converting the enum attribute from its LLVM counterpart.
- os << formatv(
- "inline LLVM_ATTRIBUTE_UNUSED {0}::{1} convert{1}FromLLVM(int64_t "
- "value) {{\n",
- cppNamespace, cppClassName);
+ os << formatv("[[maybe_unused]] inline {0}::{1} convert{1}FromLLVM(int64_t "
+ "value) {{\n",
+ cppNamespace, cppClassName);
os << " switch (value) {\n";
for (const auto &enumerant : enumInfo.getAllCases()) {
diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp
index 40bc1a9..c3034bb8 100644
--- a/mlir/tools/mlir-tblgen/RewriterGen.cpp
+++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp
@@ -2120,7 +2120,7 @@ static void emitRewriters(const RecordKeeper &records, raw_ostream &os) {
}
// Emit function to add the generated matchers to the pattern list.
- os << "void LLVM_ATTRIBUTE_UNUSED populateWithGenerated("
+ os << "[[maybe_unused]] void populateWithGenerated("
"::mlir::RewritePatternSet &patterns) {\n";
for (const auto &name : rewriterNames) {
os << " patterns.add<" << name << ">(patterns.getContext());\n";
diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
index fe26fc1..2a58305 100644
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -113,8 +113,7 @@ static Match tensorMatch(TensorId tid) { return Match(tid); }
static Match synZeroMatch() { return Match(); }
#define IMPL_BINOP_PATTERN(OP, KIND) \
- LLVM_ATTRIBUTE_UNUSED static Match OP##Match(const Match &e0, \
- const Match &e1) { \
+ [[maybe_unused]] static Match OP##Match(const Match &e0, const Match &e1) { \
return Match(KIND, e0, e1); \
}
FOREVERY_BINOP(IMPL_BINOP_PATTERN)
diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
index 102c416..0a84bd2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
@@ -633,6 +633,34 @@ filegroup(
)
##---------------------------------------------------------------------------##
+# OpenACC dialect.
+##---------------------------------------------------------------------------##
+
+gentbl_filegroup(
+ name = "OpenACCOpsPyGen",
+ tbl_outs = {"mlir/dialects/_acc_ops_gen.py": [
+ "-gen-python-op-bindings",
+ "-bind-dialect=acc",
+ ]},
+ tblgen = "//mlir:mlir-tblgen",
+ td_file = "mlir/dialects/OpenACCOps.td",
+ deps = [
+ "//mlir:BuiltinDialectTdFiles",
+ "//mlir:OpBaseTdFiles",
+ "//mlir:OpenAccOpsTdFiles",
+ "//mlir:SideEffectInterfacesTdFiles",
+ ],
+)
+
+filegroup(
+ name = "OpenACCOpsPyFiles",
+ srcs = [
+ "mlir/dialects/openacc.py",
+ ":OpenACCOpsPyGen",
+ ],
+)
+
+##---------------------------------------------------------------------------##
# OpenMP dialect.
##---------------------------------------------------------------------------##