199 files changed, 8858 insertions, 1490 deletions
diff --git a/.ci/premerge_advisor_upload.py b/.ci/premerge_advisor_upload.py
new file mode 100644
index 0000000..84214f8
--- /dev/null
+++ b/.ci/premerge_advisor_upload.py
@@ -0,0 +1,56 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Script for uploading results to the premerge advisor."""
+
+import argparse
+import os
+import platform
+import sys
+
+import requests
+
+import generate_test_report_lib
+
+PREMERGE_ADVISOR_URL = (
+    "http://premerge-advisor.premerge-advisor.svc.cluster.local:5000/upload"
+)
+
+
+def main(commit_sha, workflow_run_number, build_log_files):
+    junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files(
+        build_log_files
+    )
+    test_failures = generate_test_report_lib.get_failures(junit_objects)
+    source = "pull_request" if "GITHUB_ACTIONS" in os.environ else "postcommit"
+    failure_info = {
+        "source_type": source,
+        "base_commit_sha": commit_sha,
+        "source_id": workflow_run_number,
+        "failures": [],
+    }
+    if test_failures:
+        for name, failure_message in test_failures:
+            failure_info["failures"].append({"name": name, "message": failure_message})
+    else:
+        ninja_failures = generate_test_report_lib.find_failure_in_ninja_logs(ninja_logs)
+        for name, failure_message in ninja_failures:
+            failure_info["failures"].append({"name": name, "message": failure_message})
+    requests.post(PREMERGE_ADVISOR_URL, json=failure_info)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("commit_sha", help="The base commit SHA for the test.")
+    parser.add_argument("workflow_run_number", help="The run number from GHA.")
+    parser.add_argument(
+        "build_log_files", help="Paths to JUnit report files and ninja logs.", nargs="*"
+    )
+    args = parser.parse_args()
+
+    # Skip uploading results on AArch64 for now because the premerge advisor
+    # service is not available on AWS currently.
+    if platform.machine() == "arm64":
+        sys.exit(0)
+
+    main(args.commit_sha, args.workflow_run_number, args.build_log_files)
diff --git a/.ci/utils.sh b/.ci/utils.sh
index 5d32968..9aefcf2 100644
--- a/.ci/utils.sh
+++ b/.ci/utils.sh
@@ -38,6 +38,12 @@ function at-exit {
       $retcode "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log \
       >> $GITHUB_STEP_SUMMARY
   fi
+
+  if [[ "$retcode" != "0" ]]; then
+    python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \
+      $(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \
+      "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log
+  fi
 }
 trap at-exit EXIT
 
diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp
index af4cc9d..bc37ced 100644
--- a/bolt/unittests/Core/MCPlusBuilder.cpp
+++ b/bolt/unittests/Core/MCPlusBuilder.cpp
@@ -261,6 +261,82 @@ TEST_P(MCPlusBuilderTester, testAccessedRegsMultipleDefs) {
                 {AArch64::W5, AArch64::X5, AArch64::W5_HI});
 }
 
+TEST_P(MCPlusBuilderTester, AArch64_Psign_Pauth_variants) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+
+  MCInst Paciasp = MCInstBuilder(AArch64::PACIASP);
+  MCInst Pacibsp = MCInstBuilder(AArch64::PACIBSP);
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(Paciasp));
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(Pacibsp));
+
+  MCInst PaciaSPLR =
+      MCInstBuilder(AArch64::PACIA).addReg(AArch64::LR).addReg(AArch64::SP);
+  MCInst PacibSPLR =
+      MCInstBuilder(AArch64::PACIB).addReg(AArch64::LR).addReg(AArch64::SP);
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(PaciaSPLR));
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(PacibSPLR));
+
+  MCInst PacizaX5 = MCInstBuilder(AArch64::PACIZA).addReg(AArch64::X5);
+  MCInst PacizbX5 = MCInstBuilder(AArch64::PACIZB).addReg(AArch64::X5);
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(PacizaX5));
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(PacizbX5));
+
+  MCInst Paciaz = MCInstBuilder(AArch64::PACIZA).addReg(AArch64::LR);
+  MCInst Pacibz = MCInstBuilder(AArch64::PACIZB).addReg(AArch64::LR);
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(Paciaz));
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(Pacibz));
+
+  MCInst Pacia1716 = MCInstBuilder(AArch64::PACIA1716);
+  MCInst Pacib1716 = MCInstBuilder(AArch64::PACIB1716);
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacia1716));
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacib1716));
+
+  MCInst Pacia171615 = MCInstBuilder(AArch64::PACIA171615);
+  MCInst Pacib171615 = MCInstBuilder(AArch64::PACIB171615);
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacia171615));
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacib171615));
+
+  MCInst Autiasp = MCInstBuilder(AArch64::AUTIASP);
+  MCInst Autibsp = MCInstBuilder(AArch64::AUTIBSP);
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autiasp));
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autibsp));
+
+  MCInst AutiaSPLR =
+      MCInstBuilder(AArch64::AUTIA).addReg(AArch64::LR).addReg(AArch64::SP);
+  MCInst AutibSPLR =
+      MCInstBuilder(AArch64::AUTIB).addReg(AArch64::LR).addReg(AArch64::SP);
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(AutiaSPLR));
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(AutibSPLR));
+
+  MCInst AutizaX5 = MCInstBuilder(AArch64::AUTIZA).addReg(AArch64::X5);
+  MCInst AutizbX5 = MCInstBuilder(AArch64::AUTIZB).addReg(AArch64::X5);
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(AutizaX5));
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(AutizbX5));
+
+  MCInst Autiaz = MCInstBuilder(AArch64::AUTIZA).addReg(AArch64::LR);
+  MCInst Autibz = MCInstBuilder(AArch64::AUTIZB).addReg(AArch64::LR);
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autiaz));
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autibz));
+
+  MCInst Autia1716 = MCInstBuilder(AArch64::AUTIA1716);
+  MCInst Autib1716 = MCInstBuilder(AArch64::AUTIB1716);
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autia1716));
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autib1716));
+
+  MCInst Autia171615 = MCInstBuilder(AArch64::AUTIA171615);
+  MCInst Autib171615 = MCInstBuilder(AArch64::AUTIB171615);
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autia171615));
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autib171615));
+
+  MCInst Retaa = MCInstBuilder(AArch64::RETAA);
+  MCInst Retab = MCInstBuilder(AArch64::RETAB);
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Retaa));
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Retab));
+  ASSERT_TRUE(BC->MIB->isPAuthAndRet(Retaa));
+  ASSERT_TRUE(BC->MIB->isPAuthAndRet(Retab));
+}
+
 #endif // AARCH64_AVAILABLE
 
 #ifdef X86_AVAILABLE
diff --git a/clang-tools-extra/clang-tidy/.clang-format b/clang-tools-extra/clang-tidy/.clang-format
index d18cf7c..5b50661 100644
--- a/clang-tools-extra/clang-tidy/.clang-format
+++ b/clang-tools-extra/clang-tidy/.clang-format
@@ -1,2 +1,3 @@
 BasedOnStyle: LLVM
 QualifierAlignment: Left
+LineEnding: LF
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index d942578..dcfa4e3 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -205,6 +205,50 @@ pointers with a specified address space. If the option is set to false, then
 reports from the specific x86 address spaces 256, 257 and 258 are still
 suppressed, but null dereferences from other address spaces are reported.
 
+.. _core-NullPointerArithm:
+
+core.NullPointerArithm (C, C++)
+"""""""""""""""""""""""""""""""
+Check for undefined arithmetic operations with null pointers.
+
+The checker can detect the following cases:
+
+  - ``p + x`` and ``x + p`` where ``p`` is a null pointer and ``x`` is a nonzero
+    integer value.
+  - ``p - x`` where ``p`` is a null pointer and ``x`` is a nonzero integer
+    value.
+  - ``p1 - p2`` where one of ``p1`` and ``p2`` is null and the other a
+    non-null pointer.
+
+Result of these operations is undefined according to the standard.
+In the above listed cases, the checker will warn even if the expression
+described to be "nonzero" or "non-null" has unknown value, because it is likely
+that it can have non-zero value during the program execution.
+
+.. code-block:: c
+
+ void test1(int *p, int offset) {
+   if (p)
+     return;
+
+   int *p1 = p + offset; // warn: 'p' is null, 'offset' is unknown but likely non-zero
+ }
+
+ void test2(int *p, int offset) {
+   if (p) { } // this indicates that it is possible for 'p' to be null
+   if (offset == 0)
+     return;
+
+   int *p1 = p - offset; // warn: 'p' is null, 'offset' is known to be non-zero
+ }
+
+ void test3(char *p1, char *p2) {
+   if (p1)
+     return;
+
+   int a = p1 - p2; // warn: 'p1' is null, 'p2' can be likely non-null
+ }
+
 .. _core-StackAddressEscape:
 
 core.StackAddressEscape (C)
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 9e85008..5f70b51 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -243,6 +243,7 @@ ENUM_LANGOPT(HLSLVersion, HLSLLangStd, 16, HLSL_Unset, NotCompatible, "HLSL Vers
 LANGOPT(HLSLStrictAvailability, 1, 0, NotCompatible,
         "Strict availability diagnostic mode for HLSL built-in functions.")
 LANGOPT(HLSLSpvUseUnknownImageFormat, 1, 0, NotCompatible, "For storage images and texel buffers, sets the default format to 'Unknown' when not specified via the `vk::image_format` attribute. If this option is not used, the format is inferred from the resource's data type.")
+LANGOPT(HLSLSpvEnableMaximalReconvergence, 1, 0, NotCompatible, "Enables the MaximallyReconvergesKHR execution mode for this module. This ensures that control flow reconverges at well-defined merge points as defined by the Vulkan spec.")
 
 LANGOPT(CUDAIsDevice      , 1, 0, NotCompatible, "compiling for CUDA device")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, NotCompatible, "treating unattributed constexpr functions as __host__ __device__")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 75c275b..7ae153d 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -9601,6 +9601,15 @@ def fhlsl_spv_use_unknown_image_format
                "from the resource's data type.">,
       MarshallingInfoFlag<LangOpts<"HLSLSpvUseUnknownImageFormat">>;
 
+def fhlsl_spv_enable_maximal_reconvergence
+    : Flag<["-"], "fspv-enable-maximal-reconvergence">,
+      Group<dxc_Group>,
+      Visibility<[CC1Option, DXCOption]>,
+      HelpText<"Enables the MaximallyReconvergesKHR execution mode for this "
+               "module. This ensures that control flow reconverges at "
+               "well-defined merge points as defined by the Vulkan spec.">,
+      MarshallingInfoFlag<LangOpts<"HLSLSpvEnableMaximalReconvergence">>;
+
 def no_wasm_opt : Flag<["--"], "no-wasm-opt">,
   Group<m_Group>,
   HelpText<"Disable the wasm-opt optimizer">,
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 4473c54..b83bbcd 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -195,6 +195,11 @@ def NullDereferenceChecker
       HelpText<"Check for dereferences of null pointers">,
       Documentation<HasDocumentation>;
 
+def NullPointerArithmChecker
+    : Checker<"NullPointerArithm">,
+      HelpText<"Check for undefined arithmetic operations on null pointers">,
+      Documentation<HasDocumentation>;
+
 def NonNullParamChecker : Checker<"NonNullParamChecker">,
   HelpText<"Check for null pointers passed as arguments to a function whose "
            "arguments are references or marked with the 'nonnull' attribute">,
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 8904396..a72282c 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1358,9 +1358,6 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm,
 
 void diagnoseEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED,
                        const APSInt &Value) {
-  if (S.EvaluatingDecl && !S.EvaluatingDecl->isConstexpr())
-    return;
-
   llvm::APInt Min;
   llvm::APInt Max;
   ED->getValueRange(Max, Min);
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 57cc705..812d25f 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -3096,7 +3096,8 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) {
   }
 
   if (Offset.isZero()) {
-    if (Ptr.getFieldDesc()->isArray() && Ptr.getIndex() == 0) {
+    if (const Descriptor *Desc = Ptr.getFieldDesc();
+        Desc && Desc->isArray() && Ptr.getIndex() == 0) {
       S.Stk.push<Pointer>(Ptr.atIndex(0).narrow());
       return true;
     }
@@ -3126,7 +3127,8 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) {
   }
 
   if (Offset.isZero()) {
-    if (Ptr.getFieldDesc()->isArray() && Ptr.getIndex() == 0) {
+    if (const Descriptor *Desc = Ptr.getFieldDesc();
+        Desc && Desc->isArray() && Ptr.getIndex() == 0) {
       S.Stk.push<Pointer>(Ptr.atIndex(0).narrow());
       return true;
     }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index a6f10e6..84acc74 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -519,6 +519,14 @@ public:
     return createGlobal(module, loc, uniqueName, type, isConstant, linkage);
   }
 
+  cir::StackSaveOp createStackSave(mlir::Location loc, mlir::Type ty) {
+    return cir::StackSaveOp::create(*this, loc, ty);
+  }
+
+  cir::StackRestoreOp createStackRestore(mlir::Location loc, mlir::Value v) {
+    return cir::StackRestoreOp::create(*this, loc, v);
+  }
+
   mlir::Value createSetBitfield(mlir::Location loc, mlir::Type resultType,
                                 Address dstAddr, mlir::Type storageType,
                                 mlir::Value src, const CIRGenBitFieldInfo &info,
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 039d290..4a19d91 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -44,38 +44,70 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d,
 
   // If the type is variably-modified, emit all the VLA sizes for it.
   if (ty->isVariablyModifiedType())
-    cgm.errorNYI(d.getSourceRange(), "emitAutoVarDecl: variably modified type");
+    emitVariablyModifiedType(ty);
 
   assert(!cir::MissingFeatures::openMP());
 
   Address address = Address::invalid();
-  if (!ty->isConstantSizeType())
-    cgm.errorNYI(d.getSourceRange(), "emitAutoVarDecl: non-constant size type");
-
-  // A normal fixed sized variable becomes an alloca in the entry block,
-  // unless:
-  // - it's an NRVO variable.
-  // - we are compiling OpenMP and it's an OpenMP local variable.
-  if (nrvo) {
-    // The named return value optimization: allocate this variable in the
-    // return slot, so that we can elide the copy when returning this
-    // variable (C++0x [class.copy]p34).
-    address = returnValue;
-
-    if (const RecordDecl *rd = ty->getAsRecordDecl()) {
-      if (const auto *cxxrd = dyn_cast<CXXRecordDecl>(rd);
-          (cxxrd && !cxxrd->hasTrivialDestructor()) ||
-          rd->isNonTrivialToPrimitiveDestroy())
-        cgm.errorNYI(d.getSourceRange(), "emitAutoVarAlloca: set NRVO flag");
+  if (ty->isConstantSizeType()) {
+    // A normal fixed sized variable becomes an alloca in the entry block,
+    // unless:
+    // - it's an NRVO variable.
+    // - we are compiling OpenMP and it's an OpenMP local variable.
+    if (nrvo) {
+      // The named return value optimization: allocate this variable in the
+      // return slot, so that we can elide the copy when returning this
+      // variable (C++0x [class.copy]p34).
+      address = returnValue;
+
+      if (const RecordDecl *rd = ty->getAsRecordDecl()) {
+        if (const auto *cxxrd = dyn_cast<CXXRecordDecl>(rd);
+            (cxxrd && !cxxrd->hasTrivialDestructor()) ||
+            rd->isNonTrivialToPrimitiveDestroy())
+          cgm.errorNYI(d.getSourceRange(), "emitAutoVarAlloca: set NRVO flag");
+      }
+    } else {
+      // A normal fixed sized variable becomes an alloca in the entry block,
+      mlir::Type allocaTy = convertTypeForMem(ty);
+      // Create the temp alloca and declare variable using it.
+      address = createTempAlloca(allocaTy, alignment, loc, d.getName(),
+                                 /*arraySize=*/nullptr, /*alloca=*/nullptr, ip);
+      declare(address.getPointer(), &d, ty, getLoc(d.getSourceRange()),
+              alignment);
     }
   } else {
-    // A normal fixed sized variable becomes an alloca in the entry block,
-    mlir::Type allocaTy = convertTypeForMem(ty);
-    // Create the temp alloca and declare variable using it.
-    address = createTempAlloca(allocaTy, alignment, loc, d.getName(),
-                               /*arraySize=*/nullptr, /*alloca=*/nullptr, ip);
-    declare(address.getPointer(), &d, ty, getLoc(d.getSourceRange()),
-            alignment);
+    // Non-constant size type
+    assert(!cir::MissingFeatures::openMP());
+    if (!didCallStackSave) {
+      // Save the stack.
+      cir::PointerType defaultTy = AllocaInt8PtrTy;
+      CharUnits align = CharUnits::fromQuantity(
+          cgm.getDataLayout().getAlignment(defaultTy, false));
+      Address stack = createTempAlloca(defaultTy, align, loc, "saved_stack");
+
+      mlir::Value v = builder.createStackSave(loc, defaultTy);
+      assert(v.getType() == AllocaInt8PtrTy);
+      builder.createStore(loc, v, stack);
+
+      didCallStackSave = true;
+
+      // Push a cleanup block and restore the stack there.
+      // FIXME: in general circumstances, this should be an EH cleanup.
+      pushStackRestore(NormalCleanup, stack);
+    }
+
+    VlaSizePair vlaSize = getVLASize(ty);
+    mlir::Type memTy = convertTypeForMem(vlaSize.type);
+
+    // Allocate memory for the array.
+    address =
+        createTempAlloca(memTy, alignment, loc, d.getName(), vlaSize.numElts,
+                         /*alloca=*/nullptr, builder.saveInsertionPoint());
+
+    // If we have debug info enabled, properly describe the VLA dimensions for
+    // this type by registering the vla size expression for each of the
+    // dimensions.
+    assert(!cir::MissingFeatures::generateDebugInfo());
   }
 
   emission.addr = address;
@@ -696,6 +728,16 @@ struct DestroyObject final : EHScopeStack::Cleanup {
     cgf.emitDestroy(addr, type, destroyer);
   }
 };
+
+struct CallStackRestore final : EHScopeStack::Cleanup {
+  Address stack;
+  CallStackRestore(Address stack) : stack(stack) {}
+  void emit(CIRGenFunction &cgf) override {
+    mlir::Location loc = stack.getPointer().getLoc();
+    mlir::Value v = cgf.getBuilder().createLoad(loc, stack);
+    cgf.getBuilder().createStackRestore(loc, v);
+  }
+};
 } // namespace
 
 void CIRGenFunction::pushDestroy(CleanupKind cleanupKind, Address addr,
@@ -805,6 +847,10 @@ CIRGenFunction::getDestroyer(QualType::DestructionKind kind) {
   llvm_unreachable("Unknown DestructionKind");
 }
 
+void CIRGenFunction::pushStackRestore(CleanupKind kind, Address spMem) {
+  ehStack.pushCleanup<CallStackRestore>(kind, spMem);
+}
+
 /// Enter a destroy cleanup for the given local variable.
 void CIRGenFunction::emitAutoVarTypeCleanup(
     const CIRGenFunction::AutoVarEmission &emission,
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index f416571..4897c29 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -2068,7 +2068,7 @@ mlir::Value CIRGenFunction::emitAlloca(StringRef name, mlir::Type ty,
     mlir::OpBuilder::InsertionGuard guard(builder);
     builder.restoreInsertionPoint(ip);
     addr = builder.createAlloca(loc, /*addr type*/ localVarPtrTy,
-                                /*var type*/ ty, name, alignIntAttr);
+                                /*var type*/ ty, name, alignIntAttr, arraySize);
     assert(!cir::MissingFeatures::astVarDeclInterface());
   }
   return addr;
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 01a43a99..ba36cbe 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -410,6 +410,8 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType,
   curFn = fn;
 
   const Decl *d = gd.getDecl();
+
+  didCallStackSave = false;
   curCodeDecl = d;
   const auto *fd = dyn_cast_or_null<FunctionDecl>(d);
   curFuncDecl = d->getNonClosureContext();
@@ -1006,6 +1008,41 @@ mlir::Value CIRGenFunction::emitAlignmentAssumption(
                                  offsetValue);
 }
 
+CIRGenFunction::VlaSizePair CIRGenFunction::getVLASize(QualType type) {
+  const VariableArrayType *vla =
+      cgm.getASTContext().getAsVariableArrayType(type);
+  assert(vla && "type was not a variable array type!");
+  return getVLASize(vla);
+}
+
+CIRGenFunction::VlaSizePair
+CIRGenFunction::getVLASize(const VariableArrayType *type) {
+  // The number of elements so far; always size_t.
+  mlir::Value numElements;
+
+  QualType elementType;
+  do {
+    elementType = type->getElementType();
+    mlir::Value vlaSize = vlaSizeMap[type->getSizeExpr()];
+    assert(vlaSize && "no size for VLA!");
+    assert(vlaSize.getType() == SizeTy);
+
+    if (!numElements) {
+      numElements = vlaSize;
+    } else {
+      // It's undefined behavior if this wraps around, so mark it that way.
+      // FIXME: Teach -fsanitize=undefined to trap this.
+
+      numElements =
+          builder.createMul(numElements.getLoc(), numElements, vlaSize,
+                            cir::OverflowBehavior::NoUnsignedWrap);
+    }
+  } while ((type = getContext().getAsVariableArrayType(elementType)));
+
+  assert(numElements && "Undefined elements number");
+  return {numElements, elementType};
+}
+
 // TODO(cir): Most of this function can be shared between CIRGen
 // and traditional LLVM codegen
 void CIRGenFunction::emitVariablyModifiedType(QualType type) {
@@ -1086,7 +1123,26 @@ void CIRGenFunction::emitVariablyModifiedType(QualType type) {
       break;
 
     case Type::VariableArray: {
-      cgm.errorNYI("CIRGenFunction::emitVariablyModifiedType VLA");
+      // Losing element qualification here is fine.
+      const VariableArrayType *vat = cast<clang::VariableArrayType>(ty);
+
+      // Unknown size indication requires no size computation.
+      // Otherwise, evaluate and record it.
+      if (const Expr *sizeExpr = vat->getSizeExpr()) {
+        // It's possible that we might have emitted this already,
+        // e.g. with a typedef and a pointer to it.
+        mlir::Value &entry = vlaSizeMap[sizeExpr];
+        if (!entry) {
+          mlir::Value size = emitScalarExpr(sizeExpr);
+          assert(!cir::MissingFeatures::sanitizers());
+
+          // Always zexting here would be wrong if it weren't
+          // undefined behavior to have a negative bound.
+          // FIXME: What about when size's type is larger than size_t?
+          entry = builder.createIntCast(size, SizeTy);
+        }
+      }
+      type = vat->getElementType();
       break;
     }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index d71de2f..0d64c31 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -149,6 +149,10 @@ public:
   using SymTableTy = llvm::ScopedHashTable<const clang::Decl *, mlir::Value>;
   SymTableTy symbolTable;
 
+  /// Whether a cir.stacksave operation has been added. Used to avoid
+  /// inserting cir.stacksave for multiple VLAs in the same scope.
+  bool didCallStackSave = false;
+
   /// Whether or not a Microsoft-style asm block has been processed within
   /// this fuction. These can potentially set the return value.
   bool sawAsmBlock = false;
@@ -188,6 +192,14 @@ public:
   llvm::DenseMap<const OpaqueValueExpr *, LValue> opaqueLValues;
   llvm::DenseMap<const OpaqueValueExpr *, RValue> opaqueRValues;
 
+  // This keeps track of the associated size for each VLA type.
+  // We track this by the size expression rather than the type itself because
+  // in certain situations, like a const qualifier applied to an VLA typedef,
+  // multiple VLA types can share the same size expression.
+  // FIXME: Maybe this could be a stack of maps that is pushed/popped as we
+  // enter/leave scopes.
+  llvm::DenseMap<const Expr *, mlir::Value> vlaSizeMap;
+
 public:
   /// A non-RAII class containing all the information about a bound
   /// opaque value.  OpaqueValueMapping, below, is a RAII wrapper for
@@ -436,6 +448,20 @@ public:
     }
   };
 
+  struct VlaSizePair {
+    mlir::Value numElts;
+    QualType type;
+
+    VlaSizePair(mlir::Value num, QualType ty) : numElts(num), type(ty) {}
+  };
+
+  /// Returns an MLIR::Value+QualType pair that corresponds to the size,
+  /// in non-variably-sized elements, of a variable length array type,
+  /// plus that largest non-variably-sized element type.  Assumes that
+  /// the type has already been emitted with emitVariablyModifiedType.
+  VlaSizePair getVLASize(const VariableArrayType *type);
+  VlaSizePair getVLASize(QualType type);
+
   void finishFunction(SourceLocation endLoc);
 
   /// Determine whether the given initializer is trivial in the sense
@@ -583,6 +609,8 @@ public:
     return needsEHCleanup(kind) ? NormalAndEHCleanup : NormalCleanup;
   }
 
+  void pushStackRestore(CleanupKind kind, Address spMem);
+
   /// Set the address of a local variable.
   void setAddrOfLocalVar(const clang::VarDecl *vd, Address addr) {
     assert(!localDeclMap.count(vd) && "Decl already exists in LocalDeclMap!");
@@ -854,6 +882,7 @@ public:
 
   protected:
     bool performCleanup;
+    bool oldDidCallStackSave;
 
   private:
     RunCleanupsScope(const RunCleanupsScope &) = delete;
@@ -867,6 +896,8 @@ public:
     explicit RunCleanupsScope(CIRGenFunction &cgf)
         : performCleanup(true), cgf(cgf) {
       cleanupStackDepth = cgf.ehStack.stable_begin();
+      oldDidCallStackSave = cgf.didCallStackSave;
+      cgf.didCallStackSave = false;
       oldCleanupStackDepth = cgf.currentCleanupStackDepth;
       cgf.currentCleanupStackDepth = cleanupStackDepth;
     }
@@ -883,6 +914,7 @@ public:
       assert(performCleanup && "Already forced cleanup");
       {
         mlir::OpBuilder::InsertionGuard guard(cgf.getBuilder());
+        cgf.didCallStackSave = oldDidCallStackSave;
         cgf.popCleanupBlocks(cleanupStackDepth);
         performCleanup = false;
         cgf.currentCleanupStackDepth = oldCleanupStackDepth;
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 82b1051..57c7a44 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -88,6 +88,8 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
   FP80Ty = cir::FP80Type::get(&getMLIRContext());
   FP128Ty = cir::FP128Type::get(&getMLIRContext());
 
+  AllocaInt8PtrTy = cir::PointerType::get(UInt8Ty, cirAllocaAddressSpace);
+
   PointerAlignInBytes =
       astContext
           .toCharUnitsFromBits(
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
index ce14aa8..f638d39 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
@@ -398,6 +398,7 @@ void OpenACCRecipeBuilderBase::createRecipeDestroySection(
     emitDestroy(block->getArgument(1), elementTy);
   }
 
+  ls.forceCleanup();
   mlir::acc::YieldOp::create(builder, locEnd);
 }
 void OpenACCRecipeBuilderBase::makeBoundsInit(
@@ -480,6 +481,7 @@ void OpenACCRecipeBuilderBase::createInitRecipe(
                      /*isInitSection=*/true);
   }
 
+  ls.forceCleanup();
   mlir::acc::YieldOp::create(builder, locEnd);
 }
 
@@ -518,6 +520,7 @@ void OpenACCRecipeBuilderBase::createFirstprivateRecipeCopy(
   cgf.emitAutoVarInit(tempDeclEmission);
 
   builder.setInsertionPointToEnd(&copyRegion.back());
+  ls.forceCleanup();
   mlir::acc::YieldOp::create(builder, locEnd);
 }
 
@@ -662,6 +665,7 @@ void OpenACCRecipeBuilderBase::createReductionRecipeCombiner(
   }
 
   builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back());
+  ls.forceCleanup();
   mlir::acc::YieldOp::create(builder, locEnd, block->getArgument(0));
 }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
index 273ec7f..b5612d9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
@@ -65,6 +65,9 @@ struct CIRGenTypeCache {
   cir::PointerType VoidPtrTy;
   cir::PointerType UInt8PtrTy;
 
+  /// void* in alloca address space
+  cir::PointerType AllocaInt8PtrTy;
+
   /// The size and alignment of a pointer into the generic address space.
   union {
     unsigned char PointerAlignInBytes;
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index b6d3c95..d1b91d0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -420,6 +420,16 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     break;
   }
 
+  case Type::VariableArray: {
+    const VariableArrayType *a = cast<VariableArrayType>(ty);
+    if (a->getIndexTypeCVRQualifiers() != 0)
+      cgm.errorNYI(SourceLocation(), "non trivial array types", type);
+    // VLAs resolve to the innermost element type; this matches
+    // the return of alloca, and there isn't any obviously better choice.
+    resultType = convertTypeForMem(a->getElementType());
+    break;
+  }
+
   case Type::IncompleteArray: {
     const IncompleteArrayType *arrTy = cast<IncompleteArrayType>(ty);
     if (arrTy->getIndexTypeCVRQualifiers() != 0)
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 85c70de..12e2813ef 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -647,6 +647,68 @@ StringRef CGDebugInfo::getCurrentDirname() {
   return CGM.getCodeGenOpts().DebugCompilationDir;
 }
 
+static llvm::dwarf::SourceLanguage GetSourceLanguage(const CodeGenModule &CGM) {
+  const CodeGenOptions &CGO = CGM.getCodeGenOpts();
+  const LangOptions &LO = CGM.getLangOpts();
+
+  assert(CGO.DwarfVersion <= 5);
+
+  llvm::dwarf::SourceLanguage LangTag;
+  if (LO.CPlusPlus) {
+    if (LO.ObjC)
+      LangTag = llvm::dwarf::DW_LANG_ObjC_plus_plus;
+    else if (CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)
+      LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
+    else if (LO.CPlusPlus14)
+      LangTag = llvm::dwarf::DW_LANG_C_plus_plus_14;
+    else if (LO.CPlusPlus11)
+      LangTag = llvm::dwarf::DW_LANG_C_plus_plus_11;
+    else
+      LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
+  } else if (LO.ObjC) {
+    LangTag = llvm::dwarf::DW_LANG_ObjC;
+  } else if (LO.OpenCL && (!CGO.DebugStrictDwarf || CGO.DwarfVersion >= 5)) {
+    LangTag = llvm::dwarf::DW_LANG_OpenCL;
+  } else if (LO.C11 && !(CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)) {
+    LangTag = llvm::dwarf::DW_LANG_C11;
+  } else if (LO.C99) {
+    LangTag = llvm::dwarf::DW_LANG_C99;
+  } else {
+    LangTag = llvm::dwarf::DW_LANG_C89;
+  }
+
+  return LangTag;
+}
+
+static llvm::DISourceLanguageName
+GetDISourceLanguageName(const CodeGenModule &CGM) {
+  // Emit pre-DWARFv6 language codes.
+  if (CGM.getCodeGenOpts().DwarfVersion < 6)
+    return llvm::DISourceLanguageName(GetSourceLanguage(CGM));
+
+  const LangOptions &LO = CGM.getLangOpts();
+
+  uint32_t LangVersion = 0;
+  llvm::dwarf::SourceLanguageName LangTag;
+  if (LO.CPlusPlus) {
+    if (LO.ObjC) {
+      LangTag = llvm::dwarf::DW_LNAME_ObjC_plus_plus;
+    } else {
+      LangTag = llvm::dwarf::DW_LNAME_C_plus_plus;
+      LangVersion = LO.getCPlusPlusLangStd().value_or(0);
+    }
+  } else if (LO.ObjC) {
+    LangTag = llvm::dwarf::DW_LNAME_ObjC;
+  } else if (LO.OpenCL) {
+    LangTag = llvm::dwarf::DW_LNAME_OpenCL_C;
+  } else {
+    LangTag = llvm::dwarf::DW_LNAME_C;
+    LangVersion = LO.getCLangStd().value_or(0);
+  }
+
+  return llvm::DISourceLanguageName(LangTag, LangVersion);
+}
+
 void CGDebugInfo::CreateCompileUnit() {
   SmallString<64> Checksum;
   std::optional<llvm::DIFile::ChecksumKind> CSKind;
@@ -702,31 +764,6 @@ void CGDebugInfo::CreateCompileUnit() {
     }
   }
 
-  llvm::dwarf::SourceLanguage LangTag;
-  if (LO.CPlusPlus) {
-    if (LO.ObjC)
-      LangTag = llvm::dwarf::DW_LANG_ObjC_plus_plus;
-    else if (CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)
-      LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
-    else if (LO.CPlusPlus14)
-      LangTag = llvm::dwarf::DW_LANG_C_plus_plus_14;
-    else if (LO.CPlusPlus11)
-      LangTag = llvm::dwarf::DW_LANG_C_plus_plus_11;
-    else
-      LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
-  } else if (LO.ObjC) {
-    LangTag = llvm::dwarf::DW_LANG_ObjC;
-  } else if (LO.OpenCL && (!CGM.getCodeGenOpts().DebugStrictDwarf ||
-                           CGM.getCodeGenOpts().DwarfVersion >= 5)) {
-    LangTag = llvm::dwarf::DW_LANG_OpenCL;
-  } else if (LO.C11 && !(CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)) {
-      LangTag = llvm::dwarf::DW_LANG_C11;
-  } else if (LO.C99) {
-    LangTag = llvm::dwarf::DW_LANG_C99;
-  } else {
-    LangTag = llvm::dwarf::DW_LANG_C89;
-  }
-
   std::string Producer = getClangFullVersion();
 
   // Figure out which version of the ObjC runtime we have.
@@ -787,7 +824,7 @@ void CGDebugInfo::CreateCompileUnit() {
 
   // Create new compile unit.
   TheCU = DBuilder.createCompileUnit(
-      llvm::DISourceLanguageName(LangTag), CUFile,
+      GetDISourceLanguageName(CGM), CUFile,
       CGOpts.EmitVersionIdentMetadata ? Producer : "",
       CGOpts.OptimizationLevel != 0 || CGOpts.PrepareForLTO ||
           CGOpts.PrepareForThinLTO,
@@ -1234,20 +1271,46 @@ llvm::DIType *CGDebugInfo::CreateType(const PointerType *Ty,
                                Ty->getPointeeType(), Unit);
 }
 
-/// \return whether a C++ mangling exists for the type defined by TD.
-static bool hasCXXMangling(const TagDecl *TD, llvm::DICompileUnit *TheCU) {
-  switch (TheCU->getSourceLanguage().getUnversionedName()) {
+static bool hasCXXMangling(llvm::dwarf::SourceLanguage Lang, bool IsTagDecl) {
+  switch (Lang) {
   case llvm::dwarf::DW_LANG_C_plus_plus:
   case llvm::dwarf::DW_LANG_C_plus_plus_11:
   case llvm::dwarf::DW_LANG_C_plus_plus_14:
     return true;
   case llvm::dwarf::DW_LANG_ObjC_plus_plus:
-    return isa<CXXRecordDecl>(TD) || isa<EnumDecl>(TD);
+    return IsTagDecl;
+  default:
+    return false;
+  }
+}
+
+static bool hasCXXMangling(llvm::dwarf::SourceLanguageName Lang,
+                           bool IsTagDecl) {
+  switch (Lang) {
+  case llvm::dwarf::DW_LNAME_C_plus_plus:
+    return true;
+  case llvm::dwarf::DW_LNAME_ObjC_plus_plus:
+    return IsTagDecl;
   default:
     return false;
   }
 }
 
+/// \return whether a C++ mangling exists for the type defined by TD.
+static bool hasCXXMangling(const TagDecl *TD, llvm::DICompileUnit *TheCU) {
+  const bool IsTagDecl = isa<CXXRecordDecl>(TD) || isa<EnumDecl>(TD);
+
+  if (llvm::DISourceLanguageName SourceLang = TheCU->getSourceLanguage();
+      SourceLang.hasVersionedName())
+    return hasCXXMangling(
+        static_cast<llvm::dwarf::SourceLanguageName>(SourceLang.getName()),
+        IsTagDecl);
+  else
+    return hasCXXMangling(
+        static_cast<llvm::dwarf::SourceLanguage>(SourceLang.getName()),
+        IsTagDecl);
+}
+
 // Determines if the debug info for this tag declaration needs a type
 // identifier. The purpose of the unique identifier is to deduplicate type
 // information for identical types across TUs. Because of the C++ one definition
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 603cef9..ecab933 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -519,6 +519,10 @@ void clang::CodeGen::CGHLSLRuntime::setHLSLEntryAttributes(
   if (CGM.getCodeGenOpts().OptimizationLevel == 0)
     Fn->addFnAttr(llvm::Attribute::OptimizeNone);
   Fn->addFnAttr(llvm::Attribute::NoInline);
+
+  if (CGM.getLangOpts().HLSLSpvEnableMaximalReconvergence) {
+    Fn->addFnAttr("enable-maximal-reconvergence", "true");
+  }
 }
 
 static Value *buildVectorInput(IRBuilder<> &B, Function *F, llvm::Type *Ty) {
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index 1e58c3f..342a3af 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -82,6 +82,8 @@ TargetCodeGenInfo::~TargetCodeGenInfo() = default;
 // If someone can figure out a general rule for this, that would be great.
 // It's probably just doomed to be platform-dependent, though.
 unsigned TargetCodeGenInfo::getSizeOfUnwindException() const {
+  if (getABIInfo().getCodeGenOpts().hasSEHExceptions())
+    return getABIInfo().getDataLayout().getPointerSizeInBits() > 32 ? 64 : 48;
   // Verified for:
   //   x86-64     FreeBSD, Linux, Darwin
   //   x86-32     FreeBSD, Linux, Darwin
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index f4bdfa5..a7310ba 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3755,7 +3755,8 @@ static void RenderHLSLOptions(const ArgList &Args, ArgStringList &CmdArgs,
       options::OPT_hlsl_entrypoint,
       options::OPT_fdx_rootsignature_define,
       options::OPT_fdx_rootsignature_version,
-      options::OPT_fhlsl_spv_use_unknown_image_format};
+      options::OPT_fhlsl_spv_use_unknown_image_format,
+      options::OPT_fhlsl_spv_enable_maximal_reconvergence};
   if (!types::isHLSL(InputType))
     return;
   for (const auto &Arg : ForwardedArguments)
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 8ac09c4..04d46d6 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13816,13 +13816,20 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
       VDecl->setInvalidDecl();
   }
 
-  // C++ [module.import/6] external definitions are not permitted in header
-  // units.
+  // C++ [module.import/6]
+  //   ...
+  //   A header unit shall not contain a definition of a non-inline function or
+  //   variable whose name has external linkage.
+  //
+  // We choose to allow weak & selectany definitions, as they are common in
+  // headers, and have semantics similar to inline definitions which are allowed
+  // in header units.
   if (getLangOpts().CPlusPlusModules && currentModuleIsHeaderUnit() &&
       !VDecl->isInvalidDecl() && VDecl->isThisDeclarationADefinition() &&
       VDecl->getFormalLinkage() == Linkage::External && !VDecl->isInline() &&
       !VDecl->isTemplated() && !isa<VarTemplateSpecializationDecl>(VDecl) &&
-      !VDecl->getInstantiatedFromStaticDataMember()) {
+      !VDecl->getInstantiatedFromStaticDataMember() &&
+      !(VDecl->hasAttr<SelectAnyAttr>() || VDecl->hasAttr<WeakAttr>())) {
     Diag(VDecl->getLocation(), diag::err_extern_def_in_header_unit);
     VDecl->setInvalidDecl();
   }
@@ -16153,16 +16160,24 @@ Decl *Sema::ActOnStartOfFunctionDef(Scope *FnBodyScope, Decl *D,
     }
   }
 
-  // C++ [module.import/6] external definitions are not permitted in header
-  // units.  Deleted and Defaulted functions are implicitly inline (but the
+  // C++ [module.import/6]
+  //   ...
+  //   A header unit shall not contain a definition of a non-inline function or
+  //   variable whose name has external linkage.
+  //
+  // Deleted and Defaulted functions are implicitly inline (but the
   // inline state is not set at this point, so check the BodyKind explicitly).
+  // We choose to allow weak & selectany definitions, as they are common in
+  // headers, and have semantics similar to inline definitions which are allowed
+  // in header units.
   // FIXME: Consider an alternate location for the test where the inlined()
   // state is complete.
   if (getLangOpts().CPlusPlusModules && currentModuleIsHeaderUnit() &&
       !FD->isInvalidDecl() && !FD->isInlined() &&
       BodyKind != FnBodyKind::Delete && BodyKind != FnBodyKind::Default &&
       FD->getFormalLinkage() == Linkage::External && !FD->isTemplated() &&
-      !FD->isTemplateInstantiation()) {
+      !FD->isTemplateInstantiation() &&
+      !(FD->hasAttr<SelectAnyAttr>() || FD->hasAttr<WeakAttr>())) {
     assert(FD->isThisDeclarationADefinition());
     Diag(FD->getLocation(), diag::err_extern_def_in_header_unit);
     FD->setInvalidDecl();
diff --git a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
index 395d724..37f5ec3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
@@ -19,6 +19,7 @@
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
@@ -39,9 +40,10 @@ public:
 
 class DereferenceChecker
     : public CheckerFamily<check::Location, check::Bind,
+                           check::PreStmt<BinaryOperator>,
                            EventDispatcher<ImplicitNullDerefEvent>> {
-  void reportBug(const DerefBugType &BT, ProgramStateRef State, const Stmt *S,
-                 CheckerContext &C) const;
+  void reportDerefBug(const DerefBugType &BT, ProgramStateRef State,
+                      const Stmt *S, CheckerContext &C) const;
 
   bool suppressReport(CheckerContext &C, const Expr *E) const;
 
@@ -50,6 +52,7 @@ public:
                      CheckerContext &C) const;
   void checkBind(SVal L, SVal V, const Stmt *S, bool AtDeclInit,
                  CheckerContext &C) const;
+  void checkPreStmt(const BinaryOperator *Op, CheckerContext &C) const;
 
   static void AddDerefSource(raw_ostream &os,
                              SmallVectorImpl<SourceRange> &Ranges,
@@ -57,7 +60,7 @@ public:
                              const LocationContext *LCtx,
                              bool loadedFrom = false);
 
-  CheckerFrontend NullDerefChecker, FixedDerefChecker;
+  CheckerFrontend NullDerefChecker, FixedDerefChecker, NullPointerArithmChecker;
   const DerefBugType NullBug{&NullDerefChecker, "Dereference of null pointer",
                              "a null pointer dereference",
                              "a dereference of a null pointer"};
@@ -72,9 +75,22 @@ public:
   const DerefBugType FixedAddressBug{&FixedDerefChecker,
                                      "Dereference of a fixed address",
                                      "a dereference of a fixed address"};
+  const BugType NullPointerArithmBug{
+      &NullPointerArithmChecker,
+      "Possibly undefined arithmetic operation involving a null pointer"};
 
   StringRef getDebugTag() const override { return "DereferenceChecker"; }
 };
+
+struct ValueDescStr {
+  SmallVectorImpl<SourceRange> &Ranges;
+  const Expr *Ex;
+  const ProgramState *State;
+  const LocationContext *LCtx;
+  bool IsPointer;
+  ConditionTruthVal IsNull;
+};
+
 } // end anonymous namespace
 
 void
@@ -173,9 +189,9 @@ static bool isDeclRefExprToReference(const Expr *E) {
   return false;
 }
 
-void DereferenceChecker::reportBug(const DerefBugType &BT,
-                                   ProgramStateRef State, const Stmt *S,
-                                   CheckerContext &C) const {
+void DereferenceChecker::reportDerefBug(const DerefBugType &BT,
+                                        ProgramStateRef State, const Stmt *S,
+                                        CheckerContext &C) const {
   if (&BT == &FixedAddressBug) {
     if (!FixedDerefChecker.isEnabled())
       // Deliberately don't add a sink node if check is disabled.
@@ -249,9 +265,8 @@ void DereferenceChecker::reportBug(const DerefBugType &BT,
 
   bugreporter::trackExpressionValue(N, bugreporter::getDerefExpr(S), *BR);
 
-  for (SmallVectorImpl<SourceRange>::iterator
-       I = Ranges.begin(), E = Ranges.end(); I!=E; ++I)
-    BR->addRange(*I);
+  for (const auto &R : Ranges)
+    BR->addRange(R);
 
   C.emitReport(std::move(BR));
 }
@@ -262,7 +277,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S,
   if (l.isUndef()) {
     const Expr *DerefExpr = getDereferenceExpr(S);
     if (!suppressReport(C, DerefExpr))
-      reportBug(UndefBug, C.getState(), DerefExpr, C);
+      reportDerefBug(UndefBug, C.getState(), DerefExpr, C);
     return;
   }
 
@@ -283,7 +298,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S,
       // we call an "explicit" null dereference.
       const Expr *expr = getDereferenceExpr(S);
       if (!suppressReport(C, expr)) {
-        reportBug(NullBug, nullState, expr, C);
+        reportDerefBug(NullBug, nullState, expr, C);
         return;
       }
     }
@@ -301,7 +316,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S,
   if (location.isConstant()) {
     const Expr *DerefExpr = getDereferenceExpr(S, isLoad);
     if (!suppressReport(C, DerefExpr))
-      reportBug(FixedAddressBug, notNullState, DerefExpr, C);
+      reportDerefBug(FixedAddressBug, notNullState, DerefExpr, C);
     return;
   }
 
@@ -317,7 +332,7 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
 
   // One should never write to label addresses.
   if (auto Label = L.getAs<loc::GotoLabel>()) {
-    reportBug(LabelBug, C.getState(), S, C);
+    reportDerefBug(LabelBug, C.getState(), S, C);
     return;
   }
 
@@ -338,7 +353,7 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
     if (!StNonNull) {
       const Expr *expr = getDereferenceExpr(S, /*IsBind=*/true);
       if (!suppressReport(C, expr)) {
-        reportBug(NullBug, StNull, expr, C);
+        reportDerefBug(NullBug, StNull, expr, C);
         return;
       }
     }
@@ -356,7 +371,7 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
   if (V.isConstant()) {
     const Expr *DerefExpr = getDereferenceExpr(S, true);
     if (!suppressReport(C, DerefExpr))
-      reportBug(FixedAddressBug, State, DerefExpr, C);
+      reportDerefBug(FixedAddressBug, State, DerefExpr, C);
     return;
   }
 
@@ -379,6 +394,96 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
   C.addTransition(State, this);
 }
 
+namespace llvm {
+template <> struct format_provider<ValueDescStr> {
+  static void format(const ValueDescStr &V, raw_ostream &Stream,
+                     StringRef Style) {
+    static const char *ValueStr[2][3] = {
+        {"zero", "nonzero integer value", "probably nonzero integer value"},
+        {"null pointer", "non-null pointer", "probably non-null pointer"},
+    };
+    Stream
+        << ValueStr[V.IsPointer][V.IsNull.isConstrainedTrue()
+                                     ? 0
+                                     : (V.IsNull.isConstrainedFalse() ? 1 : 2)];
+    DereferenceChecker::AddDerefSource(Stream, V.Ranges, V.Ex, V.State, V.LCtx,
+                                       false);
+  }
+};
+} // namespace llvm
+
+void DereferenceChecker::checkPreStmt(const BinaryOperator *Op,
+                                      CheckerContext &C) const {
+  if (!Op->isAdditiveOp() || !NullPointerArithmChecker.isEnabled())
+    return;
+  const Expr *E1 = Op->getLHS();
+  const Expr *E2 = Op->getRHS();
+  QualType T1 = E1->getType().getCanonicalType();
+  QualType T2 = E2->getType().getCanonicalType();
+  bool T1IsPointer = T1->isPointerType();
+  bool T2IsPointer = T2->isPointerType();
+  if (T1->isIntegerType() && T2->isIntegerType())
+    return;
+  if (!T1IsPointer && !T1->isIntegerType() && !T2IsPointer &&
+      !T2->isIntegerType())
+    return;
+
+  ProgramStateRef State = C.getState();
+  ConditionTruthVal V1IsNull = State->isNull(C.getSVal(E1));
+  ConditionTruthVal V2IsNull = State->isNull(C.getSVal(E2));
+  bool IsConstrained = true;
+
+  // Check cases 'NULL + x' and 'NULL - x'
+  if (T1IsPointer && !T2IsPointer) {
+    if (!V1IsNull.isConstrainedTrue() || V2IsNull.isConstrainedTrue())
+      return;
+    IsConstrained = V2IsNull.isConstrainedFalse();
+  }
+
+  // Check case 'x + NULL'
+  if (!T1IsPointer && T2IsPointer) {
+    if (V1IsNull.isConstrainedTrue() || !V2IsNull.isConstrainedTrue())
+      return;
+    IsConstrained = V1IsNull.isConstrainedFalse();
+  }
+
+  // Check case 'NULL - p' or 'p - NULL'
+  if (T1IsPointer && T2IsPointer) {
+    if (!V1IsNull.isConstrainedTrue() && !V2IsNull.isConstrainedTrue())
+      return;
+    if (V1IsNull.isConstrainedTrue() && V2IsNull.isConstrainedTrue())
+      return;
+    IsConstrained =
+        V1IsNull.isConstrainedFalse() || V2IsNull.isConstrainedFalse();
+  }
+
+  SmallVector<SourceRange, 2> Ranges;
+  const char *OpcodeStr =
+      Op->getOpcode() == BO_Add ? "Addition" : "Subtraction";
+  const char *ResultStr = IsConstrained ? "results" : "may result";
+  ValueDescStr DerefArg1{
+      Ranges, E1, State.get(), C.getLocationContext(), T1IsPointer, V1IsNull};
+  ValueDescStr DerefArg2{
+      Ranges, E2, State.get(), C.getLocationContext(), T2IsPointer, V2IsNull};
+  std::string Msg =
+      llvm::formatv("{0} of a {1} and a {2} {3} in undefined behavior",
+                    OpcodeStr, DerefArg1, DerefArg2, ResultStr);
+
+  ExplodedNode *N = C.generateErrorNode(State);
+  if (!N)
+    return;
+  auto BR =
+      std::make_unique<PathSensitiveBugReport>(NullPointerArithmBug, Msg, N);
+  if (V1IsNull.isConstrainedTrue())
+    bugreporter::trackExpressionValue(N, E1, *BR);
+  if (V2IsNull.isConstrainedTrue())
+    bugreporter::trackExpressionValue(N, E2, *BR);
+  for (const auto &R : Ranges)
+    BR->addRange(R);
+
+  C.emitReport(std::move(BR));
+}
+
 void ento::registerNullDereferenceChecker(CheckerManager &Mgr) {
   Mgr.getChecker<DereferenceChecker>()->NullDerefChecker.enable(Mgr);
 }
@@ -395,3 +500,11 @@ bool ento::shouldRegisterFixedAddressDereferenceChecker(
     const CheckerManager &) {
   return true;
 }
+
+void ento::registerNullPointerArithmChecker(CheckerManager &Mgr) {
+  Mgr.getChecker<DereferenceChecker>()->NullPointerArithmChecker.enable(Mgr);
+}
+
+bool ento::shouldRegisterNullPointerArithmChecker(const CheckerManager &) {
+  return true;
+}
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
index e45673d..419d263 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
@@ -164,7 +164,9 @@ bool tryToFindPtrOrigin(
 
         auto Name = safeGetName(callee);
         if (Name == "__builtin___CFStringMakeConstantString" ||
-            Name == "NSClassFromString")
+            Name == "NSStringFromSelector" || Name == "NSSelectorFromString" ||
+            Name == "NSStringFromClass" || Name == "NSClassFromString" ||
+            Name == "NSStringFromProtocol" || Name == "NSProtocolFromString")
           return callback(E, true);
       } else if (auto *CalleeE = call->getCallee()) {
         if (auto *E = dyn_cast<DeclRefExpr>(CalleeE->IgnoreParenCasts())) {
@@ -202,6 +204,8 @@ bool tryToFindPtrOrigin(
           !Selector.getNumArgs())
         return callback(E, true);
     }
+    if (auto *ObjCProtocol = dyn_cast<ObjCProtocolExpr>(E))
+      return callback(ObjCProtocol, true);
     if (auto *ObjCDict = dyn_cast<ObjCDictionaryLiteral>(E))
       return callback(ObjCDict, true);
     if (auto *ObjCArray = dyn_cast<ObjCArrayLiteral>(E))
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp
index d8539ea..1d4e6dd 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp
@@ -263,18 +263,43 @@ public:
   void visitCallArg(const Expr *Arg, const ParmVarDecl *Param,
                     const Decl *DeclWithIssue) const {
     auto *ArgExpr = Arg->IgnoreParenCasts();
-    if (auto *InnerCE = dyn_cast<CallExpr>(Arg)) {
-      auto *InnerCallee = InnerCE->getDirectCallee();
-      if (InnerCallee && InnerCallee->isInStdNamespace() &&
-          safeGetName(InnerCallee) == "move" && InnerCE->getNumArgs() == 1) {
-        ArgExpr = InnerCE->getArg(0);
-        if (ArgExpr)
-          ArgExpr = ArgExpr->IgnoreParenCasts();
+    while (ArgExpr) {
+      ArgExpr = ArgExpr->IgnoreParenCasts();
+      if (auto *InnerCE = dyn_cast<CallExpr>(ArgExpr)) {
+        auto *InnerCallee = InnerCE->getDirectCallee();
+        if (InnerCallee && InnerCallee->isInStdNamespace() &&
+            safeGetName(InnerCallee) == "move" && InnerCE->getNumArgs() == 1) {
+          ArgExpr = InnerCE->getArg(0);
+          continue;
+        }
+      }
+      if (auto *UO = dyn_cast<UnaryOperator>(ArgExpr)) {
+        auto OpCode = UO->getOpcode();
+        if (OpCode == UO_Deref || OpCode == UO_AddrOf) {
+          ArgExpr = UO->getSubExpr();
+          continue;
+        }
       }
+      break;
     }
+
+    if (auto *MemberCallExpr = dyn_cast<CXXMemberCallExpr>(ArgExpr)) {
+      if (isOwnerPtrType(MemberCallExpr->getObjectType()))
+        return;
+    }
+
+    if (auto *OpCE = dyn_cast<CXXOperatorCallExpr>(ArgExpr)) {
+      auto *Method = dyn_cast_or_null<CXXMethodDecl>(OpCE->getDirectCallee());
+      if (Method && isOwnerPtr(safeGetName(Method->getParent()))) {
+        if (OpCE->getOperator() == OO_Star && OpCE->getNumArgs() == 1)
+          return;
+      }
+    }
+
     if (isNullPtr(ArgExpr) || isa<IntegerLiteral>(ArgExpr) ||
         isa<CXXDefaultArgExpr>(ArgExpr))
       return;
+
     if (auto *DRE = dyn_cast<DeclRefExpr>(ArgExpr)) {
       if (auto *ValDecl = DRE->getDecl()) {
         if (isa<ParmVarDecl>(ValDecl))
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index b41e450..d3d1f13 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -138,6 +138,11 @@ bool isCheckedPtr(const std::string &Name) {
   return Name == "CheckedPtr" || Name == "CheckedRef";
 }
 
+bool isOwnerPtr(const std::string &Name) {
+  return isRefType(Name) || isCheckedPtr(Name) || Name == "unique_ptr" ||
+         Name == "UniqueRef" || Name == "LazyUniqueRef";
+}
+
 bool isSmartPtrClass(const std::string &Name) {
   return isRefType(Name) || isCheckedPtr(Name) || isRetainPtrOrOSPtr(Name) ||
          Name == "WeakPtr" || Name == "WeakPtrFactory" ||
@@ -206,10 +211,7 @@ bool isRetainPtrOrOSPtrType(const clang::QualType T) {
 }
 
 bool isOwnerPtrType(const clang::QualType T) {
-  return isPtrOfType(T, [](auto Name) {
-    return isRefType(Name) || isCheckedPtr(Name) || Name == "unique_ptr" ||
-           Name == "UniqueRef" || Name == "LazyUniqueRef";
-  });
+  return isPtrOfType(T, [](auto Name) { return isOwnerPtr(Name); });
 }
 
 std::optional<bool> isUncounted(const QualType T) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index 8300a6c..12e2e2d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -143,6 +143,10 @@ bool isCheckedPtr(const std::string &Name);
 /// \returns true if \p Name is RetainPtr or its variant, false if not.
 bool isRetainPtrOrOSPtr(const std::string &Name);
 
+/// \returns true if \p Name is an owning smar pointer such as Ref, CheckedPtr,
+/// and unique_ptr.
+bool isOwnerPtr(const std::string &Name);
+
 /// \returns true if \p Name is a smart pointer type name, false if not.
 bool isSmartPtrClass(const std::string &Name);
 
diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp
index 72bc762..8efd320 100644
--- a/clang/test/AST/ByteCode/cxx11.cpp
+++ b/clang/test/AST/ByteCode/cxx11.cpp
@@ -146,6 +146,14 @@ void testValueInRangeOfEnumerationValues() {
 
   const NumberType neg_one = (NumberType) ((NumberType) 0 - (NumberType) 1); // ok, not a constant expression context
 }
+struct EnumTest {
+  enum type {
+      Type1,
+      BOUND
+  };
+  static const type binding_completed = type(BOUND + 1); // both-error {{in-class initializer for static data member is not a constant expression}} \
+                                                         // both-note {{integer value 2 is outside the valid range of values}}
+};
 
 template<class T, unsigned size> struct Bitfield {
   static constexpr T max = static_cast<T>((1 << size) - 1);
diff --git a/clang/test/AST/ByteCode/typeid.cpp b/clang/test/AST/ByteCode/typeid.cpp
index 00b01c8..090309d1 100644
--- a/clang/test/AST/ByteCode/typeid.cpp
+++ b/clang/test/AST/ByteCode/typeid.cpp
@@ -59,3 +59,13 @@ namespace TypeidPtrInEvaluationResult {
   consteval const std::type_info *ftype_info() { return &typeid(c); }
   const std::type_info *T1 = ftype_info();
 }
+
+// Regression test for crash in ArrayElemPtrPop with typeid pointers. GH-163127
+namespace TypeidPtrRegression {
+  void dontcrash() {
+    // this should just be an error and not an ICE
+    constexpr auto res = ((void**)&typeid(int))[0]; // both-error {{must be initialized by a constant expression}} \
+                                                                // both-note {{cast that performs the conversions of a reinterpret_cast is not allowed in a constant expression}}
+  }
+}
+
diff --git a/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm b/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm
index 104b555..8aad838 100644
--- a/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm
+++ b/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm
@@ -11,6 +11,8 @@ class Obj;
 
 Obj* provide_obj_ptr();
 void receive_obj_ptr(Obj* p = nullptr);
+void receive_obj_ref(Obj&);
+void receive_obj_rref(Obj&&);
 sqlite3* open_db();
 void close_db(sqlite3*);
 
@@ -38,6 +40,16 @@ Obj& ref() {
   return obj;
 }
 
+void opaque_call_arg(Obj* obj, Obj&& otherObj, const RefPtr<Obj>& safeObj, WeakPtr<Obj> weakObj, std::unique_ptr<Obj>& uniqObj) {
+  receive_obj_ref(*obj);
+  receive_obj_ptr(&*obj);
+  receive_obj_rref(std::move(otherObj));
+  receive_obj_ref(*safeObj.get());
+  receive_obj_ptr(weakObj.get());
+  // expected-warning@-1{{Call argument for parameter 'p' uses a forward declared type 'Obj *'}}
+  receive_obj_ref(*uniqObj);
+}
+
 Obj&& provide_obj_rval();
 void receive_obj_rval(Obj&& p);
 
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index a49faa1..7055a94 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -25,23 +25,23 @@ namespace std {
 template <typename T>
 class unique_ptr {
 private:
-  T *t;
+  void *t;
 
 public:
   unique_ptr() : t(nullptr) { }
   unique_ptr(T *t) : t(t) { }
   ~unique_ptr() {
     if (t)
-      delete t;
+      delete static_cast<T*>(t);
   }
   template <typename U> unique_ptr(unique_ptr<U>&& u)
     : t(u.t)
   {
     u.t = nullptr;
   }
-  T *get() const { return t; }
-  T *operator->() const { return t; }
-  T &operator*() const { return *t; }
+  T *get() const { return static_cast<T*>(t); }
+  T *operator->() const { return get(); }
+  T &operator*() const { return *get(); }
   unique_ptr &operator=(T *) { return *this; }
   explicit operator bool() const { return !!t; }
 };
@@ -313,4 +313,90 @@ public:
   UniqueRef &operator=(T &) { return *this; }
 };
 
+class WeakPtrImpl {
+private:
+  void* ptr { nullptr };
+  mutable unsigned m_refCount { 0 };
+
+  template <typename U> friend class CanMakeWeakPtr;
+  template <typename U> friend class WeakPtr;
+
+public:
+  template <typename T>
+  static Ref<WeakPtrImpl> create(T& t)
+  {
+    return adoptRef(*new WeakPtrImpl(t));
+  }
+
+  void ref() const { m_refCount++; }
+  void deref() const {
+    m_refCount--;
+    if (!m_refCount)
+      delete const_cast<WeakPtrImpl*>(this);
+  }
+
+  template <typename T>
+  T* get() { return static_cast<T*>(ptr); }
+  operator bool() const { return !!ptr; }
+  void clear() { ptr = nullptr; }
+
+private:
+  template <typename T>
+  WeakPtrImpl(T* t)
+    : ptr(static_cast<void*>(t))
+  { }
+};
+
+template <typename T>
+class CanMakeWeakPtr {
+private:
+  RefPtr<WeakPtrImpl> impl;
+
+  template <typename U> friend class CanMakeWeakPtr;
+  template <typename U> friend class WeakPtr;
+
+  Ref<WeakPtrImpl> createWeakPtrImpl() {
+    if (!impl)
+      impl = WeakPtrImpl::create(static_cast<T>(*this));
+    return *impl;
+  }
+
+public:
+  ~CanMakeWeakPtr() {
+    if (!impl)
+      return;
+    impl->clear();
+    impl = nullptr;
+  }
+};
+
+template <typename T>
+class WeakPtr {
+private:
+  RefPtr<WeakPtrImpl> impl;
+
+public:
+  WeakPtr(T& t) {
+    *this = t;
+  }
+  WeakPtr(T* t) {
+    *this = t;
+  }
+
+  template <typename U>
+  WeakPtr<T> operator=(U& obj) {
+    impl = obj.createWeakPtrImpl();
+  }
+
+  template <typename U>
+  WeakPtr<T> operator=(U* obj) {
+    impl = obj ? obj->createWeakPtrImpl() : nullptr;
+  }
+
+  T* get() {
+    return impl ? impl->get<T>() : nullptr;
+  }
+
+};
+
 #endif
diff --git a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
index a5fc3d7..edf4011 100644
--- a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
@@ -98,12 +98,20 @@ typedef CVImageBufferRef CVPixelBufferRef;
 typedef signed int CVReturn;
 CVReturn CVPixelBufferCreateWithIOSurface(CFAllocatorRef allocator, IOSurfaceRef surface, CFDictionaryRef pixelBufferAttributes, CF_RETURNS_RETAINED CVPixelBufferRef * pixelBufferOut);
 
+extern "C" NSString *NSStringFromSelector(SEL aSelector);
+extern "C" SEL NSSelectorFromString(NSString *aSelectorName);
+
+extern "C" NSString *NSStringFromClass(Class aClass);
+extern "C" Class NSClassFromString(NSString *aClassName);
+
+extern "C" NSString *NSStringFromProtocol(Protocol *proto);
+extern "C" Protocol * NSProtocolFromString(NSString *namestr);
+
 CFRunLoopRef CFRunLoopGetCurrent(void);
 CFRunLoopRef CFRunLoopGetMain(void);
 extern CFTypeRef CFRetain(CFTypeRef cf);
 extern void CFRelease(CFTypeRef cf);
 #define CFSTR(cStr) ((CFStringRef) __builtin___CFStringMakeConstantString ("" cStr ""))
-extern Class NSClassFromString(NSString *aClassName);
 
 #if __has_feature(objc_arc)
 id CFBridgingRelease(CFTypeRef X) {
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
index 5dc3b38..4f231ee 100644
--- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
@@ -578,6 +578,24 @@ void foo() {
 
 } // autoreleased
 
+namespace sel_string {
+
+void consumeStr(NSString *);
+void consumeSel(SEL);
+void consumeClass(Class);
+void consumeProtocol(Protocol *);
+
+void foo() {
+  consumeStr(NSStringFromSelector(@selector(mutableCopy)));
+  consumeSel(NSSelectorFromString(@"mutableCopy"));
+  consumeStr(NSStringFromClass(NSNumber.class));
+  consumeClass(NSClassFromString(@"NSNumber"));
+  consumeStr(NSStringFromProtocol(@protocol(NSCopying)));
+  consumeProtocol(NSProtocolFromString(@"NSCopying"));
+}
+
+} // namespace sel_string
+
 @interface TestObject : NSObject
 - (void)doWork:(NSString *)msg, ...;
 - (void)doWorkOnSelf;
diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c
index 0092331..bfe418b 100644
--- a/clang/test/Analysis/analyzer-enabled-checkers.c
+++ b/clang/test/Analysis/analyzer-enabled-checkers.c
@@ -19,6 +19,7 @@
 // CHECK-NEXT: core.NonNullParamChecker
 // CHECK-NEXT: core.NonnilStringConstants
 // CHECK-NEXT: core.NullDereference
+// CHECK-NEXT: core.NullPointerArithm
 // CHECK-NEXT: core.StackAddressEscape
 // CHECK-NEXT: core.UndefinedBinaryOperatorResult
 // CHECK-NEXT: core.VLASize
diff --git a/clang/test/Analysis/null-pointer-arithm.c b/clang/test/Analysis/null-pointer-arithm.c
new file mode 100644
index 0000000..2288247
--- /dev/null
+++ b/clang/test/Analysis/null-pointer-arithm.c
@@ -0,0 +1,76 @@
+// RUN: %clang_analyze_cc1 -verify %s \
+// RUN:   -analyzer-checker=core
+
+extern int *get_pointer();
+
+int *test_add1(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  return p + offset; // expected-warning{{Addition of a null pointer (from variable 'p') and a probably nonzero integer value (from variable 'offset') may result in undefined behavior}}
+}
+
+int *test_add2(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  if (offset) {}
+  return p + offset; // expected-warning{{Addition of a null pointer (from variable 'p') and a nonzero integer value (from variable 'offset') results in undefined behavior}}
+}
+
+int *test_add3(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  if (offset != 0) return 0;
+  return p + offset;
+}
+
+int *test_add4(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  if (offset == 0) return 0;
+  return p + offset; // expected-warning{{Addition of a null pointer (from variable 'p') and a nonzero integer value (from variable 'offset') results in undefined behavior}}
+}
+
+int *test_add5(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  return offset + p; // expected-warning{{Addition of a probably nonzero integer value (from variable 'offset') and a null pointer (from variable 'p') may result in undefined behavior}}
+}
+
+int *test_sub1(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  return p - offset; // expected-warning{{Subtraction of a null pointer (from variable 'p') and a probably nonzero integer value (from variable 'offset') may result in undefined behavior}}
+}
+
+int test_sub_p1() {
+  int *p = get_pointer();
+  if (p) {}
+  return p - p;
+}
+
+int test_sub_p2() {
+  int *p1 = get_pointer();
+  int *p2 = get_pointer();
+  if (p1) {}
+  if (p2) {}
+  return p1 - p2;
+  // expected-warning@-1{{Subtraction of a non-null pointer (from variable 'p1') and a null pointer (from variable 'p2') results in undefined behavior}}
+  // expected-warning@-2{{Subtraction of a null pointer (from variable 'p1') and a non-null pointer (from variable 'p2') results in undefined behavior}}
+}
+
+int test_sub_p3() {
+  int *p1 = get_pointer();
+  int *p2 = get_pointer();
+  if (p1) {}
+  return p1 - p2; // expected-warning{{Subtraction of a null pointer (from variable 'p1') and a probably non-null pointer (from variable 'p2') may result in undefined behavior}}
+}
+
+struct S {
+  char *p;
+  int offset;
+};
+
+char *test_struct(struct S s) {
+  if (s.p) {}
+  return s.p + s.offset; // expected-warning{{Addition of a null pointer (via field 'p') and a probably nonzero integer value (via field 'offset') may result in undefined behavior}}
+}
diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
index 7fae958..9b32960 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
@@ -27,6 +27,7 @@
 // CHECK-NEXT: core.NonNullParamChecker
 // CHECK-NEXT: core.NonnilStringConstants
 // CHECK-NEXT: core.NullDereference
+// CHECK-NEXT: core.NullPointerArithm
 // CHECK-NEXT: core.StackAddressEscape
 // CHECK-NEXT: core.UndefinedBinaryOperatorResult
 // CHECK-NEXT: core.VLASize
diff --git a/clang/test/C/C2y/n3364.c b/clang/test/C/C2y/n3364.c
index f95c77f..ccf7e8d 100644
--- a/clang/test/C/C2y/n3364.c
+++ b/clang/test/C/C2y/n3364.c
@@ -37,6 +37,6 @@ double d3 = -DBL_SNAN;
 long double ld1 = LDBL_SNAN;
 long double ld2 = +LDBL_SNAN;
 long double ld3 = -LDBL_SNAN;
-// CHECK: @ld1 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000}}
-// CHECK: @ld2 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000}}
-// CHECK: @ld3 = {{.*}}global {{double 0xFFF4000000000000|x86_fp80 0xKFFFFA000000000000000|fp128 0xL0000000000000000FFFF400000000000}}
+// CHECK: @ld1 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000|ppc_fp128 0xM7FF40000000000000000000000000000}}
+// CHECK: @ld2 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000|ppc_fp128 0xM7FF40000000000000000000000000000}}
+// CHECK: @ld3 = {{.*}}global {{double 0xFFF4000000000000|x86_fp80 0xKFFFFA000000000000000|fp128 0xL0000000000000000FFFF400000000000|ppc_fp128 0xMFFF40000000000008000000000000000}}
diff --git a/clang/test/CIR/CodeGen/vla.c b/clang/test/CIR/CodeGen/vla.c
new file mode 100644
index 0000000..e2adf45
--- /dev/null
+++ b/clang/test/CIR/CodeGen/vla.c
@@ -0,0 +1,285 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+void f0(int len) {
+  int arr[len];
+}
+
+// CIR: cir.func{{.*}} @f0(%[[LEN_ARG:.*]]: !s32i {{.*}})
+// CIR:   %[[LEN_ADDR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["len", init]
+// CIR:   %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr<!u8i>, !cir.ptr<!cir.ptr<!u8i>>, ["saved_stack"]
+// CIR:   cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]]
+// CIR:   %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]]
+// CIR:   %[[LEN_SIZE_T:.*]] = cir.cast integral %[[LEN]] : !s32i -> !u64i
+// CIR:   %[[STACK_PTR:.*]] = cir.stacksave
+// CIR:   cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]]
+// CIR:   %[[ARR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[LEN_SIZE_T]] : !u64i, ["arr"]
+// CIR:   %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]]
+// CIR:   cir.stackrestore %[[STACK_RESTORE_PTR]]
+
+// LLVM: define{{.*}} void @f0(i32 %[[LEN_ARG:.*]]) {
+// LLVM:   %[[LEN_ADDR:.*]] = alloca i32
+// LLVM:   %[[SAVED_STACK:.*]] = alloca ptr
+// LLVM:   store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]]
+// LLVM:   %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]]
+// LLVM:   %[[LEN_SIZE_T:.*]] = sext i32 %[[LEN]] to i64
+// LLVM:   %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0()
+// LLVM:   store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]]
+// LLVM:   %[[ARR:.*]] = alloca i32, i64 %[[LEN_SIZE_T]]
+// LLVM:   %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]]
+// LLVM:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]])
+
+// Note: VLA_EXPR0 below is emitted to capture debug info.
+
+// OGCG: define{{.*}} void @f0(i32 {{.*}} %[[LEN_ARG:.*]])
+// OGCG:   %[[LEN_ADDR:.*]] = alloca i32
+// OGCG:   %[[SAVED_STACK:.*]] = alloca ptr
+// OGCG:   %[[VLA_EXPR0:.*]] = alloca i64
+// OGCG:   store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]]
+// OGCG:   %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]]
+// OGCG:   %[[LEN_SIZE_T:.*]] = zext i32 %[[LEN]] to i64
+// OGCG:   %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0()
+// OGCG:   store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]]
+// OGCG:   %[[ARR:.*]] = alloca i32, i64 %[[LEN_SIZE_T]]
+// OGCG:   store i64 %[[LEN_SIZE_T]], ptr %[[VLA_EXPR0]]
+// OGCG:   %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]]
+// OGCG:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]])
+
+void f1(int len) {
+  int arr[16][len];
+}
+
+// CIR: cir.func{{.*}} @f1(%[[LEN_ARG:.*]]: !s32i {{.*}})
+// CIR:   %[[LEN_ADDR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["len", init]
+// CIR:   %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr<!u8i>, !cir.ptr<!cir.ptr<!u8i>>, ["saved_stack"]
+// CIR:   cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]]
+// CIR:   %[[SIXTEEN:.*]] = cir.const #cir.int<16> : !s32i
+// CIR:   %[[SIXTEEN_SIZE_T:.*]] = cir.cast integral %[[SIXTEEN]] : !s32i -> !u64i
+// CIR:   %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]]
+// CIR:   %[[LEN_SIZE_T:.*]] = cir.cast integral %[[LEN]] : !s32i -> !u64i
+// CIR:   %[[STACK_PTR:.*]] = cir.stacksave
+// CIR:   cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]]
+// CIR:   %[[TOTAL_LEN:.*]] = cir.binop(mul, %[[SIXTEEN_SIZE_T]], %[[LEN_SIZE_T]]) nuw
+// CIR:   %[[ARR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[TOTAL_LEN]] : !u64i, ["arr"]
+// CIR:   %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]]
+// CIR:   cir.stackrestore %[[STACK_RESTORE_PTR]]
+
+// LLVM: define{{.*}} void @f1(i32 %[[LEN_ARG:.*]]) {
+// LLVM:   %[[LEN_ADDR:.*]] = alloca i32
+// LLVM:   %[[SAVED_STACK:.*]] = alloca ptr
+// LLVM:   store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]]
+// LLVM:   %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]]
+// LLVM:   %[[LEN_SIZE_T:.*]] = sext i32 %[[LEN]] to i64
+// LLVM:   %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0()
+// LLVM:   store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]]
+// LLVM:   %[[TOTAL_LEN:.*]] = mul nuw i64 16, %[[LEN_SIZE_T]]
+// LLVM:   %[[ARR:.*]] = alloca i32, i64 %[[TOTAL_LEN]]
+// LLVM:   %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]]
+// LLVM:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]])
+
+// Note: VLA_EXPR0 below is emitted to capture debug info.
+
+// OGCG: define{{.*}} void @f1(i32 {{.*}} %[[LEN_ARG:.*]])
+// OGCG:   %[[LEN_ADDR:.*]] = alloca i32
+// OGCG:   %[[SAVED_STACK:.*]] = alloca ptr
+// OGCG:   %[[VLA_EXPR0:.*]] = alloca i64
+// OGCG:   store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]]
+// OGCG:   %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]]
+// OGCG:   %[[LEN_SIZE_T:.*]] = zext i32 %[[LEN]] to i64
+// OGCG:   %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0()
+// OGCG:   store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]]
+// OGCG:   %[[TOTAL_LEN:.*]] = mul nuw i64 16, %[[LEN_SIZE_T]]
+// OGCG:   %[[ARR:.*]] = alloca i32, i64 %[[TOTAL_LEN]]
+// OGCG:   store i64 %[[LEN_SIZE_T]], ptr %[[VLA_EXPR0]]
+// OGCG:   %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]]
+// OGCG:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]])
+
+void f2(int len) {
+  int arr[len + 4];
+}
+  
+// CIR: cir.func{{.*}} @f2(%[[LEN_ARG:.*]]: !s32i {{.*}})
+// CIR:   %[[LEN_ADDR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["len", init]
+// CIR:   %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr<!u8i>, !cir.ptr<!cir.ptr<!u8i>>, ["saved_stack"]
+// CIR:   cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]]
+// CIR:   %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]]
+// CIR:   %[[FOUR:.*]] = cir.const #cir.int<4> : !s32i
+// CIR:   %[[TOTAL_LEN:.*]] = cir.binop(add, %[[LEN]], %[[FOUR]]) nsw : !s32i
+// CIR:   %[[TOTAL_LEN_SIZE_T:.*]] = cir.cast integral %[[TOTAL_LEN]] : !s32i -> !u64i
+// CIR:   %[[STACK_PTR:.*]] = cir.stacksave
+// CIR:   cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]]
+// CIR:   %[[ARR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, %[[TOTAL_LEN_SIZE_T]] : !u64i, ["arr"]
+// CIR:   %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]]
+// CIR:   cir.stackrestore %[[STACK_RESTORE_PTR]]
+  
+// LLVM: define{{.*}} void @f2(i32 %[[LEN_ARG:.*]]) {
+// LLVM:   %[[LEN_ADDR:.*]] = alloca i32
+// LLVM:   %[[SAVED_STACK:.*]] = alloca ptr
+// LLVM:   store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]]
+// LLVM:   %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]]
+// LLVM:   %[[TOTAL_LEN:.*]] = add nsw i32 %[[LEN]], 4
+// LLVM:   %[[TOTAL_LEN_SIZE_T:.*]] = sext i32 %[[TOTAL_LEN]] to i64
+// LLVM:   %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0()
+// LLVM:   store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]]
+// LLVM:   %[[ARR:.*]] = alloca i32, i64 %[[TOTAL_LEN_SIZE_T]]
+// LLVM:   %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]]
+// LLVM:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]])
+  
+// Note: VLA_EXPR0 below is emitted to capture debug info.
+  
+// OGCG: define{{.*}} void @f2(i32 {{.*}} %[[LEN_ARG:.*]])
+// OGCG:   %[[LEN_ADDR:.*]] = alloca i32
+// OGCG:   %[[SAVED_STACK:.*]] = alloca ptr
+// OGCG:   %[[VLA_EXPR0:.*]] = alloca i64
+// OGCG:   store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]]
+// OGCG:   %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]]
+// OGCG:   %[[TOTAL_LEN:.*]] = add nsw i32 %[[LEN]], 4
+// OGCG:   %[[TOTAL_LEN_SIZE_T:.*]] = zext i32 %[[TOTAL_LEN]] to i64
+// OGCG:   %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0()
+// OGCG:   store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]]
+// OGCG:   %[[ARR:.*]] = alloca i32, i64 %[[TOTAL_LEN_SIZE_T]]
+// OGCG:   store i64 %[[TOTAL_LEN_SIZE_T]], ptr %[[VLA_EXPR0]]
+// OGCG:   %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]]
+// OGCG:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]])
+
+void f3(unsigned len) {
+  char s1[len];
+  unsigned i = 0u;
+  while (++i < len) {
+    char s2[i];
+  }
+}
+
+// CIR: cir.func{{.*}} @f3(%[[LEN_ARG:.*]]: !u32i {{.*}})
+// CIR:   %[[LEN_ADDR:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["len", init]
+// CIR:   %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr<!u8i>, !cir.ptr<!cir.ptr<!u8i>>, ["saved_stack"]
+// CIR:   cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]]
+// CIR:   %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]]
+// CIR:   %[[LEN_SIZE_T:.*]] = cir.cast integral %[[LEN]] : !u32i -> !u64i
+// CIR:   %[[STACK_PTR:.*]] = cir.stacksave
+// CIR:   cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]]
+// CIR:   %[[S1:.*]] = cir.alloca !s8i, !cir.ptr<!s8i>, %[[LEN_SIZE_T]] : !u64i, ["s1"]
+// CIR:   %[[I:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["i", init]
+// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
+// CIR:   cir.store{{.*}} %[[ZERO]], %[[I]]
+// CIR:   cir.scope {
+// CIR:     cir.while {
+// CIR:     %[[CUR_I:.*]] = cir.load{{.*}} %[[I]]
+// CIR:     %[[NEXT:.*]] = cir.unary(inc, %[[CUR_I]])
+// CIR:     cir.store{{.*}} %[[NEXT]], %[[I]]
+// CIR:     %[[LEN2:.*]] = cir.load{{.*}} %[[LEN_ADDR]]
+// CIR:     %[[CMP:.*]] = cir.cmp(lt, %[[NEXT]], %[[LEN2]])
+// CIR:     cir.condition(%[[CMP]])
+// CIR:   } do {
+// CIR:       cir.scope {
+// CIR:         %[[SAVED_STACK2:.*]] = cir.alloca !cir.ptr<!u8i>, !cir.ptr<!cir.ptr<!u8i>>, ["saved_stack"]
+// CIR:         %[[I_LEN:.*]] = cir.load{{.*}} %[[I]]
+// CIR:         %[[I_LEN_SIZE_T2:.*]] = cir.cast integral %[[I_LEN]] : !u32i -> !u64i
+// CIR:         %[[STACK_PTR2:.*]] = cir.stacksave
+// CIR:         cir.store{{.*}} %[[STACK_PTR2]], %[[SAVED_STACK2]]
+// CIR:         %[[S2:.*]] = cir.alloca !s8i, !cir.ptr<!s8i>, %[[I_LEN_SIZE_T2]] : !u64i, ["s2"]
+// CIR:         %[[SAVED_RESTORE_PTR2:.*]] = cir.load{{.*}} %[[SAVED_STACK2]]
+// CIR:         cir.stackrestore %[[SAVED_RESTORE_PTR2]]
+// CIR:       }
+// CIR:       cir.yield
+// CIR:     }
+// CIR:   }
+// CIR:   %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]]
+// CIR:   cir.stackrestore %[[STACK_RESTORE_PTR]]
+
+// LLVM: define{{.*}} void @f3(i32 %[[LEN_ARG:.*]]) {
+// LLVM:   %[[SAVED_STACK2:.*]] = alloca ptr
+// LLVM:   %[[LEN_ADDR:.*]] = alloca i32
+// LLVM:   %[[SAVED_STACK:.*]] = alloca ptr
+// LLVM:   store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]]
+// LLVM:   %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]]
+// LLVM:   %[[LEN_SIZE_T:.*]] = zext i32 %[[LEN]] to i64
+// LLVM:   %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0()
+// LLVM:   store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]]
+// LLVM:   %[[S1:.*]] = alloca i8, i64 %[[LEN_SIZE_T]]
+// LLVM:   %[[I:.*]] = alloca i32
+// LLVM:   store i32 0, ptr %[[I]]
+// LLVM:   br label %[[WHILE_START:.*]]
+// LLVM: [[WHILE_START]]:
+// LLVM:   br label %[[WHILE_COND:.*]]
+// LLVM: [[WHILE_COND]]:
+// LLVM:   %[[CUR_I:.*]] = load i32, ptr %[[I]]
+// LLVM:   %[[NEXT:.*]] = add i32 %[[CUR_I]], 1
+// LLVM:   store i32 %[[NEXT]], ptr %[[I]]
+// LLVM:   %[[LEN2:.*]] = load i32, ptr %[[LEN_ADDR]]
+// LLVM:   %[[CMP:.*]] = icmp ult i32 %[[NEXT]], %[[LEN2]]
+// LLVM:   br i1 %[[CMP]], label %[[WHILE_BODY:.*]], label %[[WHILE_END:.*]]
+// LLVM: [[WHILE_BODY]]:
+// LLVM:   br label %[[WHILE_BODY2:.*]]
+// LLVM: [[WHILE_BODY2]]:
+// LLVM:   %[[I_LEN:.*]] = load i32, ptr %[[I]]
+// LLVM:   %[[I_LEN_SIZE_T2:.*]] = zext i32 %[[I_LEN]] to i64
+// LLVM:   %[[STACK_PTR2:.*]] = call ptr @llvm.stacksave.p0()
+// LLVM:   store ptr %[[STACK_PTR2]], ptr %[[SAVED_STACK2]]
+// LLVM:   %[[S2:.*]] = alloca i8, i64 %[[I_LEN_SIZE_T2]]
+// LLVM:   %[[STACK_RESTORE_PTR2:.*]] = load ptr, ptr %[[SAVED_STACK2]]
+// LLVM:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR2]])
+// LLVM:   br label %[[WHILE_BODY_END:.*]]
+// LLVM: [[WHILE_BODY_END]]:
+// LLVM:   br label %[[WHILE_COND]]
+// LLVM: [[WHILE_END]]:
+// LLVM:   br label %[[F3_END:.*]]
+// LLVM: [[F3_END]]:
+// LLVM:   %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]]
+// LLVM:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]])
+
+// Note: VLA_EXPR0 and VLA_EXPR1 below are emitted to capture debug info.
+
+// OGCG: define{{.*}} void @f3(i32 {{.*}} %[[LEN_ARG:.*]])
+// OGCG:   %[[LEN_ADDR:.*]] = alloca i32
+// OGCG:   %[[SAVED_STACK:.*]] = alloca ptr
+// OGCG:   %[[VLA_EXPR0:.*]] = alloca i64
+// OGCG:   %[[I:.*]] = alloca i32
+// OGCG:   %[[SAVED_STACK1:.*]] = alloca ptr
+// OGCG:   %[[VLA_EXPR1:.*]] = alloca i64
+// OGCG:   store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]]
+// OGCG:   %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]]
+// OGCG:   %[[LEN_SIZE_T:.*]] = zext i32 %[[LEN]] to i64
+// OGCG:   %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0()
+// OGCG:   store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]]
+// OGCG:   %[[S1:.*]] = alloca i8, i64 %[[LEN_SIZE_T]]
+// OGCG:   store i64 %[[LEN_SIZE_T]], ptr %[[VLA_EXPR0]]
+// OGCG:   br label %[[WHILE_COND:.*]]
+// OGCG: [[WHILE_COND]]:
+// OGCG:   %[[CUR_I:.*]] = load i32, ptr %[[I]]
+// OGCG:   %[[NEXT:.*]] = add i32 %[[CUR_I]], 1
+// OGCG:   store i32 %[[NEXT]], ptr %[[I]]
+// OGCG:   %[[LEN2:.*]] = load i32, ptr %[[LEN_ADDR]]
+// OGCG:   %[[CMP:.*]] = icmp ult i32 %[[NEXT]], %[[LEN2]]
+// OGCG:   br i1 %[[CMP]], label %[[WHILE_BODY:.*]], label %[[WHILE_END:.*]]
+// OGCG: [[WHILE_BODY]]:
+// OGCG:   %[[I_LEN:.*]] = load i32, ptr %[[I]]
+// OGCG:   %[[I_LEN_SIZE_T:.*]] = zext i32 %[[I_LEN]] to i64
+// OGCG:   %[[STACK_PTR1:.*]] = call ptr @llvm.stacksave.p0()
+// OGCG:   store ptr %[[STACK_PTR1]], ptr %[[SAVED_STACK1]]
+// OGCG:   %[[S2:.*]] = alloca i8, i64 %[[I_LEN_SIZE_T]]
+// OGCG:   store i64 %[[I_LEN_SIZE_T]], ptr %[[VLA_EXPR1]]
+// OGCG:   %[[STACK_RESTORE_PTR1:.*]] = load ptr, ptr %[[SAVED_STACK1]]
+// OGCG:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR1]])
+// OGCG:   br label %[[WHILE_COND]]
+// OGCG: [[WHILE_END]]:
+// OGCG:   %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]]
+// OGCG:   call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]])
+
+
+// The following test case is disabled because it runs into a bug (unrelated
+// to VLA) in the handling of cleanups in loops with break statements.
+//
+// void f4(unsigned len) {
+//     char s1[len];
+//     while (1) {
+//       char s2[len];
+//       if (1)
+//         break;
+//     }
+// }
+  
+\ No newline at end of file
diff --git a/clang/test/CXX/module/module.import/p6.cpp b/clang/test/CXX/module/module.import/p6.cpp
index cb2d799..9e378a5 100644
--- a/clang/test/CXX/module/module.import/p6.cpp
+++ b/clang/test/CXX/module/module.import/p6.cpp
@@ -3,6 +3,9 @@
 
 // RUN: %clang_cc1 -std=c++20 -x c++-header %t/bad-header-unit.h \
 // RUN:  -emit-header-unit -o %t/bad-header-unit.pcm -verify
+// RUN: %clang_cc1 -std=c++20 -x c++-header %t/bad-header-unit-declspec.h \
+// RUN:  -emit-header-unit -o %t/bad-header-unit.pcm -verify \
+// RUN:  -fdeclspec
 
 //--- bad-header-unit.h
 
@@ -77,3 +80,13 @@ template <typename T> bool b() {
 }
 
 inline bool B = b<int>();
+
+__attribute__((weak)) int weak_fun_definition() { return 42; }
+
+__attribute__((weak)) int weak_var_definition = 42;
+
+//--- bad-header-unit-declspec.h
+
+/* The cases below should compile without diagnostics.  */
+
+__declspec(selectany) int selectany_var_definition = 42; // expected-no-diagnostics
diff --git a/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp b/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp
index 4fb977a..e40b2d7 100644
--- a/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp
+++ b/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp
@@ -3,6 +3,8 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin10 -emit-llvm -fcxx-exceptions -fexceptions %s -O2 -o - | FileCheck %s --check-prefix=ARM-DARWIN
 // RUN: %clang_cc1 -triple arm-unknown-gnueabi -emit-llvm -fcxx-exceptions -fexceptions %s -O2 -o - | FileCheck %s --check-prefix=ARM-EABI
 // RUN: %clang_cc1 -triple mipsel-unknown-unknown -emit-llvm -fcxx-exceptions -fexceptions %s -O2 -o - | FileCheck %s --check-prefix=MIPS
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -fcxx-exceptions -fexceptions -exception-model=seh %s -O2 -o - | FileCheck %s --check-prefix=MINGW-X86-64
+// RUN: %clang_cc1 -triple thumbv7-windows-gnu -emit-llvm -fcxx-exceptions -fexceptions -exception-model=seh %s -O2 -o - | FileCheck %s --check-prefix=MINGW-ARMV7
 
 void foo();
 void test() {
@@ -25,9 +27,15 @@ void test() {
 // ARM-EABI-NEXT:   [[T1:%.*]] = getelementptr i8, ptr [[EXN]], i32 88
 // MIPS:            [[T0:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXN:%.*]]) [[NUW:#[0-9]+]]
 // MIPS-NEXT:       [[T1:%.*]] = getelementptr i8, ptr [[EXN]], i32 24
+// MINGW-X86-64:     [[T0:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXN:%.*]]) [[NUW:#[0-9]+]]
+// MINGW-X86-64-NEXT:[[T1:%.*]] = getelementptr i8, ptr [[EXN]], i64 64
+// MINGW-ARMV7:      [[T0:%.*]] = tail call arm_aapcs_vfpcc ptr @__cxa_begin_catch(ptr [[EXN:%.*]]) [[NUW:#[0-9]+]]
+// MINGW-ARMV7-NEXT: [[T1:%.*]] = getelementptr i8, ptr [[EXN]], i32 48
 
 // X86-64: attributes [[NUW]] = { nounwind }
 // X86-32: attributes [[NUW]] = { nounwind }
 // ARM-DARWIN: attributes [[NUW]] = { nounwind }
 // ARM-EABI: attributes [[NUW]] = { nounwind }
 // MIPS: attributes [[NUW]] = { nounwind }
+// MINGW-X86-64: attributes [[NUW]] = { nounwind }
+// MINGW-ARMV7: attributes [[NUW]] = { nounwind }
diff --git a/clang/test/CodeGenHLSL/vk-features/maximal_reconvergence.hlsl b/clang/test/CodeGenHLSL/vk-features/maximal_reconvergence.hlsl
new file mode 100644
index 0000000..f23ac7c
--- /dev/null
+++ b/clang/test/CodeGenHLSL/vk-features/maximal_reconvergence.hlsl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple spirv1.6-unknown-vulkan1.3-compute -fspv-enable-maximal-reconvergence -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -triple spirv1.6-unknown-vulkan1.3-compute -hlsl-entry test -fspv-enable-maximal-reconvergence -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=CHECK-ENTRY
+
+[numthreads(1,1,1)]
+void main() {
+// CHECK: define void @main() [[attributeNumber:#[0-9]+]] {
+}
+
+// CHECK: attributes [[attributeNumber]] = {{.*}} "enable-maximal-reconvergence"="true" {{.*}}
+
+
+[numthreads(1,1,1)]
+void test() {
+// CHECK-ENTRY: define void @test() [[attributeNumber:#[0-9]+]] {
+}
+    
+// CHECK-ENTRY: attributes [[attributeNumber]] = {{.*}} "enable-maximal-reconvergence"="true" {{.*}}
diff --git a/clang/test/DebugInfo/CXX/versioned-language.cpp b/clang/test/DebugInfo/CXX/versioned-language.cpp
new file mode 100644
index 0000000..4cb2b29
--- /dev/null
+++ b/clang/test/DebugInfo/CXX/versioned-language.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 -std=c++98 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion"
+//
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++98 | FileCheck %s --check-prefix=CHECK-CPP98
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++03 | FileCheck %s --check-prefix=CHECK-CPP03
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++11 | FileCheck %s --check-prefix=CHECK-CPP11
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++14 | FileCheck %s --check-prefix=CHECK-CPP14
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++17 | FileCheck %s --check-prefix=CHECK-CPP17
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++20 | FileCheck %s --check-prefix=CHECK-CPP20
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++23 | FileCheck %s --check-prefix=CHECK-CPP23
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++2c | FileCheck %s --check-prefix=CHECK-CPP2C
+
+struct Foo {} globalVar;
+
+// CHECK-CPP98:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 199711
+// FIXME: C++03 technically has no official standard version code. From Clang's point of view C++03 and C++98 are interchangable.
+// CHECK-CPP03:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 199711
+// CHECK-CPP11:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201103
+// CHECK-CPP14:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201402
+// CHECK-CPP17:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201703
+// CHECK-CPP20:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 202002
+// CHECK-CPP23:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 202302
+// CHECK-CPP2C:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 202400
diff --git a/clang/test/DebugInfo/Generic/versioned-language.c b/clang/test/DebugInfo/Generic/versioned-language.c
new file mode 100644
index 0000000..1faa7b4
--- /dev/null
+++ b/clang/test/DebugInfo/Generic/versioned-language.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 -std=c99 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion"
+//
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c89 | FileCheck %s --check-prefix=CHECK-C89 --implicit-check-not "sourceLanguageVersion"
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c99 | FileCheck %s --check-prefix=CHECK-C99
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c17 | FileCheck %s --check-prefix=CHECK-C17
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c23 | FileCheck %s --check-prefix=CHECK-C23
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c2y | FileCheck %s --check-prefix=CHECK-C2Y
+
+int globalVar = 10;
+
+// CHECK-C89: !DICompileUnit(sourceLanguageName: DW_LNAME_C,
+// CHECK-C99: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 199901
+// CHECK-C11: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 201112
+// CHECK-C17: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 201710
+// CHECK-C23: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 202311
+// CHECK-C2Y: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 202400
diff --git a/clang/test/DebugInfo/ObjC/versioned-language.m b/clang/test/DebugInfo/ObjC/versioned-language.m
new file mode 100644
index 0000000..178c47b
--- /dev/null
+++ b/clang/test/DebugInfo/ObjC/versioned-language.m
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion"
+//
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageVersion" --check-prefix=CHECK-OBJC
+
+int globalVar = 10;
+
+// CHECK-OBJC: !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC,
diff --git a/clang/test/DebugInfo/ObjCXX/versioned-language.mm b/clang/test/DebugInfo/ObjCXX/versioned-language.mm
new file mode 100644
index 0000000..bfdce46
--- /dev/null
+++ b/clang/test/DebugInfo/ObjCXX/versioned-language.mm
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion"
+//
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageVersion" --check-prefix=CHECK-OBJCXX
+
+int globalVar = 10;
+
+// CHECK-OBJCXX: !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC_plus_plus,
diff --git a/compiler-rt/lib/asan/asan_fake_stack.cpp b/compiler-rt/lib/asan/asan_fake_stack.cpp
index c3ed252..d3fa953 100644
--- a/compiler-rt/lib/asan/asan_fake_stack.cpp
+++ b/compiler-rt/lib/asan/asan_fake_stack.cpp
@@ -28,7 +28,7 @@ static const u64 kAllocaRedzoneMask = 31UL;
 // For small size classes inline PoisonShadow for better performance.
 ALWAYS_INLINE void SetShadow(uptr ptr, uptr size, uptr class_id, u64 magic) {
   CHECK(AddrIsAlignedByGranularity(ptr + size));
-  u64 *shadow = reinterpret_cast<u64*>(MemToShadow(ptr));
+  u64* shadow = reinterpret_cast<u64*>(MemToShadow(ptr));
   if (ASAN_SHADOW_SCALE == 3 && class_id <= 6) {
     // This code expects ASAN_SHADOW_SCALE=3.
     for (uptr i = 0; i < (((uptr)1) << class_id); i++) {
@@ -47,7 +47,7 @@ ALWAYS_INLINE void SetShadow(uptr ptr, uptr size, uptr class_id, u64 magic) {
   }
 }
 
-FakeStack *FakeStack::Create(uptr stack_size_log) {
+FakeStack* FakeStack::Create(uptr stack_size_log) {
   static uptr kMinStackSizeLog = 16;
   static uptr kMaxStackSizeLog = FIRST_32_SECOND_64(24, 28);
   if (stack_size_log < kMinStackSizeLog)
@@ -57,7 +57,7 @@ FakeStack *FakeStack::Create(uptr stack_size_log) {
   CHECK_LE(kMaxStackFrameSizeLog, stack_size_log);
   uptr size = RequiredSize(stack_size_log);
   uptr padded_size = size + kMaxStackFrameSize;
-  void *true_res = reinterpret_cast<void *>(
+  void* true_res = reinterpret_cast<void*>(
       flags()->uar_noreserve ? MmapNoReserveOrDie(padded_size, "FakeStack")
                              : MmapOrDie(padded_size, "FakeStack"));
   // GetFrame() requires the property that
@@ -66,20 +66,20 @@ FakeStack *FakeStack::Create(uptr stack_size_log) {
   // We didn't use MmapAlignedOrDieOnFatalError, because it requires that the
   // *size* is a power of 2, which is an overly strong condition.
   static_assert(alignof(FakeStack) <= kMaxStackFrameSize);
-  FakeStack *res = reinterpret_cast<FakeStack *>(
+  FakeStack* res = reinterpret_cast<FakeStack*>(
       RoundUpTo(
           (uptr)true_res + kFlagsOffset + SizeRequiredForFlags(stack_size_log),
           kMaxStackFrameSize) -
       kFlagsOffset - SizeRequiredForFlags(stack_size_log));
   res->true_start = true_res;
   res->stack_size_log_ = stack_size_log;
-  u8 *p = reinterpret_cast<u8 *>(res);
+  u8* p = reinterpret_cast<u8*>(res);
   VReport(1,
           "T%d: FakeStack created: %p -- %p stack_size_log: %zd; "
           "mmapped %zdK, noreserve=%d, true_start: %p, start of first frame: "
           "0x%zx\n",
-          GetCurrentTidOrInvalid(), (void *)p,
-          (void *)(p + FakeStack::RequiredSize(stack_size_log)), stack_size_log,
+          GetCurrentTidOrInvalid(), (void*)p,
+          (void*)(p + FakeStack::RequiredSize(stack_size_log)), stack_size_log,
           size >> 10, flags()->uar_noreserve, res->true_start,
           res->GetFrame(stack_size_log, /*class_id*/ 0, /*pos*/ 0));
   return res;
@@ -109,14 +109,14 @@ void FakeStack::PoisonAll(u8 magic) {
 #if !defined(_MSC_VER) || defined(__clang__)
 ALWAYS_INLINE USED
 #endif
-FakeFrame *FakeStack::Allocate(uptr stack_size_log, uptr class_id,
-                               uptr real_stack) {
+    FakeFrame* FakeStack::Allocate(uptr stack_size_log, uptr class_id,
+                                   uptr real_stack) {
   CHECK_LT(class_id, kNumberOfSizeClasses);
   if (needs_gc_)
     GC(real_stack);
-  uptr &hint_position = hint_position_[class_id];
+  uptr& hint_position = hint_position_[class_id];
   const int num_iter = NumberOfFrames(stack_size_log, class_id);
-  u8 *flags = GetFlags(stack_size_log, class_id);
+  u8* flags = GetFlags(stack_size_log, class_id);
   for (int i = 0; i < num_iter; i++) {
     uptr pos = ModuloNumberOfFrames(stack_size_log, class_id, hint_position++);
     // This part is tricky. On one hand, checking and setting flags[pos]
@@ -126,22 +126,24 @@ FakeFrame *FakeStack::Allocate(uptr stack_size_log, uptr class_id,
     // and so will not touch this particular byte. So, it is safe to do this
     // with regular non-atomic load and store (at least I was not able to make
     // this code crash).
-    if (flags[pos]) continue;
+    if (flags[pos])
+      continue;
     flags[pos] = 1;
-    FakeFrame *res = reinterpret_cast<FakeFrame *>(
-        GetFrame(stack_size_log, class_id, pos));
+    FakeFrame* res =
+        reinterpret_cast<FakeFrame*>(GetFrame(stack_size_log, class_id, pos));
     res->real_stack = real_stack;
     *SavedFlagPtr(reinterpret_cast<uptr>(res), class_id) = &flags[pos];
     return res;
   }
-  return nullptr; // We are out of fake stack.
+  return nullptr;  // We are out of fake stack.
 }
 
-uptr FakeStack::AddrIsInFakeStack(uptr ptr, uptr *frame_beg, uptr *frame_end) {
+uptr FakeStack::AddrIsInFakeStack(uptr ptr, uptr* frame_beg, uptr* frame_end) {
   uptr stack_size_log = this->stack_size_log();
   uptr beg = reinterpret_cast<uptr>(GetFrame(stack_size_log, 0, 0));
   uptr end = reinterpret_cast<uptr>(this) + RequiredSize(stack_size_log);
-  if (ptr < beg || ptr >= end) return 0;
+  if (ptr < beg || ptr >= end)
+    return 0;
   uptr class_id = (ptr - beg) >> stack_size_log;
   uptr base = beg + (class_id << stack_size_log);
   CHECK_LE(base, ptr);
@@ -153,9 +155,7 @@ uptr FakeStack::AddrIsInFakeStack(uptr ptr, uptr *frame_beg, uptr *frame_end) {
   return res;
 }
 
-void FakeStack::HandleNoReturn() {
-  needs_gc_ = true;
-}
+void FakeStack::HandleNoReturn() { needs_gc_ = true; }
 
 // Hack: The statement below is not true if we take into account sigaltstack or
 // makecontext. It should be possible to make GC to discard wrong stack frame if
@@ -170,7 +170,7 @@ void FakeStack::HandleNoReturn() {
 // We do it based on their 'real_stack' values -- everything that is lower
 // than the current real_stack is garbage.
 NOINLINE void FakeStack::GC(uptr real_stack) {
-  AsanThread *curr_thread = GetCurrentThread();
+  AsanThread* curr_thread = GetCurrentThread();
   if (!curr_thread)
     return;  // Try again when we have a thread.
   auto top = curr_thread->stack_top();
@@ -179,12 +179,13 @@ NOINLINE void FakeStack::GC(uptr real_stack) {
     return;  // Not the default stack.
 
   for (uptr class_id = 0; class_id < kNumberOfSizeClasses; class_id++) {
-    u8 *flags = GetFlags(stack_size_log(), class_id);
+    u8* flags = GetFlags(stack_size_log(), class_id);
     for (uptr i = 0, n = NumberOfFrames(stack_size_log(), class_id); i < n;
          i++) {
-      if (flags[i] == 0) continue;  // not allocated.
-      FakeFrame *ff = reinterpret_cast<FakeFrame *>(
-          GetFrame(stack_size_log(), class_id, i));
+      if (flags[i] == 0)
+        continue;  // not allocated.
+      FakeFrame* ff =
+          reinterpret_cast<FakeFrame*>(GetFrame(stack_size_log(), class_id, i));
       // GC only on the default stack.
       if (bottom < ff->real_stack && ff->real_stack < real_stack) {
         flags[i] = 0;
@@ -197,14 +198,15 @@ NOINLINE void FakeStack::GC(uptr real_stack) {
   needs_gc_ = false;
 }
 
-void FakeStack::ForEachFakeFrame(RangeIteratorCallback callback, void *arg) {
+void FakeStack::ForEachFakeFrame(RangeIteratorCallback callback, void* arg) {
   for (uptr class_id = 0; class_id < kNumberOfSizeClasses; class_id++) {
-    u8 *flags = GetFlags(stack_size_log(), class_id);
+    u8* flags = GetFlags(stack_size_log(), class_id);
     for (uptr i = 0, n = NumberOfFrames(stack_size_log(), class_id); i < n;
          i++) {
-      if (flags[i] == 0) continue;  // not allocated.
-      FakeFrame *ff = reinterpret_cast<FakeFrame *>(
-          GetFrame(stack_size_log(), class_id, i));
+      if (flags[i] == 0)
+        continue;  // not allocated.
+      FakeFrame* ff =
+          reinterpret_cast<FakeFrame*>(GetFrame(stack_size_log(), class_id, i));
       uptr begin = reinterpret_cast<uptr>(ff);
       callback(begin, begin + FakeStack::BytesInSizeClass(class_id), arg);
     }
@@ -212,44 +214,51 @@ void FakeStack::ForEachFakeFrame(RangeIteratorCallback callback, void *arg) {
 }
 
 #if (SANITIZER_LINUX && !SANITIZER_ANDROID) || SANITIZER_FUCHSIA
-static THREADLOCAL FakeStack *fake_stack_tls;
+static THREADLOCAL FakeStack* fake_stack_tls;
 
-FakeStack *GetTLSFakeStack() {
-  return fake_stack_tls;
-}
-void SetTLSFakeStack(FakeStack *fs) {
-  fake_stack_tls = fs;
-}
+static FakeStack* GetTLSFakeStack() { return fake_stack_tls; }
+static void SetTLSFakeStack(FakeStack* fs) { fake_stack_tls = fs; }
+void ResetTLSFakeStack() { fake_stack_tls = nullptr; }
 #else
-FakeStack *GetTLSFakeStack() { return 0; }
-void SetTLSFakeStack(FakeStack *fs) { }
+static FakeStack* GetTLSFakeStack() { return nullptr; }
+static void SetTLSFakeStack(FakeStack*) {}
+void ResetTLSFakeStack() {}
 #endif  // (SANITIZER_LINUX && !SANITIZER_ANDROID) || SANITIZER_FUCHSIA
 
-static FakeStack *GetFakeStack() {
-  AsanThread *t = GetCurrentThread();
-  if (!t) return nullptr;
+static FakeStack* GetFakeStack() {
+  AsanThread* t = GetCurrentThread();
+  if (!t)
+    return nullptr;
   return t->get_or_create_fake_stack();
 }
 
-static FakeStack *GetFakeStackFast() {
-  if (FakeStack *fs = GetTLSFakeStack())
+static FakeStack* GetFakeStackFast() {
+  FakeStack* fs = GetTLSFakeStack();
+  if (LIKELY(fs))
     return fs;
   if (!__asan_option_detect_stack_use_after_return)
     return nullptr;
-  return GetFakeStack();
+  fs = GetFakeStack();
+  if (LIKELY(fs))
+    SetTLSFakeStack(fs);
+  return fs;
 }
 
-static FakeStack *GetFakeStackFastAlways() {
-  if (FakeStack *fs = GetTLSFakeStack())
+static FakeStack* GetFakeStackFastAlways() {
+  FakeStack* fs = GetTLSFakeStack();
+  if (LIKELY(fs))
     return fs;
-  return GetFakeStack();
+  fs = GetFakeStack();
+  if (LIKELY(fs))
+    SetTLSFakeStack(fs);
+  return fs;
 }
 
 static ALWAYS_INLINE uptr OnMalloc(uptr class_id, uptr size) {
-  FakeStack *fs = GetFakeStackFast();
+  FakeStack* fs = GetFakeStackFast();
   if (!fs)
     return 0;
-  FakeFrame *ff =
+  FakeFrame* ff =
       fs->Allocate(fs->stack_size_log(), class_id, GET_CURRENT_FRAME());
   if (!ff)
     return 0;  // Out of fake stack.
@@ -259,10 +268,10 @@ static ALWAYS_INLINE uptr OnMalloc(uptr class_id, uptr size) {
 }
 
 static ALWAYS_INLINE uptr OnMallocAlways(uptr class_id, uptr size) {
-  FakeStack *fs = GetFakeStackFastAlways();
+  FakeStack* fs = GetFakeStackFastAlways();
   if (!fs)
     return 0;
-  FakeFrame *ff =
+  FakeFrame* ff =
       fs->Allocate(fs->stack_size_log(), class_id, GET_CURRENT_FRAME());
   if (!ff)
     return 0;  // Out of fake stack.
@@ -276,17 +285,17 @@ static ALWAYS_INLINE void OnFree(uptr ptr, uptr class_id, uptr size) {
   SetShadow(ptr, size, class_id, kMagic8);
 }
 
-} // namespace __asan
+}  // namespace __asan
 
 // ---------------------- Interface ---------------- {{{1
 using namespace __asan;
 #define DEFINE_STACK_MALLOC_FREE_WITH_CLASS_ID(class_id)                      \
   extern "C" SANITIZER_INTERFACE_ATTRIBUTE uptr                               \
-      __asan_stack_malloc_##class_id(uptr size) {                             \
+  __asan_stack_malloc_##class_id(uptr size) {                                 \
     return OnMalloc(class_id, size);                                          \
   }                                                                           \
   extern "C" SANITIZER_INTERFACE_ATTRIBUTE uptr                               \
-      __asan_stack_malloc_always_##class_id(uptr size) {                      \
+  __asan_stack_malloc_always_##class_id(uptr size) {                          \
     return OnMallocAlways(class_id, size);                                    \
   }                                                                           \
   extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __asan_stack_free_##class_id( \
@@ -311,21 +320,25 @@ extern "C" {
 // -asan-use-after-return=never, after modal UAR flag lands
 // (https://github.com/google/sanitizers/issues/1394)
 SANITIZER_INTERFACE_ATTRIBUTE
-void *__asan_get_current_fake_stack() { return GetFakeStackFast(); }
+void* __asan_get_current_fake_stack() { return GetFakeStackFast(); }
 
 SANITIZER_INTERFACE_ATTRIBUTE
-void *__asan_addr_is_in_fake_stack(void *fake_stack, void *addr, void **beg,
-                                   void **end) {
-  FakeStack *fs = reinterpret_cast<FakeStack*>(fake_stack);
-  if (!fs) return nullptr;
+void* __asan_addr_is_in_fake_stack(void* fake_stack, void* addr, void** beg,
+                                   void** end) {
+  FakeStack* fs = reinterpret_cast<FakeStack*>(fake_stack);
+  if (!fs)
+    return nullptr;
   uptr frame_beg, frame_end;
-  FakeFrame *frame = reinterpret_cast<FakeFrame *>(fs->AddrIsInFakeStack(
+  FakeFrame* frame = reinterpret_cast<FakeFrame*>(fs->AddrIsInFakeStack(
       reinterpret_cast<uptr>(addr), &frame_beg, &frame_end));
-  if (!frame) return nullptr;
+  if (!frame)
+    return nullptr;
   if (frame->magic != kCurrentStackFrameMagic)
     return nullptr;
-  if (beg) *beg = reinterpret_cast<void*>(frame_beg);
-  if (end) *end = reinterpret_cast<void*>(frame_end);
+  if (beg)
+    *beg = reinterpret_cast<void*>(frame_beg);
+  if (end)
+    *end = reinterpret_cast<void*>(frame_end);
   return reinterpret_cast<void*>(frame->real_stack);
 }
 
@@ -344,9 +357,9 @@ void __asan_alloca_poison(uptr addr, uptr size) {
 
 SANITIZER_INTERFACE_ATTRIBUTE
 void __asan_allocas_unpoison(uptr top, uptr bottom) {
-  if ((!top) || (top > bottom)) return;
-  REAL(memset)
-  (reinterpret_cast<void *>(MemToShadow(top)), 0,
-   (bottom - top) / ASAN_SHADOW_GRANULARITY);
+  if ((!top) || (top > bottom))
+    return;
+  REAL(memset)(reinterpret_cast<void*>(MemToShadow(top)), 0,
+               (bottom - top) / ASAN_SHADOW_GRANULARITY);
 }
-} // extern "C"
+}  // extern "C"
diff --git a/compiler-rt/lib/asan/asan_fake_stack.h b/compiler-rt/lib/asan/asan_fake_stack.h
index 50706e6..593c137 100644
--- a/compiler-rt/lib/asan/asan_fake_stack.h
+++ b/compiler-rt/lib/asan/asan_fake_stack.h
@@ -195,8 +195,7 @@ class FakeStack {
   void *true_start;
 };
 
-FakeStack *GetTLSFakeStack();
-void SetTLSFakeStack(FakeStack *fs);
+void ResetTLSFakeStack();
 
 }  // namespace __asan
 
diff --git a/compiler-rt/lib/asan/asan_thread.cpp b/compiler-rt/lib/asan/asan_thread.cpp
index 2627ae1..0ed58bb 100644
--- a/compiler-rt/lib/asan/asan_thread.cpp
+++ b/compiler-rt/lib/asan/asan_thread.cpp
@@ -163,7 +163,7 @@ void AsanThread::StartSwitchFiber(FakeStack **fake_stack_save, uptr bottom,
   if (fake_stack_save)
     *fake_stack_save = fake_stack_;
   fake_stack_ = nullptr;
-  SetTLSFakeStack(nullptr);
+  ResetTLSFakeStack();
   // if fake_stack_save is null, the fiber will die, delete the fakestack
   if (!fake_stack_save && current_fake_stack)
     current_fake_stack->Destroy(this->tid());
@@ -177,8 +177,8 @@ void AsanThread::FinishSwitchFiber(FakeStack *fake_stack_save, uptr *bottom_old,
   }
 
   if (fake_stack_save) {
-    SetTLSFakeStack(fake_stack_save);
     fake_stack_ = fake_stack_save;
+    ResetTLSFakeStack();
   }
 
   if (bottom_old)
@@ -242,7 +242,7 @@ FakeStack *AsanThread::AsyncSignalSafeLazyInitFakeStack() {
         Max(stack_size_log, static_cast<uptr>(flags()->min_uar_stack_size_log));
     fake_stack_ = FakeStack::Create(stack_size_log);
     DCHECK_EQ(GetCurrentThread(), this);
-    SetTLSFakeStack(fake_stack_);
+    ResetTLSFakeStack();
     return fake_stack_;
   }
   return nullptr;
diff --git a/compiler-rt/lib/asan/asan_thread.h b/compiler-rt/lib/asan/asan_thread.h
index 12f0cc7..19b7f34 100644
--- a/compiler-rt/lib/asan/asan_thread.h
+++ b/compiler-rt/lib/asan/asan_thread.h
@@ -104,7 +104,7 @@ class AsanThread {
     if (!fake_stack_) return;
     FakeStack *t = fake_stack_;
     fake_stack_ = nullptr;
-    SetTLSFakeStack(nullptr);
+    ResetTLSFakeStack();
     t->Destroy(tid);
   }
 
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index d8d0956..20a0919 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1289,16 +1289,7 @@ bool CheckForCoindexedObject(parser::ContextualMessages &,
     const std::optional<ActualArgument> &, const std::string &procName,
     const std::string &argName);
 
-inline bool IsCUDADeviceSymbol(const Symbol &sym) {
-  if (const auto *details =
-          sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
-    if (details->cudaDataAttr() &&
-        *details->cudaDataAttr() != common::CUDADataAttr::Pinned) {
-      return true;
-    }
-  }
-  return false;
-}
+bool IsCUDADeviceSymbol(const Symbol &sym);
 
 inline bool IsCUDAManagedOrUnifiedSymbol(const Symbol &sym) {
   if (const auto *details =
diff --git a/flang/include/flang/Optimizer/CMakeLists.txt b/flang/include/flang/Optimizer/CMakeLists.txt
index 3336ac9..68af52f 100644
--- a/flang/include/flang/Optimizer/CMakeLists.txt
+++ b/flang/include/flang/Optimizer/CMakeLists.txt
@@ -2,4 +2,5 @@ add_subdirectory(CodeGen)
 add_subdirectory(Dialect)
 add_subdirectory(HLFIR)
 add_subdirectory(Transforms)
+add_subdirectory(OpenACC)
 add_subdirectory(OpenMP)
diff --git a/flang/include/flang/Optimizer/OpenACC/CMakeLists.txt b/flang/include/flang/Optimizer/OpenACC/CMakeLists.txt
new file mode 100644
index 0000000..a032488
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name FIROpenACC)
+
+add_public_tablegen_target(FIROpenACCPassesIncGen)
diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.h b/flang/include/flang/Optimizer/OpenACC/Passes.h
new file mode 100644
index 0000000..0627cc8
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.h
@@ -0,0 +1,33 @@
+//===- Passes.h - OpenACC pass entry points -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header declares the OpenACC passes specific to Fortran and FIR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_OPENACC_PASSES_H
+#define FORTRAN_OPTIMIZER_OPENACC_PASSES_H
+
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+
+#include <memory>
+
+namespace fir {
+namespace acc {
+#define GEN_PASS_DECL
+#define GEN_PASS_REGISTRATION
+#include "flang/Optimizer/OpenACC/Passes.h.inc"
+
+std::unique_ptr<mlir::Pass> createACCRecipeBufferizationPass();
+
+} // namespace acc
+} // namespace fir
+
+#endif // FORTRAN_OPTIMIZER_OPENACC_PASSES_H
diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.td b/flang/include/flang/Optimizer/OpenACC/Passes.td
new file mode 100644
index 0000000..3c127b3
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.td
@@ -0,0 +1,36 @@
+//===-- Passes.td - flang OpenACC pass definitions -----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_OPENACC_PASSES
+#define FORTRAN_OPTIMIZER_OPENACC_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def ACCRecipeBufferization
+    : Pass<"fir-acc-recipe-bufferization", "mlir::ModuleOp"> {
+  let summary = "Rewrite acc.*.recipe box values to ref<box> and update uses";
+  let description = [{
+    Bufferizes OpenACC recipes that operate on fir.box<T> so their type and
+    region block arguments become fir.ref<fir.box<T>> instead. This applies to
+    acc.private.recipe, acc.firstprivate.recipe (including copy region), and
+    acc.reduction.recipe (including combiner region).
+
+    For affected regions, the pass inserts required loads at the beginning of
+    the region to preserve original uses after argument type changes. For yields
+    of box values, the pass allocates a local fir.ref<fir.box<T>> and stores the
+    yielded fir.box<T> into it so the region yields a reference to a box.
+
+    For acc.private, acc.firstprivate, and acc.reduction operations that use a
+    bufferized recipe, the pass allocates a host-side fir.ref<fir.box<T>> before
+    the data op and rewires the data op to use the new memory. Other users of
+    the original data operation result (outside the paired compute op) are
+    updated to load through the reference.
+  }];
+}
+
+#endif // FORTRAN_OPTIMIZER_OPENACC_PASSES
diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index f204eef..1de5e6b 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -111,6 +111,7 @@ ENUM_CLASS(KindCode, none, defaultIntegerKind,
     atomicIntKind, // atomic_int_kind from iso_fortran_env
     atomicIntOrLogicalKind, // atomic_int_kind or atomic_logical_kind
     sameAtom, // same type and kind as atom
+    extensibleOrUnlimitedType, // extensible or unlimited polymorphic type
 )
 
 struct TypePattern {
@@ -160,7 +161,8 @@ static constexpr TypePattern AnyChar{CharType, KindCode::any};
 static constexpr TypePattern AnyLogical{LogicalType, KindCode::any};
 static constexpr TypePattern AnyRelatable{RelatableType, KindCode::any};
 static constexpr TypePattern AnyIntrinsic{IntrinsicType, KindCode::any};
-static constexpr TypePattern ExtensibleDerived{DerivedType, KindCode::any};
+static constexpr TypePattern ExtensibleDerived{
+    DerivedType, KindCode::extensibleOrUnlimitedType};
 static constexpr TypePattern AnyData{AnyType, KindCode::any};
 
 // Type is irrelevant, but not BOZ (for PRESENT(), OPTIONAL(), &c.)
@@ -2103,9 +2105,13 @@ std::optional<SpecificCall> IntrinsicInterface::Match(
       }
       return std::nullopt;
     } else if (!d.typePattern.categorySet.test(type->category())) {
+      const char *expected{
+          d.typePattern.kindCode == KindCode::extensibleOrUnlimitedType
+              ? ", expected extensible or unlimited polymorphic type"
+              : ""};
       messages.Say(arg->sourceLocation(),
-          "Actual argument for '%s=' has bad type '%s'"_err_en_US, d.keyword,
-          type->AsFortran());
+          "Actual argument for '%s=' has bad type '%s'%s"_err_en_US, d.keyword,
+          type->AsFortran(), expected);
       return std::nullopt; // argument has invalid type category
     }
     bool argOk{false};
@@ -2244,6 +2250,17 @@ std::optional<SpecificCall> IntrinsicInterface::Match(
         return std::nullopt;
       }
       break;
+    case KindCode::extensibleOrUnlimitedType:
+      argOk = type->IsUnlimitedPolymorphic() ||
+          (type->category() == TypeCategory::Derived &&
+              IsExtensibleType(GetDerivedTypeSpec(type)));
+      if (!argOk) {
+        messages.Say(arg->sourceLocation(),
+            "Actual argument for '%s=' has type '%s', but was expected to be an extensible or unlimited polymorphic type"_err_en_US,
+            d.keyword, type->AsFortran());
+        return std::nullopt;
+      }
+      break;
     default:
       CRASH_NO_CASE;
     }
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index b927fa3..bd06acc 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -1153,6 +1153,18 @@ bool HasCUDAImplicitTransfer(const Expr<SomeType> &expr) {
   return (hasConstant || (hostSymbols.size() > 0)) && deviceSymbols.size() > 0;
 }
 
+bool IsCUDADeviceSymbol(const Symbol &sym) {
+  if (const auto *details =
+          sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+    return details->cudaDataAttr() &&
+        *details->cudaDataAttr() != common::CUDADataAttr::Pinned;
+  } else if (const auto *details =
+                 sym.GetUltimate().detailsIf<semantics::AssocEntityDetails>()) {
+    return GetNbOfCUDADeviceSymbols(details->expr()) > 0;
+  }
+  return false;
+}
+
 // HasVectorSubscript()
 struct HasVectorSubscriptHelper
     : public AnyTraverse<HasVectorSubscriptHelper, bool,
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 460ed62..e7a6c4d 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -629,6 +629,10 @@ private:
     unsigned allocatorIdx = Fortran::lower::getAllocatorIdx(alloc.getSymbol());
     fir::ExtendedValue exv = isSource ? sourceExv : moldExv;
 
+    if (const Fortran::semantics::Symbol *sym{GetLastSymbol(sourceExpr)})
+      if (Fortran::semantics::IsCUDADevice(*sym))
+        TODO(loc, "CUDA Fortran: allocate with device source");
+
     // Generate a sequence of runtime calls.
     errorManager.genStatCheck(builder, loc);
     genAllocateObjectInit(box, allocatorIdx);
diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
index fc23e64..790b9fd 100644
--- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(Support)
+add_subdirectory(Transforms)
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
new file mode 100644
index 0000000..4840a99
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
@@ -0,0 +1,191 @@
+//===- ACCRecipeBufferization.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Bufferize OpenACC recipes that yield fir.box<T> to operate on
+// fir.ref<fir.box<T>> and update uses accordingly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/OpenACC/Passes.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+namespace fir::acc {
+#define GEN_PASS_DEF_ACCRECIPEBUFFERIZATION
+#include "flang/Optimizer/OpenACC/Passes.h.inc"
+} // namespace fir::acc
+
+namespace {
+
+class BufferizeInterface {
+public:
+  static std::optional<mlir::Type> mustBufferize(mlir::Type recipeType) {
+    if (auto boxTy = llvm::dyn_cast<fir::BaseBoxType>(recipeType))
+      return fir::ReferenceType::get(boxTy);
+    return std::nullopt;
+  }
+
+  static mlir::Operation *load(mlir::OpBuilder &builder, mlir::Location loc,
+                               mlir::Value value) {
+    return builder.create<fir::LoadOp>(loc, value);
+  }
+
+  static mlir::Value placeInMemory(mlir::OpBuilder &builder, mlir::Location loc,
+                                   mlir::Value value) {
+    auto alloca = builder.create<fir::AllocaOp>(loc, value.getType());
+    builder.create<fir::StoreOp>(loc, value, alloca);
+    return alloca;
+  }
+};
+
+static void bufferizeRegionArgsAndYields(mlir::Region &region,
+                                         mlir::Location loc, mlir::Type oldType,
+                                         mlir::Type newType) {
+  if (region.empty())
+    return;
+
+  mlir::OpBuilder builder(&region);
+  for (mlir::BlockArgument arg : region.getArguments()) {
+    if (arg.getType() == oldType) {
+      arg.setType(newType);
+      if (!arg.use_empty()) {
+        mlir::Operation *loadOp = BufferizeInterface::load(builder, loc, arg);
+        arg.replaceAllUsesExcept(loadOp->getResult(0), loadOp);
+      }
+    }
+  }
+  if (auto yield =
+          llvm::dyn_cast<mlir::acc::YieldOp>(region.back().getTerminator())) {
+    llvm::SmallVector<mlir::Value> newOperands;
+    newOperands.reserve(yield.getNumOperands());
+    bool changed = false;
+    for (mlir::Value oldYieldArg : yield.getOperands()) {
+      if (oldYieldArg.getType() == oldType) {
+        builder.setInsertionPoint(yield);
+        mlir::Value alloca =
+            BufferizeInterface::placeInMemory(builder, loc, oldYieldArg);
+        newOperands.push_back(alloca);
+        changed = true;
+      } else {
+        newOperands.push_back(oldYieldArg);
+      }
+    }
+    if (changed)
+      yield->setOperands(newOperands);
+  }
+}
+
+static void updateRecipeUse(mlir::ArrayAttr recipes, mlir::ValueRange operands,
+                            llvm::StringRef recipeSymName,
+                            mlir::Operation *computeOp) {
+  if (!recipes)
+    return;
+  for (auto [recipeSym, oldRes] : llvm::zip(recipes, operands)) {
+    if (llvm::cast<mlir::SymbolRefAttr>(recipeSym).getLeafReference() !=
+        recipeSymName)
+      continue;
+
+    mlir::Operation *dataOp = oldRes.getDefiningOp();
+    assert(dataOp && "dataOp must be paired with computeOp");
+    mlir::Location loc = dataOp->getLoc();
+    mlir::OpBuilder builder(dataOp);
+    llvm::TypeSwitch<mlir::Operation *, void>(dataOp)
+        .Case<mlir::acc::PrivateOp, mlir::acc::FirstprivateOp,
+              mlir::acc::ReductionOp>([&](auto privateOp) {
+          builder.setInsertionPointAfterValue(privateOp.getVar());
+          mlir::Value alloca = BufferizeInterface::placeInMemory(
+              builder, loc, privateOp.getVar());
+          privateOp.getVarMutable().assign(alloca);
+          privateOp.getAccVar().setType(alloca.getType());
+        });
+
+    llvm::SmallVector<mlir::Operation *> users(oldRes.getUsers().begin(),
+                                               oldRes.getUsers().end());
+    for (mlir::Operation *useOp : users) {
+      if (useOp == computeOp)
+        continue;
+      builder.setInsertionPoint(useOp);
+      mlir::Operation *load = BufferizeInterface::load(builder, loc, oldRes);
+      useOp->replaceUsesOfWith(oldRes, load->getResult(0));
+    }
+  }
+}
+
+class ACCRecipeBufferization
+    : public fir::acc::impl::ACCRecipeBufferizationBase<
+          ACCRecipeBufferization> {
+public:
+  void runOnOperation() override {
+    mlir::ModuleOp module = getOperation();
+
+    llvm::SmallVector<llvm::StringRef> recipeNames;
+    module.walk([&](mlir::Operation *recipe) {
+      llvm::TypeSwitch<mlir::Operation *, void>(recipe)
+          .Case<mlir::acc::PrivateRecipeOp, mlir::acc::FirstprivateRecipeOp,
+                mlir::acc::ReductionRecipeOp>([&](auto recipe) {
+            mlir::Type oldType = recipe.getType();
+            auto bufferizedType =
+                BufferizeInterface::mustBufferize(recipe.getType());
+            if (!bufferizedType)
+              return;
+            recipe.setTypeAttr(mlir::TypeAttr::get(*bufferizedType));
+            mlir::Location loc = recipe.getLoc();
+            using RecipeOp = decltype(recipe);
+            bufferizeRegionArgsAndYields(recipe.getInitRegion(), loc, oldType,
+                                         *bufferizedType);
+            if constexpr (std::is_same_v<RecipeOp,
+                                         mlir::acc::FirstprivateRecipeOp>)
+              bufferizeRegionArgsAndYields(recipe.getCopyRegion(), loc, oldType,
+                                           *bufferizedType);
+            if constexpr (std::is_same_v<RecipeOp,
+                                         mlir::acc::ReductionRecipeOp>)
+              bufferizeRegionArgsAndYields(recipe.getCombinerRegion(), loc,
+                                           oldType, *bufferizedType);
+            bufferizeRegionArgsAndYields(recipe.getDestroyRegion(), loc,
+                                         oldType, *bufferizedType);
+            recipeNames.push_back(recipe.getSymName());
+          });
+    });
+    if (recipeNames.empty())
+      return;
+
+    module.walk([&](mlir::Operation *op) {
+      llvm::TypeSwitch<mlir::Operation *, void>(op)
+          .Case<mlir::acc::LoopOp, mlir::acc::ParallelOp, mlir::acc::SerialOp>(
+              [&](auto computeOp) {
+                for (llvm::StringRef recipeName : recipeNames) {
+                  if (computeOp.getPrivatizationRecipes())
+                    updateRecipeUse(computeOp.getPrivatizationRecipesAttr(),
+                                    computeOp.getPrivateOperands(), recipeName,
+                                    op);
+                  if (computeOp.getFirstprivatizationRecipes())
+                    updateRecipeUse(
+                        computeOp.getFirstprivatizationRecipesAttr(),
+                        computeOp.getFirstprivateOperands(), recipeName, op);
+                  if (computeOp.getReductionRecipes())
+                    updateRecipeUse(computeOp.getReductionRecipesAttr(),
+                                    computeOp.getReductionOperands(),
+                                    recipeName, op);
+                }
+              });
+    });
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> fir::acc::createACCRecipeBufferizationPass() {
+  return std::make_unique<ACCRecipeBufferization>();
+}
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
new file mode 100644
index 0000000..2427da0
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_flang_library(FIROpenACCTransforms
+  ACCRecipeBufferization.cpp
+
+  DEPENDS
+  FIROpenACCPassesIncGen
+
+  LINK_LIBS
+  MLIRIR
+  MLIRPass
+  FIRDialect
+  MLIROpenACCDialect
+)
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index ea5e2c0..31e246c 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -3622,6 +3622,7 @@ void CheckHelper::CheckDioDtvArg(const Symbol &proc, const Symbol &subp,
                 ioKind == common::DefinedIo::ReadUnformatted
             ? Attr::INTENT_INOUT
             : Attr::INTENT_IN);
+    CheckDioDummyIsScalar(subp, *arg);
   }
 }
 
@@ -3687,6 +3688,7 @@ void CheckHelper::CheckDioAssumedLenCharacterArg(const Symbol &subp,
           "Dummy argument '%s' of a defined input/output procedure must be assumed-length CHARACTER of default kind"_err_en_US,
           arg->name());
     }
+    CheckDioDummyIsScalar(subp, *arg);
   }
 }
 
diff --git a/flang/test/Fir/OpenACC/recipe-bufferization.mlir b/flang/test/Fir/OpenACC/recipe-bufferization.mlir
new file mode 100644
index 0000000..c4f96f6
--- /dev/null
+++ b/flang/test/Fir/OpenACC/recipe-bufferization.mlir
@@ -0,0 +1,316 @@
+// RUN: fir-opt %s --fir-acc-recipe-bufferization -split-input-file | FileCheck %s
+
+// -----
+
+acc.private.recipe @priv_ref_box : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %1 = fir.allocmem i32
+  %2 = fir.embox %1 : (!fir.heap<i32>) -> !fir.box<i32>
+  acc.yield %2 : !fir.box<i32>
+} destroy {
+^bb0(%arg0: !fir.box<i32>, %arg1: !fir.box<i32>):
+  %0 = fir.box_addr %arg1 : (!fir.box<i32>) -> !fir.ref<i32>
+  %1 = fir.convert %0 : (!fir.ref<i32>) -> !fir.heap<i32>
+  fir.freemem %1 : !fir.heap<i32>
+  acc.yield
+}
+
+// CHECK-LABEL: acc.private.recipe @priv_ref_box : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[ARG:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX:.*]] = fir.embox
+// CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX]] to %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK: } destroy {
+// CHECK: ^bb0(%[[DARG0:.*]]: !fir.ref<!fir.box<i32>>, %[[DARG1:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LD1:.*]] = fir.load %[[DARG1]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[ADDR:.*]] = fir.box_addr %[[LD1]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[CVT:.*]] = fir.convert %[[ADDR]] : (!fir.ref<i32>) -> !fir.heap<i32>
+
+// -----
+
+// Test private recipe without destroy region.
+
+acc.private.recipe @priv_ref_box_no_destroy : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %1 = fir.alloca i32
+  %2 = fir.embox %1 : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %2 : !fir.box<i32>
+}
+
+// CHECK-LABEL: acc.private.recipe @priv_ref_box_no_destroy : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[ARG:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX:.*]] = fir.embox
+// CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX]] to %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK: }
+
+// -----
+
+// Firstprivate recipe with destroy region.
+acc.firstprivate.recipe @fp_ref_box : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.allocmem i32
+  %1 = fir.embox %0 : (!fir.heap<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} copy {
+^bb0(%src: !fir.box<i32>, %dst: !fir.box<i32>):
+  %s_addr = fir.box_addr %src : (!fir.box<i32>) -> !fir.ref<i32>
+  %val = fir.load %s_addr : !fir.ref<i32>
+  %d_addr = fir.box_addr %dst : (!fir.box<i32>) -> !fir.ref<i32>
+  fir.store %val to %d_addr : !fir.ref<i32>
+  acc.yield
+} destroy {
+^bb0(%arg0: !fir.box<i32>, %arg1: !fir.box<i32>):
+  acc.yield
+}
+
+// CHECK-LABEL: acc.firstprivate.recipe @fp_ref_box : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[IARG:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX_FP:.*]] = fir.embox
+// CHECK:   %[[ALLOCA_FP:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX_FP]] to %[[ALLOCA_FP]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA_FP]] : !fir.ref<!fir.box<i32>>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.box<i32>>, %[[DST:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LSRC:.*]] = fir.load %[[SRC]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LDST:.*]] = fir.load %[[DST]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[SADDR:.*]] = fir.box_addr %[[LSRC]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[VAL:.*]] = fir.load %[[SADDR]] : !fir.ref<i32>
+// CHECK:   %[[DADDR:.*]] = fir.box_addr %[[LDST]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   fir.store %[[VAL]] to %[[DADDR]] : !fir.ref<i32>
+// CHECK: } destroy {
+// CHECK: ^bb0(%[[FDARG0:.*]]: !fir.ref<!fir.box<i32>>, %[[FDARG1:.*]]: !fir.ref<!fir.box<i32>>)
+
+// -----
+
+// Firstprivate recipe without destroy region.
+acc.firstprivate.recipe @fp_ref_box_no_destroy : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.alloca i32
+  %1 = fir.embox %0 : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} copy {
+^bb0(%src: !fir.box<i32>, %dst: !fir.box<i32>):
+  %s_addr = fir.box_addr %src : (!fir.box<i32>) -> !fir.ref<i32>
+  %val = fir.load %s_addr : !fir.ref<i32>
+  %d_addr = fir.box_addr %dst : (!fir.box<i32>) -> !fir.ref<i32>
+  fir.store %val to %d_addr : !fir.ref<i32>
+  acc.yield
+}
+
+// CHECK-LABEL: acc.firstprivate.recipe @fp_ref_box_no_destroy : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[IARG2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX_FP2:.*]] = fir.embox
+// CHECK:   %[[ALLOCA_FP2:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX_FP2]] to %[[ALLOCA_FP2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA_FP2]] : !fir.ref<!fir.box<i32>>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC2:.*]]: !fir.ref<!fir.box<i32>>, %[[DST2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LSRC2:.*]] = fir.load %[[SRC2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LDST2:.*]] = fir.load %[[DST2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[SADDR2:.*]] = fir.box_addr %[[LSRC2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[VAL2:.*]] = fir.load %[[SADDR2]] : !fir.ref<i32>
+// CHECK:   %[[DADDR2:.*]] = fir.box_addr %[[LDST2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   fir.store %[[VAL2]] to %[[DADDR2]] : !fir.ref<i32>
+
+// -----
+
+// Reduction recipe with destroy region.
+acc.reduction.recipe @red_ref_box : !fir.box<i32> reduction_operator <add> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.allocmem i32
+  %1 = fir.embox %0 : (!fir.heap<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} combiner {
+^bb0(%lhs: !fir.box<i32>, %rhs: !fir.box<i32>):
+  %l_addr = fir.box_addr %lhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %l_val = fir.load %l_addr : !fir.ref<i32>
+  %r_addr = fir.box_addr %rhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %r_val = fir.load %r_addr : !fir.ref<i32>
+  %sum = arith.addi %l_val, %r_val : i32
+  %tmp = fir.alloca i32
+  fir.store %sum to %tmp : !fir.ref<i32>
+  %new = fir.embox %tmp : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %new : !fir.box<i32>
+} destroy {
+^bb0(%arg0: !fir.box<i32>, %arg1: !fir.box<i32>):
+  acc.yield
+}
+
+// CHECK-LABEL: acc.reduction.recipe @red_ref_box : !fir.ref<!fir.box<i32>> reduction_operator <add> init
+// CHECK: ^bb0(%[[IARGR:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOXR:.*]] = fir.embox
+// CHECK:   %[[ALLOCAR:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOXR]] to %[[ALLOCAR]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCAR]] : !fir.ref<!fir.box<i32>>
+// CHECK: } combiner {
+// CHECK: ^bb0(%[[LHS:.*]]: !fir.ref<!fir.box<i32>>, %[[RHS:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LLHS:.*]] = fir.load %[[LHS]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LRHS:.*]] = fir.load %[[RHS]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LADDR:.*]] = fir.box_addr %[[LLHS]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[LVAL:.*]] = fir.load %[[LADDR]] : !fir.ref<i32>
+// CHECK:   %[[RADDR:.*]] = fir.box_addr %[[LRHS]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[RVAL:.*]] = fir.load %[[RADDR]] : !fir.ref<i32>
+// CHECK:   %[[SUM:.*]] = arith.addi %[[LVAL]], %[[RVAL]] : i32
+// CHECK:   %[[I32ALLOCA:.*]] = fir.alloca i32
+// CHECK:   fir.store %[[SUM]] to %[[I32ALLOCA]] : !fir.ref<i32>
+// CHECK:   %[[NEWBOX:.*]] = fir.embox %[[I32ALLOCA]] : (!fir.ref<i32>) -> !fir.box<i32>
+// CHECK:   %[[BOXALLOCA:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[NEWBOX]] to %[[BOXALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[BOXALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK: } destroy {
+// CHECK: ^bb0(%[[RD0:.*]]: !fir.ref<!fir.box<i32>>, %[[RD1:.*]]: !fir.ref<!fir.box<i32>>)
+
+// -----
+
+// Reduction recipe without destroy region.
+acc.reduction.recipe @red_ref_box_no_destroy : !fir.box<i32> reduction_operator <add> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.alloca i32
+  %1 = fir.embox %0 : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} combiner {
+^bb0(%lhs: !fir.box<i32>, %rhs: !fir.box<i32>):
+  %l_addr = fir.box_addr %lhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %l_val = fir.load %l_addr : !fir.ref<i32>
+  %r_addr = fir.box_addr %rhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %r_val = fir.load %r_addr : !fir.ref<i32>
+  %sum = arith.addi %l_val, %r_val : i32
+  %tmp = fir.alloca i32
+  fir.store %sum to %tmp : !fir.ref<i32>
+  %new = fir.embox %tmp : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %new : !fir.box<i32>
+}
+
+// CHECK-LABEL: acc.reduction.recipe @red_ref_box_no_destroy : !fir.ref<!fir.box<i32>> reduction_operator <add> init
+// CHECK: ^bb0(%[[IARGR2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOXR2:.*]] = fir.embox
+// CHECK:   %[[ALLOCAR2:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOXR2]] to %[[ALLOCAR2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCAR2]] : !fir.ref<!fir.box<i32>>
+// CHECK: } combiner {
+// CHECK: ^bb0(%[[LHS2:.*]]: !fir.ref<!fir.box<i32>>, %[[RHS2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LLHS2:.*]] = fir.load %[[LHS2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LRHS2:.*]] = fir.load %[[RHS2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LADDR2:.*]] = fir.box_addr %[[LLHS2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[LVAL2:.*]] = fir.load %[[LADDR2]] : !fir.ref<i32>
+// CHECK:   %[[RADDR2:.*]] = fir.box_addr %[[LRHS2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[RVAL2:.*]] = fir.load %[[RADDR2]] : !fir.ref<i32>
+// CHECK:   %[[SUM2:.*]] = arith.addi %[[LVAL2]], %[[RVAL2]] : i32
+// CHECK:   %[[I32ALLOCA2:.*]] = fir.alloca i32
+// CHECK:   fir.store %[[SUM2]] to %[[I32ALLOCA2]] : !fir.ref<i32>
+// CHECK:   %[[NEWBOX2:.*]] = fir.embox %[[I32ALLOCA2]] : (!fir.ref<i32>) -> !fir.box<i32>
+// CHECK:   %[[BOXALLOCA2:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[NEWBOX2]] to %[[BOXALLOCA2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[BOXALLOCA2]] : !fir.ref<!fir.box<i32>>
+
+// -----
+
+// Comprehensive tests that also test recipe usages updates.
+
+acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
+^bb0(%arg0: !fir.ref<i32>):
+  %0 = fir.alloca i32
+  %1 = fir.declare %0 {uniq_name = "acc.private.init"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  acc.yield %1 : !fir.ref<i32>
+}
+acc.private.recipe @privatization_box_Uxf32 : !fir.box<!fir.array<?xf32>> init {
+^bb0(%arg0: !fir.box<!fir.array<?xf32>>):
+  %c0 = arith.constant 0 : index
+  %0:3 = fir.box_dims %arg0, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+  %1 = fir.shape %0#1 : (index) -> !fir.shape<1>
+  %2 = fir.allocmem !fir.array<?xf32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
+  %3 = fir.declare %2(%1) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xf32>>
+  %4 = fir.embox %3(%1) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+  acc.yield %4 : !fir.box<!fir.array<?xf32>>
+} destroy {
+^bb0(%arg0: !fir.box<!fir.array<?xf32>>, %arg1: !fir.box<!fir.array<?xf32>>):
+  %0 = fir.box_addr %arg1 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+  %1 = fir.convert %0 : (!fir.ref<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>>
+  fir.freemem %1 : !fir.heap<!fir.array<?xf32>>
+  acc.terminator
+}
+func.func @_QPfoo(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
+  %c200_i32 = arith.constant 200 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFfooEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+  acc.parallel combined(loop) {
+    %4 = acc.private var(%3 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {name = "x"}
+    %5 = acc.private varPtr(%2 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+    acc.loop combined(parallel) private(@privatization_box_Uxf32 -> %4 : !fir.box<!fir.array<?xf32>>, @privatization_ref_i32 -> %5 : !fir.ref<i32>) control(%arg1 : i32) = (%c1_i32 : i32) to (%c200_i32 : i32)  step (%c1_i32 : i32) {
+      %6 = fir.dummy_scope : !fir.dscope
+      %7 = fir.declare %4 dummy_scope %6 {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+      %8 = fir.declare %5 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+      %9 = fir.convert %arg1 : (i32) -> f32
+      %10 = fir.convert %arg1 : (i32) -> i64
+      %11 = fir.array_coor %7 %10 : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+      fir.store %9 to %11 : !fir.ref<f32>
+      acc.yield
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  return
+}
+
+// CHECK-LABEL:   acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
+// CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+// CHECK:           %[[VAL_1:.*]] = fir.alloca i32
+// CHECK:           %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {uniq_name = "acc.private.init"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           acc.yield %[[VAL_2]] : !fir.ref<i32>
+// CHECK:         }
+
+// CHECK-LABEL:   acc.private.recipe @privatization_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> init {
+// CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>):
+// CHECK:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_2]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_5:.*]] = fir.allocmem !fir.array<?xf32>, %[[VAL_3]]#1 {bindc_name = ".tmp", uniq_name = ""}
+// CHECK:           %[[VAL_6:.*]] = fir.declare %[[VAL_5]](%[[VAL_4]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xf32>>
+// CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]](%[[VAL_4]]) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+// CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
+// CHECK:           fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           acc.yield %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+
+// CHECK-LABEL:   } destroy {
+// CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>):
+// CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+// CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>>
+// CHECK:           fir.freemem %[[VAL_4]] : !fir.heap<!fir.array<?xf32>>
+// CHECK:           acc.terminator
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @_QPfoo(
+// CHECK-SAME:                      %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 200 : i32
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFfooEi"}
+// CHECK:           %[[VAL_4:.*]] = fir.declare %[[VAL_3]] {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_5:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+// CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
+// CHECK:           fir.store %[[VAL_5]] to %[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           acc.parallel combined(loop) {
+// CHECK:             %[[VAL_7:.*]] = acc.private varPtr(%[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xf32>>>) -> !fir.ref<!fir.box<!fir.array<?xf32>>> {name = "x"}
+// CHECK:             %[[VAL_8:.*]] = acc.private varPtr(%[[VAL_4]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+// CHECK:             acc.loop combined(parallel) private(@privatization_box_Uxf32 -> %[[VAL_7]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, @privatization_ref_i32 -> %[[VAL_8]] : !fir.ref<i32>) control(%[[VAL_9:.*]] : i32) = (%[[VAL_1]] : i32) to (%[[VAL_0]] : i32)  step (%[[VAL_1]] : i32) {
+// CHECK:               %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_7]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:               %[[VAL_12:.*]] = fir.declare %[[VAL_11]] dummy_scope %[[VAL_10]] {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+// CHECK:               %[[VAL_13:.*]] = fir.declare %[[VAL_8]] {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:               %[[VAL_14:.*]] = fir.convert %[[VAL_9]] : (i32) -> f32
+// CHECK:               %[[VAL_15:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
+// CHECK:               %[[VAL_16:.*]] = fir.array_coor %[[VAL_12]] %[[VAL_15]] : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+// CHECK:               fir.store %[[VAL_14]] to %[[VAL_16]] : !fir.ref<f32>
+// CHECK:               acc.yield
+// CHECK:             } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+// CHECK:             acc.yield
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
diff --git a/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf b/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf
new file mode 100644
index 0000000..3e59e2f
--- /dev/null
+++ b/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf
@@ -0,0 +1,9 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fcuda -o - %s 2>&1 | FileCheck %s
+
+program main
+  implicit none
+  integer, device, allocatable :: a_d(:)
+  integer, allocatable :: a(:)
+! CHECK: not yet implemented: CUDA Fortran: allocate with device source
+  allocate(a, source=a_d)
+end program
diff --git a/flang/test/Lower/CUDA/cuda-associate-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-associate-data-transfer.cuf
new file mode 100644
index 0000000..af850d5
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-associate-data-transfer.cuf
@@ -0,0 +1,21 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test detection of CUDA Fortran data transfer in presence of associuate
+! statement.
+
+module m
+  real(8), device, dimension(10,10,10) :: d
+end module m
+
+subroutine foo
+  use m
+  !@CUF associate(d1 => d)
+  d1 = 0.0
+  !@CUF end associate
+end subroutine
+
+! CHECK-LABEL: func.func @_QPfoo()
+! CHECK: %[[D:.*]] = fir.address_of(@_QMmEd) : !fir.ref<!fir.array<10x10x10xf64>>
+! CHECK: %[[D_DECL:.*]]:2 = hlfir.declare %[[D]](%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QMmEd"} : (!fir.ref<!fir.array<10x10x10xf64>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x10x10xf64>>, !fir.ref<!fir.array<10x10x10xf64>>)
+! CHECK: %[[D1_DECL:.*]]:2 = hlfir.declare %[[D_DECL]]#0(%4) {uniq_name = "_QFfooEd1"} : (!fir.ref<!fir.array<10x10x10xf64>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x10x10xf64>>, !fir.ref<!fir.array<10x10x10xf64>>)
+! CHECK: cuf.data_transfer %{{.*}} to %[[D1_DECL]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : f64, !fir.ref<!fir.array<10x10x10xf64>>
diff --git a/flang/test/Semantics/dynamic-type-intrinsics.f90 b/flang/test/Semantics/dynamic-type-intrinsics.f90
new file mode 100644
index 0000000..a4ce3db
--- /dev/null
+++ b/flang/test/Semantics/dynamic-type-intrinsics.f90
@@ -0,0 +1,73 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+
+module m
+    type :: t1
+      real :: x
+    end type
+    type :: t2(k)
+      integer, kind :: k
+      real(kind=k) :: x
+    end type
+    type :: t3
+      real :: x
+    end type
+    type, extends(t1) :: t4
+      integer :: y
+    end type
+    type :: t5
+      sequence
+      integer :: x
+      integer :: y
+    end type
+
+    integer :: i
+    real :: r
+    type(t1) :: x1, y1
+    type(t2(4)) :: x24, y24
+    type(t2(8)) :: x28
+    type(t3) :: x3
+    type(t4) :: x4
+    type(t5) :: x5
+    class(t1), allocatable :: a1
+    class(t3), allocatable :: a3
+
+    integer(kind=merge(kind(1),-1,same_type_as(x1, x1))) same_type_as_x1_x1_true
+    integer(kind=merge(kind(1),-1,same_type_as(x1, y1))) same_type_as_x1_y1_true
+    integer(kind=merge(kind(1),-1,same_type_as(x24, x24))) same_type_as_x24_x24_true
+    integer(kind=merge(kind(1),-1,same_type_as(x24, y24))) same_type_as_x24_y24_true
+    integer(kind=merge(kind(1),-1,same_type_as(x24, x28))) same_type_as_x24_x28_true
+    !ERROR: INTEGER(KIND=-1) is not a supported type
+    integer(kind=merge(kind(1),-1,same_type_as(x1, x3))) same_type_as_x1_x3_false
+    !ERROR: INTEGER(KIND=-1) is not a supported type
+    integer(kind=merge(kind(1),-1,same_type_as(a1, a3))) same_type_as_a1_a3_false
+    !ERROR: Actual argument for 'a=' has type 't5', but was expected to be an extensible or unlimited polymorphic type
+    logical :: t1_8 = same_type_as(x5, x5)
+    !ERROR: Actual argument for 'a=' has type 't5', but was expected to be an extensible or unlimited polymorphic type
+    logical :: t1_9 = same_type_as(x5, x1)
+    !ERROR: Actual argument for 'b=' has type 't5', but was expected to be an extensible or unlimited polymorphic type
+    logical :: t1_10 = same_type_as(x1, x5)
+    !ERROR: Actual argument for 'a=' has bad type 'INTEGER(4)', expected extensible or unlimited polymorphic type
+    logical :: t1_11 = same_type_as(i, i)
+    !ERROR: Actual argument for 'a=' has bad type 'REAL(4)', expected extensible or unlimited polymorphic type
+    logical :: t1_12 = same_type_as(r, r)
+    !ERROR: Actual argument for 'a=' has bad type 'INTEGER(4)', expected extensible or unlimited polymorphic type
+    logical :: t1_13 = same_type_as(i, t)
+
+    integer(kind=merge(kind(1),-1,extends_type_of(x1, y1))) extends_type_of_x1_y1_true
+    integer(kind=merge(kind(1),-1,extends_type_of(x24, x24))) extends_type_of_x24_x24_true
+    integer(kind=merge(kind(1),-1,extends_type_of(x24, y24))) extends_type_of_x24_y24_true
+    integer(kind=merge(kind(1),-1,extends_type_of(x24, x28))) extends_type_of_x24_x28_true
+    !ERROR: INTEGER(KIND=-1) is not a supported type
+    integer(kind=merge(kind(1),-1,extends_type_of(x1, x3))) extends_type_of_x1_x3_false
+    !ERROR: INTEGER(KIND=-1) is not a supported type
+    integer(kind=merge(kind(1),-1,extends_type_of(a1, a3))) extends_type_of_a1_a3_false
+    !ERROR: INTEGER(KIND=-1) is not a supported type
+    integer(kind=merge(kind(1),-1,extends_type_of(x1, x4))) extends_type_of_x1_x4_false
+    integer(kind=merge(kind(1),-1,extends_type_of(x4, x1))) extends_type_of_x4_x1_true
+    !ERROR: Actual argument for 'a=' has type 't5', but was expected to be an extensible or unlimited polymorphic type
+    logical :: t2_9 = extends_type_of(x5, x5)
+    !ERROR: Actual argument for 'a=' has type 't5', but was expected to be an extensible or unlimited polymorphic type
+    logical :: t2_10 = extends_type_of(x5, x1)
+    !ERROR: Actual argument for 'mold=' has type 't5', but was expected to be an extensible or unlimited polymorphic type
+    logical :: t2_11 = extends_type_of(x1, x5)
+end module
diff --git a/flang/test/Semantics/io11.f90 b/flang/test/Semantics/io11.f90
index c00deed..6bb7a71 100644
--- a/flang/test/Semantics/io11.f90
+++ b/flang/test/Semantics/io11.f90
@@ -809,3 +809,24 @@ module m29
     end
   end interface
 end
+
+module m30
+    type base
+        character(5), allocatable :: data
+    end type
+    interface write(formatted)
+        subroutine formattedRead (dtv, unit, iotype, v_list, iostat, iomsg)
+        import base
+            !ERROR: Dummy argument 'dtv' of a defined input/output procedure must be a scalar
+            class (base), intent(in) :: dtv(10)
+            integer, intent(in) :: unit
+            !ERROR: Dummy argument 'iotype' of a defined input/output procedure must be a scalar
+            character(*), intent(in) :: iotype(2)
+            integer, intent(in) :: v_list(:)
+            !ERROR: Dummy argument 'iostat' of a defined input/output procedure must be a scalar
+            integer, intent(out) :: iostat(*)
+            !ERROR: Dummy argument 'iomsg' of a defined input/output procedure must be a scalar
+            character(*), intent(inout) :: iomsg(:)
+        end subroutine
+    end interface
+end module
diff --git a/flang/tools/fir-opt/CMakeLists.txt b/flang/tools/fir-opt/CMakeLists.txt
index 4ee9752..c5bd439 100644
--- a/flang/tools/fir-opt/CMakeLists.txt
+++ b/flang/tools/fir-opt/CMakeLists.txt
@@ -22,6 +22,7 @@ target_link_libraries(fir-opt PRIVATE
   HLFIRDialect
   HLFIRTransforms
   FIROpenACCSupport
+  FIROpenACCTransforms
   FIROpenMPSupport
   FlangOpenMPTransforms
   FIRAnalysis
diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp
index d66fc3f..b0b277b 100644
--- a/flang/tools/fir-opt/fir-opt.cpp
+++ b/flang/tools/fir-opt/fir-opt.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Tools/mlir-opt/MlirOptMain.h"
 #include "flang/Optimizer/CodeGen/CodeGen.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenACC/Passes.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/Support/InitFIR.h"
 #include "flang/Optimizer/Transforms/Passes.h"
@@ -37,6 +38,7 @@ int main(int argc, char **argv) {
   fir::registerOptTransformPasses();
   hlfir::registerHLFIRPasses();
   flangomp::registerFlangOpenMPPasses();
+  fir::acc::registerFIROpenACCPasses();
 #ifdef FLANG_INCLUDE_TESTS
   fir::test::registerTestFIRAliasAnalysisPass();
   fir::test::registerTestFIROpenACCInterfacesPass();
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index afa90e6..81360aa 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -632,8 +632,9 @@ add_header_macro(
   sys/time.h
   DEPENDS
     .llvm_libc_common_h
-    .llvm-libc-types.struct_timeval
     .llvm-libc-macros.sys_time_macros
+    .llvm-libc-types.struct_itimerval
+    .llvm-libc-types.struct_timeval
 )
 
 add_header_macro(
diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml
index 2a0c563..394437b 100644
--- a/libc/include/stdio.yaml
+++ b/libc/include/stdio.yaml
@@ -197,12 +197,26 @@ functions:
       - type: FILE *
       - type: long
       - type: int
+  - name: fseeko
+    standards:
+      - POSIX
+    return_type: int
+    arguments:
+      - type: FILE *
+      - type: off_t
+      - type: int
   - name: ftell
     standards:
       - stdc
     return_type: long
     arguments:
       - type: FILE *
+  - name: ftello
+    standards:
+      - POSIX
+    return_type: off_t
+    arguments:
+      - type: FILE *
   - name: funlockfile
     standards:
       - POSIX
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index ba0fc4b..088edc0 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -124,7 +124,7 @@ if( EXISTS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR} )
 endif()
 
 foreach( tool IN ITEMS clang opt llvm-as llvm-link )
-  if( NOT EXISTS "${${tool}_exe}" AND "${tool}_target" STREQUAL "" )
+  if( NOT EXISTS "${${tool}_exe}" AND "${${tool}_target}" STREQUAL "" )
     message( FATAL_ERROR "libclc toolchain incomplete - missing tool ${tool}!" )
   endif()
 endforeach()
diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst
index 1a450218..5e8fe2e 100644
--- a/libcxx/docs/ReleaseNotes/22.rst
+++ b/libcxx/docs/ReleaseNotes/22.rst
@@ -68,6 +68,9 @@ Improvements and New Features
   reduced debug information.
 
 - The performance of ``std::find`` has been improved by up to 2x for integral types
+- The ``std::distance`` and ``std::ranges::distance`` algorithms have been optimized for segmented iterators (e.g.,
+  ``std::join_view`` iterators), reducing the complexity from ``O(n)`` to ``O(n / segment_size)``. Benchmarks show
+  performance improvements of over 1600x in favorable cases with large segment sizes (e.g., 1024).
 
 Deprecations and Removals
 -------------------------
diff --git a/libcxx/include/__cxx03/__bit_reference b/libcxx/include/__cxx03/__bit_reference
index 76027e2..ac0005f 100644
--- a/libcxx/include/__cxx03/__bit_reference
+++ b/libcxx/include/__cxx03/__bit_reference
@@ -167,7 +167,7 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
       unsigned __clz       = __bits_per_word - __first.__ctz_;
       difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
       __n -= __dn;
-      __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+      __storage_type __m = (__storage_type(~0) << __first.__ctz_) & (__storage_type(~0) >> (__clz - __dn));
       __storage_type __b = *__first.__seg_ & __m;
       *__result.__seg_ &= ~__m;
       *__result.__seg_ |= __b;
@@ -185,7 +185,7 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
     // do last word
     if (__n > 0) {
       __first.__seg_ += __nw;
-      __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+      __storage_type __m = __storage_type(~0) >> (__bits_per_word - __n);
       __storage_type __b = *__first.__seg_ & __m;
       *__result.__seg_ &= ~__m;
       *__result.__seg_ |= __b;
@@ -210,11 +210,11 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
       unsigned __clz_f     = __bits_per_word - __first.__ctz_;
       difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
       __n -= __dn;
-      __storage_type __m   = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+      __storage_type __m   = (__storage_type(~0) << __first.__ctz_) & (__storage_type(~0) >> (__clz_f - __dn));
       __storage_type __b   = *__first.__seg_ & __m;
       unsigned __clz_r     = __bits_per_word - __result.__ctz_;
       __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
-      __m                  = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
+      __m                  = (__storage_type(~0) << __result.__ctz_) & (__storage_type(~0) >> (__clz_r - __ddn));
       *__result.__seg_ &= ~__m;
       if (__result.__ctz_ > __first.__ctz_)
         *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
@@ -224,7 +224,7 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
       __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
       __dn -= __ddn;
       if (__dn > 0) {
-        __m = ~__storage_type(0) >> (__bits_per_word - __dn);
+        __m = __storage_type(~0) >> (__bits_per_word - __dn);
         *__result.__seg_ &= ~__m;
         *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
         __result.__ctz_ = static_cast<unsigned>(__dn);
@@ -235,7 +235,7 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
     // __first.__ctz_ == 0;
     // do middle words
     unsigned __clz_r   = __bits_per_word - __result.__ctz_;
-    __storage_type __m = ~__storage_type(0) << __result.__ctz_;
+    __storage_type __m = __storage_type(~0) << __result.__ctz_;
     for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
       __storage_type __b = *__first.__seg_;
       *__result.__seg_ &= ~__m;
@@ -246,17 +246,17 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
     }
     // do last word
     if (__n > 0) {
-      __m                 = ~__storage_type(0) >> (__bits_per_word - __n);
+      __m                 = __storage_type(~0) >> (__bits_per_word - __n);
       __storage_type __b  = *__first.__seg_ & __m;
       __storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
-      __m                 = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
+      __m                 = (__storage_type(~0) << __result.__ctz_) & (__storage_type(~0) >> (__clz_r - __dn));
       *__result.__seg_ &= ~__m;
       *__result.__seg_ |= __b << __result.__ctz_;
       __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
       __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
       __n -= __dn;
       if (__n > 0) {
-        __m = ~__storage_type(0) >> (__bits_per_word - __n);
+        __m = __storage_type(~0) >> (__bits_per_word - __n);
         *__result.__seg_ &= ~__m;
         *__result.__seg_ |= __b >> __dn;
         __result.__ctz_ = static_cast<unsigned>(__n);
diff --git a/libcxx/include/__cxx03/__verbose_abort b/libcxx/include/__cxx03/__verbose_abort
index 4fcfffa..52d1297 100644
--- a/libcxx/include/__cxx03/__verbose_abort
+++ b/libcxx/include/__cxx03/__verbose_abort
@@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // This function should never be called directly from the code -- it should only be called through
 // the _LIBCPP_VERBOSE_ABORT macro.
 _LIBCPP_NORETURN _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS
-_LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...);
+_LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...) _NOEXCEPT;
 
 // _LIBCPP_VERBOSE_ABORT(format, args...)
 //
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index 74923ddb..6b65e73 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -808,7 +808,7 @@ public:
           }
           {
             __node_holder __h = __construct_node_hash(__hash, std::forward<_Args>(__args2)...);
-            if (size() + 1 > __bc * max_load_factor() || __bc == 0) {
+            if (size() + 1 > __bc * max_load_factor()) {
               __rehash_unique(std::max<size_type>(2 * __bc + !std::__is_hash_power2(__bc),
                                                   size_type(__math::ceil(float(size() + 1) / max_load_factor()))));
               __bc    = bucket_count();
diff --git a/libcxx/include/__iterator/distance.h b/libcxx/include/__iterator/distance.h
index 1732aa5..9be9db0 100644
--- a/libcxx/include/__iterator/distance.h
+++ b/libcxx/include/__iterator/distance.h
@@ -10,41 +10,71 @@
 #ifndef _LIBCPP___ITERATOR_DISTANCE_H
 #define _LIBCPP___ITERATOR_DISTANCE_H
 
+#include <__algorithm/for_each_segment.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
 #include <__ranges/access.h>
 #include <__ranges/concepts.h>
 #include <__ranges/size.h>
 #include <__type_traits/decay.h>
+#include <__type_traits/enable_if.h>
 #include <__type_traits/remove_cvref.h>
+#include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _InputIter>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 typename iterator_traits<_InputIter>::difference_type
-__distance(_InputIter __first, _InputIter __last, input_iterator_tag) {
-  typename iterator_traits<_InputIter>::difference_type __r(0);
+#if _LIBCPP_STD_VER >= 20
+template <class _Iter>
+using __iter_distance_t _LIBCPP_NODEBUG = std::iter_difference_t<_Iter>;
+#else
+template <class _Iter>
+using __iter_distance_t _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::difference_type;
+#endif
+
+template <class _InputIter, class _Sent>
+inline _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_InputIter> __distance(_InputIter __first, _Sent __last) {
+  __iter_distance_t<_InputIter> __r(0);
   for (; __first != __last; ++__first)
     ++__r;
   return __r;
 }
 
-template <class _RandIter>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 typename iterator_traits<_RandIter>::difference_type
-__distance(_RandIter __first, _RandIter __last, random_access_iterator_tag) {
+template <class _RandIter, __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_RandIter>
+__distance(_RandIter __first, _RandIter __last) {
   return __last - __first;
 }
 
+#if _LIBCPP_STD_VER >= 20
+template <class _SegmentedIter,
+          __enable_if_t<!__has_random_access_iterator_category<_SegmentedIter>::value &&
+                            __is_segmented_iterator_v<_SegmentedIter>,
+                        int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_SegmentedIter>
+__distance(_SegmentedIter __first, _SegmentedIter __last) {
+  __iter_distance_t<_SegmentedIter> __r(0);
+  std::__for_each_segment(__first, __last, [&__r](auto __lfirst, auto __llast) {
+    __r += std::__distance(__lfirst, __llast);
+  });
+  return __r;
+}
+#endif // _LIBCPP_STD_VER >= 20
+
 template <class _InputIter>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 typename iterator_traits<_InputIter>::difference_type
 distance(_InputIter __first, _InputIter __last) {
-  return std::__distance(__first, __last, typename iterator_traits<_InputIter>::iterator_category());
+  return std::__distance(__first, __last);
 }
 
 #if _LIBCPP_STD_VER >= 20
@@ -56,12 +86,7 @@ struct __distance {
   template <class _Ip, sentinel_for<_Ip> _Sp>
     requires(!sized_sentinel_for<_Sp, _Ip>)
   _LIBCPP_HIDE_FROM_ABI constexpr iter_difference_t<_Ip> operator()(_Ip __first, _Sp __last) const {
-    iter_difference_t<_Ip> __n = 0;
-    while (__first != __last) {
-      ++__first;
-      ++__n;
-    }
-    return __n;
+    return std::__distance(std::move(__first), std::move(__last));
   }
 
   template <class _Ip, sized_sentinel_for<decay_t<_Ip>> _Sp>
@@ -92,4 +117,6 @@ inline constexpr auto distance = __distance{};
 
 _LIBCPP_END_NAMESPACE_STD
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ITERATOR_DISTANCE_H
diff --git a/libcxx/include/__utility/cmp.h b/libcxx/include/__utility/cmp.h
index 14dc0c1..68864e2 100644
--- a/libcxx/include/__utility/cmp.h
+++ b/libcxx/include/__utility/cmp.h
@@ -26,10 +26,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 20
 
+template <typename _Tp, typename _Ip>
+concept __comparison_can_promote_to =
+    sizeof(_Tp) < sizeof(_Ip) || (sizeof(_Tp) == sizeof(_Ip) && __signed_integer<_Tp>);
+
 template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_equal(_Tp __t, _Up __u) noexcept {
   if constexpr (is_signed_v<_Tp> == is_signed_v<_Up>)
     return __t == __u;
+  else if constexpr (__comparison_can_promote_to<_Tp, int> && __comparison_can_promote_to<_Up, int>)
+    return static_cast<int>(__t) == static_cast<int>(__u);
+  else if constexpr (__comparison_can_promote_to<_Tp, long long> && __comparison_can_promote_to<_Up, long long>)
+    return static_cast<long long>(__t) == static_cast<long long>(__u);
   else if constexpr (is_signed_v<_Tp>)
     return __t < 0 ? false : make_unsigned_t<_Tp>(__t) == __u;
   else
@@ -45,6 +53,10 @@ template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_less(_Tp __t, _Up __u) noexcept {
   if constexpr (is_signed_v<_Tp> == is_signed_v<_Up>)
     return __t < __u;
+  else if constexpr (__comparison_can_promote_to<_Tp, int> && __comparison_can_promote_to<_Up, int>)
+    return static_cast<int>(__t) < static_cast<int>(__u);
+  else if constexpr (__comparison_can_promote_to<_Tp, long long> && __comparison_can_promote_to<_Up, long long>)
+    return static_cast<long long>(__t) < static_cast<long long>(__u);
   else if constexpr (is_signed_v<_Tp>)
     return __t < 0 ? true : make_unsigned_t<_Tp>(__t) < __u;
   else
diff --git a/libcxx/test/benchmarks/iterators/distance.bench.cpp b/libcxx/test/benchmarks/iterators/distance.bench.cpp
new file mode 100644
index 0000000..186ef79
--- /dev/null
+++ b/libcxx/test/benchmarks/iterators/distance.bench.cpp
@@ -0,0 +1,84 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <algorithm>
+#include <cstddef>
+#include <deque>
+#include <iterator>
+#include <ranges>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+int main(int argc, char** argv) {
+  auto std_distance = [](auto first, auto last) { return std::distance(first, last); };
+
+  // {std,ranges}::distance(std::deque)
+  {
+    auto bm = [](std::string name, auto distance) {
+      benchmark::RegisterBenchmark(
+          name,
+          [distance](auto& st) {
+            std::size_t const size = st.range(0);
+            std::deque<int> c(size, 1);
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = distance(c.begin(), c.end());
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192);
+    };
+    bm.operator()("std::distance(deque<int>)", std_distance);
+    bm.operator()("rng::distance(deque<int>)", std::ranges::distance);
+  }
+
+  // {std,ranges}::distance(std::join_view)
+  {
+    auto bm = []<class Container>(std::string name, auto distance, std::size_t seg_size) {
+      benchmark::RegisterBenchmark(
+          name,
+          [distance, seg_size](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+            auto last  = view.end();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = distance(first, last);
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192);
+    };
+    bm.operator()<std::vector<std::vector<int>>>("std::distance(join_view(vector<vector<int>>))", std_distance, 256);
+    bm.operator()<std::vector<std::vector<int>>>(
+        "rng::distance(join_view(vector<vector<int>>)", std::ranges::distance, 256);
+  }
+
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
diff --git a/libcxx/test/benchmarks/utility/cmp.bench.cpp b/libcxx/test/benchmarks/utility/cmp.bench.cpp
new file mode 100644
index 0000000..1ed179a
--- /dev/null
+++ b/libcxx/test/benchmarks/utility/cmp.bench.cpp
@@ -0,0 +1,139 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <utility>
+#include "../CartesianBenchmarks.h"
+#include "benchmark/benchmark.h"
+
+namespace {
+
+enum ValueType : size_t {
+  SChar,
+  UChar,
+  Short,
+  UShort,
+  Int,
+  UInt,
+  Long,
+  ULong,
+  LongLong,
+  ULongLong,
+#ifndef TEST_HAS_NO_INT128
+  Int128,
+  UInt128,
+#endif
+};
+
+struct AllValueTypes : EnumValuesAsTuple<AllValueTypes, ValueType, 6> {
+  static constexpr const char* Names[] = {
+      "schar",
+      "uchar",
+      "short",
+      "ushort",
+      "int",
+      "uint",
+      "long",
+      "ulong",
+      "longlong",
+      "ulonglong",
+#ifndef TEST_HAS_NO_INT128
+      "int128",
+      "uint128"
+#endif
+  };
+};
+
+using TestType =
+    std::tuple< signed char,
+                unsigned char,
+                short,
+                unsigned short,
+                int,
+                unsigned int,
+                long,
+                unsigned long,
+                long long,
+                unsigned long long
+#ifndef TEST_HAS_NO_INT128
+                ,
+                __int128_t,
+                __uint128_t
+#endif
+                >;
+
+template <typename TType, typename UType>
+struct CmpEqual {
+  static void run(benchmark::State& state) {
+    using T = std::tuple_element_t<TType::value, TestType>;
+    using U = std::tuple_element_t<UType::value, TestType>;
+
+    T x1 = T{127}, x2 = T{111};
+    U y1 = U{123}, y2 = U{1};
+    for (auto _ : state) {
+      benchmark::DoNotOptimize(x1);
+      benchmark::DoNotOptimize(x2);
+      benchmark::DoNotOptimize(y1);
+      benchmark::DoNotOptimize(y2);
+      benchmark::DoNotOptimize(std::cmp_equal(x1, y1));
+      benchmark::DoNotOptimize(std::cmp_equal(y1, x1));
+      benchmark::DoNotOptimize(std::cmp_equal(x1, x1));
+      benchmark::DoNotOptimize(std::cmp_equal(y1, y1));
+
+      benchmark::DoNotOptimize(std::cmp_equal(x2, y2));
+      benchmark::DoNotOptimize(std::cmp_equal(y2, x2));
+      benchmark::DoNotOptimize(std::cmp_equal(x2, x2));
+      benchmark::DoNotOptimize(std::cmp_equal(y2, y2));
+    }
+  }
+
+  static std::string name() { return "BM_CmpEqual" + TType::name() + UType::name(); }
+};
+
+template <typename TType, typename UType>
+struct CmpLess {
+  static void run(benchmark::State& state) {
+    using T = std::tuple_element_t<TType::value, TestType>;
+    using U = std::tuple_element_t<UType::value, TestType>;
+
+    T x1 = T{127}, x2 = T{111};
+    U y1 = U{123}, y2 = U{1};
+    for (auto _ : state) {
+      benchmark::DoNotOptimize(x1);
+      benchmark::DoNotOptimize(x2);
+      benchmark::DoNotOptimize(y1);
+      benchmark::DoNotOptimize(y2);
+      benchmark::DoNotOptimize(std::cmp_less(x1, y1));
+      benchmark::DoNotOptimize(std::cmp_less(y1, x1));
+      benchmark::DoNotOptimize(std::cmp_less(x1, x1));
+      benchmark::DoNotOptimize(std::cmp_less(y1, y1));
+
+      benchmark::DoNotOptimize(std::cmp_less(x2, y2));
+      benchmark::DoNotOptimize(std::cmp_less(y2, x2));
+      benchmark::DoNotOptimize(std::cmp_less(x2, x2));
+      benchmark::DoNotOptimize(std::cmp_less(y2, y2));
+    }
+  }
+
+  static std::string name() { return "BM_CmpLess" + TType::name() + UType::name(); }
+};
+
+} // namespace
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  if (benchmark::ReportUnrecognizedArguments(argc, argv))
+    return 1;
+
+  makeCartesianProductBenchmark<CmpEqual, AllValueTypes, AllValueTypes>();
+  makeCartesianProductBenchmark<CmpLess, AllValueTypes, AllValueTypes>();
+  benchmark::RunSpecifiedBenchmarks();
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx-03/assertions/customize_verbose_abort.link-time.pass.cpp b/libcxx/test/libcxx-03/assertions/customize_verbose_abort.link-time.pass.cpp
index 390c6b6..3c7a2d4 100644
--- a/libcxx/test/libcxx-03/assertions/customize_verbose_abort.link-time.pass.cpp
+++ b/libcxx/test/libcxx-03/assertions/customize_verbose_abort.link-time.pass.cpp
@@ -12,9 +12,7 @@
 // failures when back-deploying.
 // XFAIL: availability-verbose_abort-missing
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
-#include <__verbose_abort>
+#include <__cxx03/__verbose_abort>
 #include <cstdlib>
 
 void std::__libcpp_verbose_abort(char const*, ...) _NOEXCEPT { std::exit(EXIT_SUCCESS); }
diff --git a/libcxx/test/libcxx-03/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx-03/language.support/support.dynamic/libcpp_deallocate.sh.cpp
index 7ead65c..a9fe04f 100644
--- a/libcxx/test/libcxx-03/language.support/support.dynamic/libcpp_deallocate.sh.cpp
+++ b/libcxx/test/libcxx-03/language.support/support.dynamic/libcpp_deallocate.sh.cpp
@@ -21,8 +21,6 @@
 // GCC doesn't support the aligned-allocation flags.
 // XFAIL: gcc
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 // RUN: %{build} -faligned-allocation -fsized-deallocation
 // RUN: %{run}
 // RUN: %{build} -faligned-allocation -fno-sized-deallocation -DNO_SIZE
@@ -40,7 +38,7 @@
 
 TEST_DIAGNOSTIC_PUSH
 TEST_CLANG_DIAGNOSTIC_IGNORED("-Wprivate-header")
-#include <__memory/aligned_alloc.h>
+#include <__cxx03/__memory/aligned_alloc.h>
 TEST_DIAGNOSTIC_POP
 
 struct alloc_stats {
@@ -138,42 +136,42 @@ void test_libcpp_dealloc() {
   std::size_t with_size_val   = 2;
 
   {
-    std::__libcpp_deallocate_unsized<char>(static_cast<char*>(p), under_align_val);
+    std::__libcpp_deallocate_unsized(p, under_align_val);
     assert(stats.expect_plain());
   }
   stats.reset();
 
 #if defined(NO_SIZE) && defined(NO_ALIGN)
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
+    std::__libcpp_deallocate(p, with_size_val, over_align_val);
     assert(stats.expect_plain());
   }
   stats.reset();
 #elif defined(NO_SIZE)
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
+    std::__libcpp_deallocate(p, with_size_val, over_align_val);
     assert(stats.expect_align(over_align_val));
   }
   stats.reset();
 #elif defined(NO_ALIGN)
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
+    std::__libcpp_deallocate(p, with_size_val, over_align_val);
     assert(stats.expect_size(with_size_val));
   }
   stats.reset();
 #else
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
+    std::__libcpp_deallocate(p, with_size_val, over_align_val);
     assert(stats.expect_size_align(with_size_val, over_align_val));
   }
   stats.reset();
   {
-    std::__libcpp_deallocate_unsized<char>(static_cast<char*>(p), over_align_val);
+    std::__libcpp_deallocate_unsized(p, over_align_val);
     assert(stats.expect_align(over_align_val));
   }
   stats.reset();
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), under_align_val);
+    std::__libcpp_deallocate(p, with_size_val, under_align_val);
     assert(stats.expect_size(with_size_val));
   }
   stats.reset();
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
index 3c9cf9b..bede567 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
@@ -12,8 +12,6 @@
 //   constexpr OutIter   // constexpr after C++17
 //   copy(InIter first, InIter last, OutIter result);
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <algorithm>
 #include <cassert>
 #include <vector>
diff --git a/libcxx/test/std/iterators/iterator.primitives/iterator.operations/distance.pass.cpp b/libcxx/test/std/iterators/iterator.primitives/iterator.operations/distance.pass.cpp
index 13caeff..d92a44f 100644
--- a/libcxx/test/std/iterators/iterator.primitives/iterator.operations/distance.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.primitives/iterator.operations/distance.pass.cpp
@@ -16,38 +16,73 @@
 //   Iter::difference_type
 //   distance(Iter first, Iter last); // constexpr in C++17
 
-#include <iterator>
+#include <array>
 #include <cassert>
+#include <deque>
+#include <iterator>
+#include <vector>
 #include <type_traits>
 
 #include "test_macros.h"
 #include "test_iterators.h"
 
 template <class It>
-TEST_CONSTEXPR_CXX17
-void check_distance(It first, It last, typename std::iterator_traits<It>::difference_type dist)
-{
-    typedef typename std::iterator_traits<It>::difference_type Difference;
-    static_assert(std::is_same<decltype(std::distance(first, last)), Difference>::value, "");
-    assert(std::distance(first, last) == dist);
+TEST_CONSTEXPR_CXX17 void check_distance(It first, It last, typename std::iterator_traits<It>::difference_type dist) {
+  typedef typename std::iterator_traits<It>::difference_type Difference;
+  static_assert(std::is_same<decltype(std::distance(first, last)), Difference>::value, "");
+  assert(std::distance(first, last) == dist);
 }
 
-TEST_CONSTEXPR_CXX17 bool tests()
-{
-    const char* s = "1234567890";
-    check_distance(cpp17_input_iterator<const char*>(s), cpp17_input_iterator<const char*>(s+10), 10);
-    check_distance(forward_iterator<const char*>(s), forward_iterator<const char*>(s+10), 10);
-    check_distance(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s+10), 10);
-    check_distance(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+10), 10);
-    check_distance(s, s+10, 10);
-    return true;
+#if TEST_STD_VER >= 20
+/*TEST_CONSTEXPR_CXX26*/ void test_deque() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+  using Container = std::deque<std::deque<double>>;
+  Container c;
+  auto view                    = c | std::views::join;
+  Container::difference_type n = 0;
+  for (std::size_t i = 0; i < 10; ++i) {
+    n += i;
+    c.push_back(Container::value_type(i));
+  }
+  assert(std::distance(view.begin(), view.end()) == n);
+}
+#endif
+
+TEST_CONSTEXPR_CXX17 bool tests() {
+  const char* s = "1234567890";
+  check_distance(cpp17_input_iterator<const char*>(s), cpp17_input_iterator<const char*>(s + 10), 10);
+  check_distance(forward_iterator<const char*>(s), forward_iterator<const char*>(s + 10), 10);
+  check_distance(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s + 10), 10);
+  check_distance(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s + 10), 10);
+  check_distance(s, s + 10, 10);
+
+#if TEST_STD_VER >= 20
+  {
+    using Container = std::vector<std::vector<int>>;
+    Container c;
+    auto view                    = c | std::views::join;
+    Container::difference_type n = 0;
+    for (std::size_t i = 0; i < 10; ++i) {
+      n += i;
+      c.push_back(Container::value_type(i));
+    }
+    assert(std::distance(view.begin(), view.end()) == n);
+  }
+  {
+    using Container = std::array<std::array<char, 3>, 10>;
+    Container c;
+    auto view = c | std::views::join;
+    assert(std::distance(view.begin(), view.end()) == 30);
+  }
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_deque();
+#endif
+  return true;
 }
 
-int main(int, char**)
-{
-    tests();
+int main(int, char**) {
+  tests();
 #if TEST_STD_VER >= 17
-    static_assert(tests(), "");
+  static_assert(tests(), "");
 #endif
-    return 0;
+  return 0;
 }
diff --git a/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.distance/iterator_sentinel.pass.cpp b/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.distance/iterator_sentinel.pass.cpp
index b4199b7..1b78489 100644
--- a/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.distance/iterator_sentinel.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.primitives/range.iter.ops/range.iter.ops.distance/iterator_sentinel.pass.cpp
@@ -15,18 +15,21 @@
 // template<class I, sized_sentinel_for<decay_t<I>> S>
 //   constexpr iter_difference_t<I> ranges::distance(I&& first, S last); // TODO: update when LWG3664 is resolved
 
-#include <iterator>
+#include <array>
 #include <cassert>
+#include <deque>
+#include <iterator>
+#include <vector>
 
 #include "test_iterators.h"
 #include "test_macros.h"
 
-template<class It, class Sent>
+template <class It, class Sent>
 constexpr void test_unsized() {
   static_assert(std::sentinel_for<Sent, It> && !std::sized_sentinel_for<Sent, It>);
-  int a[3] = {1,2,3};
+  int a[3] = {1, 2, 3};
   {
-    It first = It(a);
+    It first  = It(a);
     auto last = Sent(It(a));
     assert(std::ranges::distance(first, last) == 0);
     assert(std::ranges::distance(It(a), last) == 0);
@@ -36,7 +39,7 @@ constexpr void test_unsized() {
   }
   {
     auto check = [&a]<class ItQual, class SentQual> {
-      It first = It(a);
+      It first  = It(a);
       Sent last = Sent(It(a + 3));
       assert(std::ranges::distance(static_cast<ItQual>(first), static_cast<SentQual>(last)) == 3);
     };
@@ -61,13 +64,13 @@ constexpr void test_unsized() {
   }
 }
 
-template<class It, class Sent>
+template <class It, class Sent>
 constexpr void test_sized() {
   static_assert(std::sized_sentinel_for<Sent, It>);
-  int a[] = {1,2,3};
+  int a[] = {1, 2, 3};
   {
     auto check = [&a]<class ItQual, class SentQual> {
-      It first = It(a + 3);
+      It first  = It(a + 3);
       Sent last = Sent(It(a));
       assert(std::ranges::distance(static_cast<ItQual>(first), static_cast<SentQual>(last)) == -3);
     };
@@ -91,7 +94,7 @@ constexpr void test_sized() {
     check.template operator()<const It&&, const Sent&&>();
   }
   {
-    It first = It(a);
+    It first  = It(a);
     auto last = Sent(It(a));
     assert(std::ranges::distance(first, last) == 0);
     assert(std::ranges::distance(It(a), last) == 0);
@@ -100,7 +103,7 @@ constexpr void test_sized() {
     ASSERT_SAME_TYPE(decltype(std::ranges::distance(It(a), Sent(It(a)))), std::iter_difference_t<It>);
   }
   {
-    It first = It(a);
+    It first  = It(a);
     auto last = Sent(It(a + 3));
     assert(std::ranges::distance(first, last) == 3);
     assert(std::ranges::distance(It(a), last) == 3);
@@ -110,13 +113,17 @@ constexpr void test_sized() {
 }
 
 struct StrideCounter {
-  int *it_;
-  int *inc_;
-  using value_type = int;
+  int* it_;
+  int* inc_;
+  using value_type      = int;
   using difference_type = int;
   explicit StrideCounter();
-  constexpr explicit StrideCounter(int *it, int *inc) : it_(it), inc_(inc) {}
-  constexpr auto& operator++() { ++it_; *inc_ += 1; return *this; }
+  constexpr explicit StrideCounter(int* it, int* inc) : it_(it), inc_(inc) {}
+  constexpr auto& operator++() {
+    ++it_;
+    *inc_ += 1;
+    return *this;
+  }
   StrideCounter operator++(int);
   int& operator*() const;
   bool operator==(StrideCounter) const;
@@ -125,11 +132,11 @@ static_assert(std::forward_iterator<StrideCounter>);
 static_assert(!std::sized_sentinel_for<StrideCounter, StrideCounter>);
 
 struct SizedStrideCounter {
-  int *it_;
-  int *minus_;
+  int* it_;
+  int* minus_;
   using value_type = int;
   explicit SizedStrideCounter();
-  constexpr explicit SizedStrideCounter(int *it, int *minus) : it_(it), minus_(minus) {}
+  constexpr explicit SizedStrideCounter(int* it, int* minus) : it_(it), minus_(minus) {}
   SizedStrideCounter& operator++();
   SizedStrideCounter operator++(int);
   int& operator*() const;
@@ -147,22 +154,34 @@ constexpr void test_stride_counting() {
     int a[] = {1, 2, 3};
     int inc = 0;
     StrideCounter first(a, &inc);
-    StrideCounter last(a+3, nullptr);
+    StrideCounter last(a + 3, nullptr);
     std::same_as<int> auto result = std::ranges::distance(first, last);
     assert(result == 3);
     assert(inc == 3);
   }
   {
-    int a[] = {1, 2, 3};
+    int a[]   = {1, 2, 3};
     int minus = 0;
     SizedStrideCounter first(a, &minus);
-    SizedStrideCounter last(a+3, nullptr);
+    SizedStrideCounter last(a + 3, nullptr);
     std::same_as<int> auto result = std::ranges::distance(first, last);
     assert(result == 3);
     assert(minus == 1);
   }
 }
 
+/*TEST_CONSTEXPR_CXX26*/ void test_deque() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+  using Container = std::deque<std::deque<double>>;
+  Container c;
+  auto view                    = c | std::views::join;
+  Container::difference_type n = 0;
+  for (std::size_t i = 0; i < 10; ++i) {
+    n += i;
+    c.push_back(Container::value_type(i));
+  }
+  assert(std::ranges::distance(view.begin(), view.end()) == n);
+}
+
 constexpr bool test() {
   {
     int a[] = {1, 2, 3};
@@ -197,7 +216,7 @@ constexpr bool test() {
   test_sized<contiguous_iterator<int*>, contiguous_iterator<int*>>();
 
   {
-    using It = cpp20_input_iterator<int*>;  // non-copyable, thus not a sentinel for itself
+    using It = cpp20_input_iterator<int*>; // non-copyable, thus not a sentinel for itself
     static_assert(!std::is_copy_constructible_v<It>);
     static_assert(!std::sentinel_for<It, It>);
     static_assert(!std::is_invocable_v<decltype(std::ranges::distance), It&, It&>);
@@ -206,10 +225,10 @@ constexpr bool test() {
     static_assert(!std::is_invocable_v<decltype(std::ranges::distance), It&&, It&&>);
   }
   {
-    using It = cpp20_input_iterator<int*>;  // non-copyable
-    using Sent = sentinel_wrapper<It>;  // not a sized sentinel
+    using It   = cpp20_input_iterator<int*>; // non-copyable
+    using Sent = sentinel_wrapper<It>;       // not a sized sentinel
     static_assert(std::sentinel_for<Sent, It> && !std::sized_sentinel_for<Sent, It>);
-    int a[] = {1,2,3};
+    int a[]   = {1, 2, 3};
     Sent last = Sent(It(a + 3));
     static_assert(!std::is_invocable_v<decltype(std::ranges::distance), It&, Sent&>);
     static_assert(!std::is_invocable_v<decltype(std::ranges::distance), It&, Sent&&>);
@@ -217,7 +236,7 @@ constexpr bool test() {
     assert(std::ranges::distance(It(a), Sent(It(a + 3))) == 3);
   }
   {
-    using It = cpp17_input_iterator<int*>;  // not a sentinel for itself
+    using It = cpp17_input_iterator<int*>; // not a sentinel for itself
     static_assert(!std::sentinel_for<It, It>);
     static_assert(!std::is_invocable_v<decltype(std::ranges::distance), It&, It&>);
     static_assert(!std::is_invocable_v<decltype(std::ranges::distance), It&, It&&>);
@@ -231,6 +250,26 @@ constexpr bool test() {
   static_assert(!std::is_invocable_v<decltype(std::ranges::distance), int, int*>);
   static_assert(!std::is_invocable_v<decltype(std::ranges::distance), int*, char*>);
 
+  {
+    using Container = std::vector<std::vector<int>>;
+    Container c;
+    auto view                    = c | std::views::join;
+    Container::difference_type n = 0;
+    for (std::size_t i = 0; i < 10; ++i) {
+      n += i;
+      c.push_back(Container::value_type(i));
+    }
+    assert(std::ranges::distance(view.begin(), view.end()) == n);
+  }
+  {
+    using Container = std::array<std::array<char, 3>, 10>;
+    Container c;
+    auto view = c | std::views::join;
+    assert(std::ranges::distance(view.begin(), view.end()) == 30);
+  }
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_deque();
+
   return true;
 }
 
diff --git a/libcxx/test/std/language.support/support.runtime/cstdalign.compile.pass.cpp b/libcxx/test/std/language.support/support.runtime/cstdalign.compile.pass.cpp
index 69296df..d289ef6 100644
--- a/libcxx/test/std/language.support/support.runtime/cstdalign.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.runtime/cstdalign.compile.pass.cpp
@@ -10,7 +10,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
+// UNSUPPORTED: c++03
 
 #include <cstdalign>
 
diff --git a/lldb/include/lldb/Utility/XcodeSDK.h b/lldb/include/lldb/Utility/XcodeSDK.h
index 5b345a4..5f89019 100644
--- a/lldb/include/lldb/Utility/XcodeSDK.h
+++ b/lldb/include/lldb/Utility/XcodeSDK.h
@@ -38,7 +38,7 @@ public:
     watchOS,
     XRSimulator,
     XROS,
-    bridgeOS,
+    BridgeOS,
     Linux,
     unknown = -1
   };
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 8eb64b4..a3d924d 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -27,6 +27,10 @@ from typing import (
     Literal,
 )
 
+# set timeout based on whether ASAN was enabled or not. Increase
+# timeout by a factor of 10 if ASAN is enabled.
+DEFAULT_TIMEOUT = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1)
+
 ## DAP type references
 
 
@@ -282,26 +286,24 @@ class DebugCommunication(object):
     def collect_output(
         self,
         category: str,
-        timeout: float,
         pattern: Optional[str] = None,
         clear=True,
     ) -> str:
         """Collect output from 'output' events.
         Args:
             category: The category to collect.
-            timeout: The max duration for collecting output.
             pattern:
                 Optional, if set, return once this pattern is detected in the
                 collected output.
         Returns:
             The collected output.
         """
-        deadline = time.monotonic() + timeout
+        deadline = time.monotonic() + DEFAULT_TIMEOUT
         output = self.get_output(category, clear)
         while deadline >= time.monotonic() and (
             pattern is None or pattern not in output
         ):
-            event = self.wait_for_event(["output"], timeout=deadline - time.monotonic())
+            event = self.wait_for_event(["output"])
             if not event:  # Timeout or EOF
                 break
             output += self.get_output(category, clear=clear)
@@ -339,7 +341,7 @@ class DebugCommunication(object):
         self,
         *,
         predicate: Optional[Callable[[ProtocolMessage], bool]] = None,
-        timeout: Optional[float] = None,
+        timeout: Optional[float] = DEFAULT_TIMEOUT,
     ) -> Optional[ProtocolMessage]:
         """Processes received packets from the adapter.
         Updates the DebugCommunication stateful properties based on the received
@@ -555,25 +557,20 @@ class DebugCommunication(object):
 
         return cast(Optional[Response], self._recv_packet(predicate=predicate))
 
-    def wait_for_event(
-        self, filter: List[str] = [], timeout: Optional[float] = None
-    ) -> Optional[Event]:
+    def wait_for_event(self, filter: List[str] = []) -> Optional[Event]:
         """Wait for the first event that matches the filter."""
 
         def predicate(p: ProtocolMessage):
             return p["type"] == "event" and p["event"] in filter
 
         return cast(
-            Optional[Event], self._recv_packet(predicate=predicate, timeout=timeout)
+            Optional[Event],
+            self._recv_packet(predicate=predicate),
         )
 
-    def wait_for_stopped(
-        self, timeout: Optional[float] = None
-    ) -> Optional[List[Event]]:
+    def wait_for_stopped(self) -> Optional[List[Event]]:
         stopped_events = []
-        stopped_event = self.wait_for_event(
-            filter=["stopped", "exited"], timeout=timeout
-        )
+        stopped_event = self.wait_for_event(filter=["stopped", "exited"])
         while stopped_event:
             stopped_events.append(stopped_event)
             # If we exited, then we are done
@@ -582,26 +579,28 @@ class DebugCommunication(object):
             # Otherwise we stopped and there might be one or more 'stopped'
             # events for each thread that stopped with a reason, so keep
             # checking for more 'stopped' events and return all of them
-            stopped_event = self.wait_for_event(
-                filter=["stopped", "exited"], timeout=0.25
+            # Use a shorter timeout for additional stopped events
+            def predicate(p: ProtocolMessage):
+                return p["type"] == "event" and p["event"] in ["stopped", "exited"]
+
+            stopped_event = cast(
+                Optional[Event], self._recv_packet(predicate=predicate, timeout=0.25)
             )
         return stopped_events
 
-    def wait_for_breakpoint_events(self, timeout: Optional[float] = None):
+    def wait_for_breakpoint_events(self):
         breakpoint_events: list[Event] = []
         while True:
-            event = self.wait_for_event(["breakpoint"], timeout=timeout)
+            event = self.wait_for_event(["breakpoint"])
             if not event:
                 break
             breakpoint_events.append(event)
         return breakpoint_events
 
-    def wait_for_breakpoints_to_be_verified(
-        self, breakpoint_ids: list[str], timeout: Optional[float] = None
-    ):
+    def wait_for_breakpoints_to_be_verified(self, breakpoint_ids: list[str]):
         """Wait for all breakpoints to be verified. Return all unverified breakpoints."""
         while any(id not in self.resolved_breakpoints for id in breakpoint_ids):
-            breakpoint_event = self.wait_for_event(["breakpoint"], timeout=timeout)
+            breakpoint_event = self.wait_for_event(["breakpoint"])
             if breakpoint_event is None:
                 break
 
@@ -614,14 +613,14 @@ class DebugCommunication(object):
             )
         ]
 
-    def wait_for_exited(self, timeout: Optional[float] = None):
-        event_dict = self.wait_for_event(["exited"], timeout=timeout)
+    def wait_for_exited(self):
+        event_dict = self.wait_for_event(["exited"])
         if event_dict is None:
             raise ValueError("didn't get exited event")
         return event_dict
 
-    def wait_for_terminated(self, timeout: Optional[float] = None):
-        event_dict = self.wait_for_event(["terminated"], timeout)
+    def wait_for_terminated(self):
+        event_dict = self.wait_for_event(["terminated"])
         if event_dict is None:
             raise ValueError("didn't get terminated event")
         return event_dict
@@ -1610,7 +1609,7 @@ class DebugAdapterServer(DebugCommunication):
                     # new messages will arrive and it should shutdown on its
                     # own.
                     process.stdin.close()
-                    process.wait(timeout=20)
+                    process.wait(timeout=DEFAULT_TIMEOUT)
                 except subprocess.TimeoutExpired:
                     process.kill()
                     process.wait()
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index f7b1ed8..29935bb 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -18,7 +18,7 @@ import base64
 class DAPTestCaseBase(TestBase):
     # set timeout based on whether ASAN was enabled or not. Increase
     # timeout by a factor of 10 if ASAN is enabled.
-    DEFAULT_TIMEOUT = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1)
+    DEFAULT_TIMEOUT = dap_server.DEFAULT_TIMEOUT
     NO_DEBUG_INFO_TESTCASE = True
 
     def create_debug_adapter(
@@ -118,11 +118,9 @@ class DAPTestCaseBase(TestBase):
             self.wait_for_breakpoints_to_resolve(breakpoint_ids)
         return breakpoint_ids
 
-    def wait_for_breakpoints_to_resolve(
-        self, breakpoint_ids: list[str], timeout: Optional[float] = DEFAULT_TIMEOUT
-    ):
+    def wait_for_breakpoints_to_resolve(self, breakpoint_ids: list[str]):
         unresolved_breakpoints = self.dap_server.wait_for_breakpoints_to_be_verified(
-            breakpoint_ids, timeout
+            breakpoint_ids
         )
         self.assertEqual(
             len(unresolved_breakpoints),
@@ -134,11 +132,10 @@ class DAPTestCaseBase(TestBase):
         self,
         predicate: Callable[[], bool],
         delay: float = 0.5,
-        timeout: float = DEFAULT_TIMEOUT,
     ) -> bool:
         """Repeatedly run the predicate until either the predicate returns True
         or a timeout has occurred."""
-        deadline = time.monotonic() + timeout
+        deadline = time.monotonic() + self.DEFAULT_TIMEOUT
         while deadline > time.monotonic():
             if predicate():
                 return True
@@ -155,15 +152,13 @@ class DAPTestCaseBase(TestBase):
         if key in self.dap_server.capabilities:
             self.assertEqual(self.dap_server.capabilities[key], False, msg)
 
-    def verify_breakpoint_hit(
-        self, breakpoint_ids: List[Union[int, str]], timeout: float = DEFAULT_TIMEOUT
-    ):
+    def verify_breakpoint_hit(self, breakpoint_ids: List[Union[int, str]]):
         """Wait for the process we are debugging to stop, and verify we hit
         any breakpoint location in the "breakpoint_ids" array.
         "breakpoint_ids" should be a list of breakpoint ID strings
         (["1", "2"]). The return value from self.set_source_breakpoints()
         or self.set_function_breakpoints() can be passed to this function"""
-        stopped_events = self.dap_server.wait_for_stopped(timeout)
+        stopped_events = self.dap_server.wait_for_stopped()
         normalized_bp_ids = [str(b) for b in breakpoint_ids]
         for stopped_event in stopped_events:
             if "body" in stopped_event:
@@ -186,11 +181,11 @@ class DAPTestCaseBase(TestBase):
             f"breakpoint not hit, wanted breakpoint_ids {breakpoint_ids} in stopped_events {stopped_events}",
         )
 
-    def verify_all_breakpoints_hit(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
+    def verify_all_breakpoints_hit(self, breakpoint_ids):
         """Wait for the process we are debugging to stop, and verify we hit
         all of the breakpoint locations in the "breakpoint_ids" array.
         "breakpoint_ids" should be a list of int breakpoint IDs ([1, 2])."""
-        stopped_events = self.dap_server.wait_for_stopped(timeout)
+        stopped_events = self.dap_server.wait_for_stopped()
         for stopped_event in stopped_events:
             if "body" in stopped_event:
                 body = stopped_event["body"]
@@ -208,12 +203,12 @@ class DAPTestCaseBase(TestBase):
                     return
         self.assertTrue(False, f"breakpoints not hit, stopped_events={stopped_events}")
 
-    def verify_stop_exception_info(self, expected_description, timeout=DEFAULT_TIMEOUT):
+    def verify_stop_exception_info(self, expected_description):
         """Wait for the process we are debugging to stop, and verify the stop
         reason is 'exception' and that the description matches
         'expected_description'
         """
-        stopped_events = self.dap_server.wait_for_stopped(timeout)
+        stopped_events = self.dap_server.wait_for_stopped()
         for stopped_event in stopped_events:
             if "body" in stopped_event:
                 body = stopped_event["body"]
@@ -338,26 +333,14 @@ class DAPTestCaseBase(TestBase):
     def get_important(self):
         return self.dap_server.get_output("important")
 
-    def collect_stdout(
-        self, timeout: float = DEFAULT_TIMEOUT, pattern: Optional[str] = None
-    ) -> str:
-        return self.dap_server.collect_output(
-            "stdout", timeout=timeout, pattern=pattern
-        )
+    def collect_stdout(self, pattern: Optional[str] = None) -> str:
+        return self.dap_server.collect_output("stdout", pattern=pattern)
 
-    def collect_console(
-        self, timeout: float = DEFAULT_TIMEOUT, pattern: Optional[str] = None
-    ) -> str:
-        return self.dap_server.collect_output(
-            "console", timeout=timeout, pattern=pattern
-        )
+    def collect_console(self, pattern: Optional[str] = None) -> str:
+        return self.dap_server.collect_output("console", pattern=pattern)
 
-    def collect_important(
-        self, timeout: float = DEFAULT_TIMEOUT, pattern: Optional[str] = None
-    ) -> str:
-        return self.dap_server.collect_output(
-            "important", timeout=timeout, pattern=pattern
-        )
+    def collect_important(self, pattern: Optional[str] = None) -> str:
+        return self.dap_server.collect_output("important", pattern=pattern)
 
     def get_local_as_int(self, name, threadId=None):
         value = self.dap_server.get_local_variable_value(name, threadId=threadId)
@@ -393,14 +376,13 @@ class DAPTestCaseBase(TestBase):
         targetId=None,
         waitForStop=True,
         granularity="statement",
-        timeout=DEFAULT_TIMEOUT,
     ):
         response = self.dap_server.request_stepIn(
             threadId=threadId, targetId=targetId, granularity=granularity
         )
         self.assertTrue(response["success"])
         if waitForStop:
-            return self.dap_server.wait_for_stopped(timeout)
+            return self.dap_server.wait_for_stopped()
         return None
 
     def stepOver(
@@ -408,7 +390,6 @@ class DAPTestCaseBase(TestBase):
         threadId=None,
         waitForStop=True,
         granularity="statement",
-        timeout=DEFAULT_TIMEOUT,
     ):
         response = self.dap_server.request_next(
             threadId=threadId, granularity=granularity
@@ -417,40 +398,40 @@ class DAPTestCaseBase(TestBase):
             response["success"], f"next request failed: response {response}"
         )
         if waitForStop:
-            return self.dap_server.wait_for_stopped(timeout)
+            return self.dap_server.wait_for_stopped()
         return None
 
-    def stepOut(self, threadId=None, waitForStop=True, timeout=DEFAULT_TIMEOUT):
+    def stepOut(self, threadId=None, waitForStop=True):
         self.dap_server.request_stepOut(threadId=threadId)
         if waitForStop:
-            return self.dap_server.wait_for_stopped(timeout)
+            return self.dap_server.wait_for_stopped()
         return None
 
     def do_continue(self):  # `continue` is a keyword.
         resp = self.dap_server.request_continue()
         self.assertTrue(resp["success"], f"continue request failed: {resp}")
 
-    def continue_to_next_stop(self, timeout=DEFAULT_TIMEOUT):
+    def continue_to_next_stop(self):
         self.do_continue()
-        return self.dap_server.wait_for_stopped(timeout)
+        return self.dap_server.wait_for_stopped()
 
-    def continue_to_breakpoint(self, breakpoint_id: str, timeout=DEFAULT_TIMEOUT):
-        self.continue_to_breakpoints((breakpoint_id), timeout)
+    def continue_to_breakpoint(self, breakpoint_id: str):
+        self.continue_to_breakpoints((breakpoint_id))
 
-    def continue_to_breakpoints(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
+    def continue_to_breakpoints(self, breakpoint_ids):
         self.do_continue()
-        self.verify_breakpoint_hit(breakpoint_ids, timeout)
+        self.verify_breakpoint_hit(breakpoint_ids)
 
-    def continue_to_exception_breakpoint(self, filter_label, timeout=DEFAULT_TIMEOUT):
+    def continue_to_exception_breakpoint(self, filter_label):
         self.do_continue()
         self.assertTrue(
-            self.verify_stop_exception_info(filter_label, timeout),
+            self.verify_stop_exception_info(filter_label),
             'verify we got "%s"' % (filter_label),
         )
 
-    def continue_to_exit(self, exitCode=0, timeout=DEFAULT_TIMEOUT):
+    def continue_to_exit(self, exitCode=0):
         self.do_continue()
-        stopped_events = self.dap_server.wait_for_stopped(timeout)
+        stopped_events = self.dap_server.wait_for_stopped()
         self.assertEqual(
             len(stopped_events), 1, "stopped_events = {}".format(stopped_events)
         )
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
index aea6b9f..5ba642b 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
@@ -931,6 +931,7 @@ class GdbRemoteTestCaseBase(Base, metaclass=GdbRemoteTestCaseFactory):
         "QNonStop",
         "SupportedWatchpointTypes",
         "SupportedCompressions",
+        "MultiMemRead",
     ]
 
     def parse_qSupported_response(self, context):
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
index 6d8f41a..460c503 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
@@ -260,6 +260,7 @@ bool ClassDescriptorV2::method_list_t::Read(Process *process,
   uint32_t entsize = extractor.GetU32_unchecked(&cursor);
   m_is_small = (entsize & 0x80000000) != 0;
   m_has_direct_selector = (entsize & 0x40000000) != 0;
+  m_has_relative_types = (entsize & 0x20000000) != 0;
   m_entsize = entsize & 0xfffc;
   m_count = extractor.GetU32_unchecked(&cursor);
   m_first_ptr = addr + cursor;
@@ -269,8 +270,9 @@ bool ClassDescriptorV2::method_list_t::Read(Process *process,
 
 llvm::SmallVector<ClassDescriptorV2::method_t, 0>
 ClassDescriptorV2::ReadMethods(llvm::ArrayRef<lldb::addr_t> addresses,
-                               lldb::addr_t relative_selector_base_addr,
-                               bool is_small, bool has_direct_sel) const {
+                               lldb::addr_t relative_string_base_addr,
+                               bool is_small, bool has_direct_sel,
+                               bool has_relative_types) const {
   lldb_private::Process *process = m_runtime.GetProcess();
   if (!process)
     return {};
@@ -297,8 +299,8 @@ ClassDescriptorV2::ReadMethods(llvm::ArrayRef<lldb::addr_t> addresses,
                             process->GetByteOrder(),
                             process->GetAddressByteSize());
     methods.push_back(method_t());
-    methods.back().Read(extractor, process, addr, relative_selector_base_addr,
-                        is_small, has_direct_sel);
+    methods.back().Read(extractor, process, addr, relative_string_base_addr,
+                        is_small, has_direct_sel, has_relative_types);
   }
 
   return methods;
@@ -306,8 +308,9 @@ ClassDescriptorV2::ReadMethods(llvm::ArrayRef<lldb::addr_t> addresses,
 
 bool ClassDescriptorV2::method_t::Read(DataExtractor &extractor,
                                        Process *process, lldb::addr_t addr,
-                                       lldb::addr_t relative_selector_base_addr,
-                                       bool is_small, bool has_direct_sel) {
+                                       lldb::addr_t relative_string_base_addr,
+                                       bool is_small, bool has_direct_sel,
+                                       bool has_relative_types) {
   lldb::offset_t cursor = 0;
 
   if (is_small) {
@@ -323,10 +326,13 @@ bool ClassDescriptorV2::method_t::Read(DataExtractor &extractor,
       m_name_ptr = process->ReadPointerFromMemory(m_name_ptr, error);
       if (error.Fail())
         return false;
-    } else if (relative_selector_base_addr != LLDB_INVALID_ADDRESS) {
-      m_name_ptr = relative_selector_base_addr + nameref_offset;
+    } else if (relative_string_base_addr != LLDB_INVALID_ADDRESS) {
+      m_name_ptr = relative_string_base_addr + nameref_offset;
     }
-    m_types_ptr = addr + 4 + types_offset;
+    if (has_relative_types)
+      m_types_ptr = relative_string_base_addr + types_offset;
+    else
+      m_types_ptr = addr + 4 + types_offset;
     m_imp_ptr = addr + 8 + imp_offset;
   } else {
     m_name_ptr = extractor.GetAddress_unchecked(&cursor);
@@ -481,7 +487,8 @@ bool ClassDescriptorV2::ProcessMethodList(
 
   llvm::SmallVector<method_t, 0> methods =
       ReadMethods(addresses, m_runtime.GetRelativeSelectorBaseAddr(),
-                  method_list.m_is_small, method_list.m_has_direct_selector);
+                  method_list.m_is_small, method_list.m_has_direct_selector,
+                  method_list.m_has_relative_types);
 
   for (const auto &method : methods)
     if (instance_method_func(method.m_name.c_str(), method.m_types.c_str()))
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
index 78b3311..0fff9af 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
@@ -143,6 +143,7 @@ private:
     uint16_t m_entsize;
     bool m_is_small;
     bool m_has_direct_selector;
+    bool m_has_relative_types;
     uint32_t m_count;
     lldb::addr_t m_first_ptr;
 
@@ -173,14 +174,14 @@ private:
     }
 
     bool Read(DataExtractor &extractor, Process *process, lldb::addr_t addr,
-              lldb::addr_t relative_selector_base_addr, bool is_small,
-              bool has_direct_sel);
+              lldb::addr_t relative_string_base_addr, bool is_small,
+              bool has_direct_sel, bool has_relative_types);
   };
 
   llvm::SmallVector<method_t, 0>
   ReadMethods(llvm::ArrayRef<lldb::addr_t> addresses,
-              lldb::addr_t relative_selector_base_addr, bool is_small,
-              bool has_direct_sel) const;
+              lldb::addr_t relative_string_base_addr, bool is_small,
+              bool has_direct_sel, bool has_relative_types) const;
 
   struct ivar_list_t {
     uint32_t m_entsize;
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
index cd72454..5aad447 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
@@ -1150,7 +1150,7 @@ void PlatformDarwin::AddClangModuleCompilationOptionsForSDKType(
     case XcodeSDK::Type::XRSimulator:
     case XcodeSDK::Type::XROS:
       // FIXME: Pass the right argument once it exists.
-    case XcodeSDK::Type::bridgeOS:
+    case XcodeSDK::Type::BridgeOS:
     case XcodeSDK::Type::Linux:
     case XcodeSDK::Type::unknown:
       if (Log *log = GetLog(LLDBLog::Host)) {
diff --git a/lldb/source/Utility/XcodeSDK.cpp b/lldb/source/Utility/XcodeSDK.cpp
index 2040791..89e05de 100644
--- a/lldb/source/Utility/XcodeSDK.cpp
+++ b/lldb/source/Utility/XcodeSDK.cpp
@@ -38,8 +38,8 @@ static llvm::StringRef GetName(XcodeSDK::Type type) {
     return "XRSimulator";
   case XcodeSDK::XROS:
     return "XROS";
-  case XcodeSDK::bridgeOS:
-    return "bridgeOS";
+  case XcodeSDK::BridgeOS:
+    return "BridgeOS";
   case XcodeSDK::Linux:
     return "Linux";
   case XcodeSDK::unknown:
@@ -83,8 +83,8 @@ static XcodeSDK::Type ParseSDKName(llvm::StringRef &name) {
     return XcodeSDK::XRSimulator;
   if (name.consume_front("XROS"))
     return XcodeSDK::XROS;
-  if (name.consume_front("bridgeOS"))
-    return XcodeSDK::bridgeOS;
+  if (name.consume_front("BridgeOS"))
+    return XcodeSDK::BridgeOS;
   if (name.consume_front("Linux"))
     return XcodeSDK::Linux;
   static_assert(XcodeSDK::Linux == XcodeSDK::numSDKTypes - 1,
@@ -204,7 +204,7 @@ std::string XcodeSDK::GetCanonicalName(XcodeSDK::Info info) {
   case XROS:
     name = "xros";
     break;
-  case bridgeOS:
+  case BridgeOS:
     name = "bridgeos";
     break;
   case Linux:
diff --git a/lldb/test/API/tools/lldb-dap/attach-commands/TestDAP_attachCommands.py b/lldb/test/API/tools/lldb-dap/attach-commands/TestDAP_attachCommands.py
index ed373f2..9e29f07 100644
--- a/lldb/test/API/tools/lldb-dap/attach-commands/TestDAP_attachCommands.py
+++ b/lldb/test/API/tools/lldb-dap/attach-commands/TestDAP_attachCommands.py
@@ -71,7 +71,7 @@ class TestDAP_attachCommands(lldbdap_testcase.DAPTestCaseBase):
         breakpoint_ids = self.set_function_breakpoints(functions)
         self.assertEqual(len(breakpoint_ids), len(functions), "expect one breakpoint")
         self.continue_to_breakpoints(breakpoint_ids)
-        output = self.collect_console(timeout=10, pattern=stopCommands[-1])
+        output = self.collect_console(pattern=stopCommands[-1])
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue after launch and hit the "pause()" call and stop the target.
@@ -81,7 +81,7 @@ class TestDAP_attachCommands(lldbdap_testcase.DAPTestCaseBase):
         time.sleep(0.5)
         self.dap_server.request_pause()
         self.dap_server.wait_for_stopped()
-        output = self.collect_console(timeout=10, pattern=stopCommands[-1])
+        output = self.collect_console(pattern=stopCommands[-1])
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue until the program exits
@@ -90,7 +90,6 @@ class TestDAP_attachCommands(lldbdap_testcase.DAPTestCaseBase):
         # "exitCommands" that were run after the second breakpoint was hit
         # and the "terminateCommands" due to the debugging session ending
         output = self.collect_console(
-            timeout=10.0,
             pattern=terminateCommands[0],
         )
         self.verify_commands("exitCommands", output, exitCommands)
@@ -141,7 +140,6 @@ class TestDAP_attachCommands(lldbdap_testcase.DAPTestCaseBase):
         # "terminateCommands"
         self.dap_server.request_disconnect(terminateDebuggee=True)
         output = self.collect_console(
-            timeout=1.0,
             pattern=terminateCommands[0],
         )
         self.verify_commands("terminateCommands", output, terminateCommands)
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py
index 151ad76..beab4d6 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py
@@ -82,14 +82,14 @@ class TestDAP_breakpointEvents(lldbdap_testcase.DAPTestCaseBase):
             )
 
         # Flush the breakpoint events.
-        self.dap_server.wait_for_breakpoint_events(timeout=5)
+        self.dap_server.wait_for_breakpoint_events()
 
         # Continue to the breakpoint
         self.continue_to_breakpoints(dap_breakpoint_ids)
 
         verified_breakpoint_ids = []
         unverified_breakpoint_ids = []
-        for breakpoint_event in self.dap_server.wait_for_breakpoint_events(timeout=5):
+        for breakpoint_event in self.dap_server.wait_for_breakpoint_events():
             breakpoint = breakpoint_event["body"]["breakpoint"]
             id = breakpoint["id"]
             if breakpoint["verified"]:
diff --git a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
index e722fce..14789a6 100644
--- a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
+++ b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
@@ -46,7 +46,7 @@ class TestDAP_cancel(lldbdap_testcase.DAPTestCaseBase):
 
         # Use a relatively short timeout since this is only to ensure the
         # following request is queued.
-        blocking_seq = self.async_blocking_request(duration=1.0)
+        blocking_seq = self.async_blocking_request(duration=self.DEFAULT_TIMEOUT / 10)
         # Use a longer timeout to ensure we catch if the request was interrupted
         # properly.
         pending_seq = self.async_blocking_request(duration=self.DEFAULT_TIMEOUT / 2)
diff --git a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
index e61d248..f53813a 100644
--- a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
+++ b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
@@ -23,7 +23,6 @@ class TestDAP_commands(lldbdap_testcase.DAPTestCaseBase):
             exitCommands=["?" + command_quiet, command_not_quiet],
         )
         full_output = self.collect_console(
-            timeout=1.0,
             pattern=command_not_quiet,
         )
         self.assertNotIn(command_quiet, full_output)
@@ -51,7 +50,6 @@ class TestDAP_commands(lldbdap_testcase.DAPTestCaseBase):
             expectFailure=True,
         )
         full_output = self.collect_console(
-            timeout=1.0,
             pattern=command_abort_on_error,
         )
         self.assertNotIn(command_quiet, full_output)
diff --git a/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py b/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py
index af5c62a..9fbe9aa 100644
--- a/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py
+++ b/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py
@@ -44,7 +44,7 @@ class TestDAP_io(lldbdap_testcase.DAPTestCaseBase):
         """
         process = self.launch()
         process.stdin.close()
-        self.assertEqual(process.wait(timeout=5.0), EXIT_SUCCESS)
+        self.assertEqual(process.wait(timeout=self.DEFAULT_TIMEOUT), EXIT_SUCCESS)
 
     def test_invalid_header(self):
         """
@@ -54,7 +54,7 @@ class TestDAP_io(lldbdap_testcase.DAPTestCaseBase):
         process = self.launch()
         process.stdin.write(b"not the correct message header")
         process.stdin.close()
-        self.assertEqual(process.wait(timeout=5.0), EXIT_FAILURE)
+        self.assertEqual(process.wait(timeout=self.DEFAULT_TIMEOUT), EXIT_FAILURE)
 
     def test_partial_header(self):
         """
@@ -64,7 +64,7 @@ class TestDAP_io(lldbdap_testcase.DAPTestCaseBase):
         process = self.launch()
         process.stdin.write(b"Content-Length: ")
         process.stdin.close()
-        self.assertEqual(process.wait(timeout=5.0), EXIT_FAILURE)
+        self.assertEqual(process.wait(timeout=self.DEFAULT_TIMEOUT), EXIT_FAILURE)
 
     def test_incorrect_content_length(self):
         """
@@ -74,7 +74,7 @@ class TestDAP_io(lldbdap_testcase.DAPTestCaseBase):
         process = self.launch()
         process.stdin.write(b"Content-Length: abc")
         process.stdin.close()
-        self.assertEqual(process.wait(timeout=5.0), EXIT_FAILURE)
+        self.assertEqual(process.wait(timeout=self.DEFAULT_TIMEOUT), EXIT_FAILURE)
 
     def test_partial_content_length(self):
         """
@@ -84,4 +84,4 @@ class TestDAP_io(lldbdap_testcase.DAPTestCaseBase):
         process = self.launch()
         process.stdin.write(b"Content-Length: 10\r\n\r\n{")
         process.stdin.close()
-        self.assertEqual(process.wait(timeout=5.0), EXIT_FAILURE)
+        self.assertEqual(process.wait(timeout=self.DEFAULT_TIMEOUT), EXIT_FAILURE)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
index ceef95df..8db2316 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
@@ -632,7 +632,27 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
         program = self.getBuildArtifact("a.out")
 
         with tempfile.NamedTemporaryFile("rt") as f:
-            self.launch(program, stdio=[None, f.name, None])
+            self.launch(program, stdio=[None, f.name])
+            self.continue_to_exit()
+            lines = f.readlines()
+            self.assertIn(
+                program, lines[0], "make sure program path is in first argument"
+            )
+
+    @skipIfAsan
+    @skipIfWindows
+    @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
+    def test_stdio_redirection_and_console(self):
+        """
+        Test stdio redirection and console.
+        """
+        self.build_and_create_debug_adapter()
+        program = self.getBuildArtifact("a.out")
+
+        with tempfile.NamedTemporaryFile("rt") as f:
+            self.launch(
+                program, console="integratedTerminal", stdio=[None, f.name, None]
+            )
             self.continue_to_exit()
             lines = f.readlines()
             self.assertIn(
diff --git a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py
index bb835af..1f4afab 100644
--- a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py
+++ b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py
@@ -23,15 +23,15 @@ class TestDAP_module_event(lldbdap_testcase.DAPTestCaseBase):
         self.continue_to_breakpoints(breakpoint_ids)
 
         # We're now stopped at breakpoint 1 before the dlopen. Flush all the module events.
-        event = self.dap_server.wait_for_event(["module"], 0.25)
+        event = self.dap_server.wait_for_event(["module"])
         while event is not None:
-            event = self.dap_server.wait_for_event(["module"], 0.25)
+            event = self.dap_server.wait_for_event(["module"])
 
         # Continue to the second breakpoint, before the dlclose.
         self.continue_to_breakpoints(breakpoint_ids)
 
         # Make sure we got a module event for libother.
-        event = self.dap_server.wait_for_event(["module"], 5)
+        event = self.dap_server.wait_for_event(["module"])
         self.assertIsNotNone(event, "didn't get a module event")
         module_name = event["body"]["module"]["name"]
         module_id = event["body"]["module"]["id"]
@@ -42,7 +42,7 @@ class TestDAP_module_event(lldbdap_testcase.DAPTestCaseBase):
         self.continue_to_breakpoints(breakpoint_ids)
 
         # Make sure we got a module event for libother.
-        event = self.dap_server.wait_for_event(["module"], 5)
+        event = self.dap_server.wait_for_event(["module"])
         self.assertIsNotNone(event, "didn't get a module event")
         reason = event["body"]["reason"]
         self.assertEqual(reason, "removed")
@@ -55,8 +55,4 @@ class TestDAP_module_event(lldbdap_testcase.DAPTestCaseBase):
         self.assertListEqual(list(module_data.keys()), required_keys)
         self.assertEqual(module_data["name"], "", "expects empty name.")
 
-        # Make sure we do not send another event
-        event = self.dap_server.wait_for_event(["module"], 3)
-        self.assertIsNone(event, "expects no events.")
-
         self.continue_to_exit()
diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
index c5a6837..0ed53da 100644
--- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
+++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
@@ -67,7 +67,7 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
         # Collect all the module names we saw as events.
         module_new_names = []
         module_changed_names = []
-        module_event = self.dap_server.wait_for_event(["module"], 1)
+        module_event = self.dap_server.wait_for_event(["module"])
         while module_event is not None:
             reason = module_event["body"]["reason"]
             if reason == "new":
@@ -75,7 +75,7 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
             elif reason == "changed":
                 module_changed_names.append(module_event["body"]["module"]["name"])
 
-            module_event = self.dap_server.wait_for_event(["module"], 1)
+            module_event = self.dap_server.wait_for_event(["module"])
 
         # Make sure we got an event for every active module.
         self.assertNotEqual(len(module_new_names), 0)
diff --git a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
index fe978a9..0065258 100644
--- a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
+++ b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
@@ -29,7 +29,7 @@ class TestDAP_output(lldbdap_testcase.DAPTestCaseBase):
         self.continue_to_breakpoints(breakpoint_ids)
 
         # Ensure partial messages are still sent.
-        output = self.collect_stdout(timeout=1.0, pattern="abcdef")
+        output = self.collect_stdout(pattern="abcdef")
         self.assertTrue(output and len(output) > 0, "expect program stdout")
 
         self.continue_to_exit()
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py
index 6748379..e1ad142 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py
@@ -105,7 +105,7 @@ class TestDAP_restart_console(lldbdap_testcase.DAPTestCaseBase):
         # Restart and check that we still get a stopped event before reaching
         # main.
         self.dap_server.request_restart()
-        stopped_events = self.dap_server.wait_for_stopped(timeout=20)
+        stopped_events = self.dap_server.wait_for_stopped()
         self.verify_stopped_on_entry(stopped_events)
 
         # continue to main
diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp
index 7738913..e7d9b89 100644
--- a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp
@@ -57,7 +57,7 @@ SetupIORedirection(const std::vector<std::optional<std::string>> &stdio,
   size_t n = std::max(stdio.size(), static_cast<size_t>(3));
   for (size_t i = 0; i < n; i++) {
     std::optional<std::string> path;
-    if (stdio.size() < i)
+    if (stdio.size() <= i)
       path = stdio.back();
     else
       path = stdio[i];
@@ -107,7 +107,7 @@ RunInTerminal(DAP &dap, const protocol::LaunchRequestArguments &arguments) {
 
   llvm::json::Object reverse_request = CreateRunInTerminalReverseRequest(
       arguments.configuration.program, arguments.args, arguments.env,
-      arguments.cwd, comm_file.m_path, debugger_pid,
+      arguments.cwd, comm_file.m_path, debugger_pid, arguments.stdio,
       arguments.console == protocol::eConsoleExternalTerminal);
   dap.SendReverseRequest<LogFailureResponseHandler>("runInTerminal",
                                                     std::move(reverse_request));
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index 4f26599..71e91f8 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -866,7 +866,8 @@ llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit) {
 llvm::json::Object CreateRunInTerminalReverseRequest(
     llvm::StringRef program, const std::vector<std::string> &args,
     const llvm::StringMap<std::string> &env, llvm::StringRef cwd,
-    llvm::StringRef comm_file, lldb::pid_t debugger_pid, bool external) {
+    llvm::StringRef comm_file, lldb::pid_t debugger_pid,
+    const std::vector<std::optional<std::string>> &stdio, bool external) {
   llvm::json::Object run_in_terminal_args;
   if (external) {
     // This indicates the IDE to open an external terminal window.
@@ -885,6 +886,18 @@ llvm::json::Object CreateRunInTerminalReverseRequest(
   }
   req_args.push_back("--launch-target");
   req_args.push_back(program.str());
+  if (!stdio.empty()) {
+    req_args.push_back("--stdio");
+    std::stringstream ss;
+    for (const std::optional<std::string> &file : stdio) {
+      if (file)
+        ss << *file;
+      ss << ":";
+    }
+    std::string files = ss.str();
+    files.pop_back();
+    req_args.push_back(std::move(files));
+  }
   req_args.insert(req_args.end(), args.begin(), args.end());
   run_in_terminal_args.try_emplace("args", req_args);
 
diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h
index e9094f6..0c865a3 100644
--- a/lldb/tools/lldb-dap/JSONUtils.h
+++ b/lldb/tools/lldb-dap/JSONUtils.h
@@ -388,6 +388,10 @@ llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit);
 ///     launcher uses it on Linux tell the kernel that it should allow the
 ///     debugger process to attach.
 ///
+/// \param[in] stdio
+///     An array of file paths for redirecting the program's standard IO
+///     streams.
+///
 /// \param[in] external
 ///     If set to true, the program will run in an external terminal window
 ///     instead of IDE's integrated terminal.
@@ -398,7 +402,8 @@ llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit);
 llvm::json::Object CreateRunInTerminalReverseRequest(
     llvm::StringRef program, const std::vector<std::string> &args,
     const llvm::StringMap<std::string> &env, llvm::StringRef cwd,
-    llvm::StringRef comm_file, lldb::pid_t debugger_pid, bool external);
+    llvm::StringRef comm_file, lldb::pid_t debugger_pid,
+    const std::vector<std::optional<std::string>> &stdio, bool external);
 
 /// Create a "Terminated" JSON object that contains statistics
 ///
diff --git a/lldb/tools/lldb-dap/Options.td b/lldb/tools/lldb-dap/Options.td
index c8492c6..5e9dd7a 100644
--- a/lldb/tools/lldb-dap/Options.td
+++ b/lldb/tools/lldb-dap/Options.td
@@ -47,6 +47,12 @@ def debugger_pid: S<"debugger-pid">,
   HelpText<"The PID of the lldb-dap instance that sent the launchInTerminal "
     "request when using --launch-target.">;
 
+def stdio: S<"stdio">,
+  MetaVarName<"<stdin:stdout:stderr:...>">,
+  HelpText<"An array of file paths for redirecting the program's standard IO "
+    "streams. A colon-separated list of entries. Empty value means no "
+    "redirection.">;
+
 def repl_mode
     : S<"repl-mode">,
       MetaVarName<"<mode>">,
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index 92dada2..a85a68b 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -300,6 +300,7 @@ struct LaunchRequestArguments {
   /// terminal or external terminal.
   Console console = eConsoleInternal;
 
+  /// An array of file paths for redirecting the program's standard IO streams.
   std::vector<std::optional<std::string>> stdio;
 
   /// @}
diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index e961c2e..3f0f150 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -626,7 +626,10 @@
               "stdio": {
                 "type": "array",
                 "items": {
-                  "type": "string"
+                  "type": [
+                    "string",
+                    "null"
+                  ]
                 },
                 "description": "The stdio property specifies the redirection targets for the debuggee's stdio streams. A null value redirects a stream to the default debug terminal. String can be a path to file, named pipe or TTY device. If less than three values are provided, the list will be padded with the last value. Specifying more than three values will create additional file descriptors (4, 5, etc.).",
                 "default": []
diff --git a/lldb/tools/lldb-dap/tool/lldb-dap.cpp b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
index 93446c0..e59cef9 100644
--- a/lldb/tools/lldb-dap/tool/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
@@ -16,6 +16,7 @@
 #include "lldb/API/SBStream.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Host/File.h"
+#include "lldb/Host/FileSystem.h"
 #include "lldb/Host/MainLoop.h"
 #include "lldb/Host/MainLoopBase.h"
 #include "lldb/Host/MemoryMonitor.h"
@@ -24,7 +25,9 @@
 #include "lldb/Utility/UriParser.h"
 #include "lldb/lldb-forward.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Option/Arg.h"
@@ -42,8 +45,10 @@
 #include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <condition_variable>
+#include <cstddef>
 #include <cstdio>
 #include <cstdlib>
+#include <exception>
 #include <fcntl.h>
 #include <map>
 #include <memory>
@@ -143,6 +148,74 @@ static void PrintVersion() {
   llvm::outs() << "liblldb: " << lldb::SBDebugger::GetVersionString() << '\n';
 }
 
+#if not defined(_WIN32)
+struct FDGroup {
+  int GetFlags() const {
+    if (read && write)
+      return O_NOCTTY | O_CREAT | O_RDWR;
+    if (read)
+      return O_NOCTTY | O_RDONLY;
+    return O_NOCTTY | O_CREAT | O_WRONLY | O_TRUNC;
+  }
+
+  std::vector<int> fds;
+  bool read = false;
+  bool write = false;
+};
+
+static llvm::Error RedirectToFile(const FDGroup &fdg, llvm::StringRef file) {
+  if (!fdg.read && !fdg.write)
+    return llvm::Error::success();
+  int target_fd = lldb_private::FileSystem::Instance().Open(
+      file.str().c_str(), fdg.GetFlags(), 0666);
+  if (target_fd == -1)
+    return llvm::errorCodeToError(
+        std::error_code(errno, std::generic_category()));
+  for (int fd : fdg.fds) {
+    if (target_fd == fd)
+      continue;
+    if (::dup2(target_fd, fd) == -1)
+      return llvm::errorCodeToError(
+          std::error_code(errno, std::generic_category()));
+  }
+  ::close(target_fd);
+  return llvm::Error::success();
+}
+
+static llvm::Error
+SetupIORedirection(const llvm::SmallVectorImpl<llvm::StringRef> &files) {
+  llvm::SmallDenseMap<llvm::StringRef, FDGroup> groups;
+  for (size_t i = 0; i < files.size(); i++) {
+    if (files[i].empty())
+      continue;
+    auto group = groups.find(files[i]);
+    if (group == groups.end())
+      group = groups.insert({files[i], {{static_cast<int>(i)}}}).first;
+    else
+      group->second.fds.push_back(i);
+    switch (i) {
+    case 0:
+      group->second.read = true;
+      break;
+    case 1:
+    case 2:
+      group->second.write = true;
+      break;
+    default:
+      group->second.read = true;
+      group->second.write = true;
+      break;
+    }
+  }
+  for (const auto &[file, group] : groups) {
+    if (llvm::Error err = RedirectToFile(group, file))
+      return llvm::createStringError(
+          llvm::formatv("{0}: {1}", file, llvm::toString(std::move(err))));
+  }
+  return llvm::Error::success();
+}
+#endif
+
 // If --launch-target is provided, this instance of lldb-dap becomes a
 // runInTerminal launcher. It will ultimately launch the program specified in
 // the --launch-target argument, which is the original program the user wanted
@@ -165,6 +238,7 @@ static void PrintVersion() {
 static llvm::Error LaunchRunInTerminalTarget(llvm::opt::Arg &target_arg,
                                              llvm::StringRef comm_file,
                                              lldb::pid_t debugger_pid,
+                                             llvm::StringRef stdio,
                                              char *argv[]) {
 #if defined(_WIN32)
   return llvm::createStringError(
@@ -179,6 +253,16 @@ static llvm::Error LaunchRunInTerminalTarget(llvm::opt::Arg &target_arg,
     (void)prctl(PR_SET_PTRACER, debugger_pid, 0, 0, 0);
 #endif
 
+  lldb_private::FileSystem::Initialize();
+  if (!stdio.empty()) {
+    llvm::SmallVector<llvm::StringRef, 3> files;
+    stdio.split(files, ':');
+    while (files.size() < 3)
+      files.push_back(files.back());
+    if (llvm::Error err = SetupIORedirection(files))
+      return err;
+  }
+
   RunInTerminalLauncherCommChannel comm_channel(comm_file);
   if (llvm::Error err = comm_channel.NotifyPid())
     return err;
@@ -484,9 +568,10 @@ int main(int argc, char *argv[]) {
           break;
         }
       }
+      llvm::StringRef stdio = input_args.getLastArgValue(OPT_stdio);
       if (llvm::Error err =
               LaunchRunInTerminalTarget(*target_arg, comm_file->getValue(), pid,
-                                        argv + target_args_pos)) {
+                                        stdio, argv + target_args_pos)) {
         llvm::errs() << llvm::toString(std::move(err)) << '\n';
         return EXIT_FAILURE;
       }
diff --git a/lldb/unittests/Utility/XcodeSDKTest.cpp b/lldb/unittests/Utility/XcodeSDKTest.cpp
index de9f91a..a8a597b 100644
--- a/lldb/unittests/Utility/XcodeSDKTest.cpp
+++ b/lldb/unittests/Utility/XcodeSDKTest.cpp
@@ -27,6 +27,7 @@ TEST(XcodeSDKTest, ParseTest) {
   EXPECT_EQ(XcodeSDK("AppleTVOS.sdk").GetType(), XcodeSDK::AppleTVOS);
   EXPECT_EQ(XcodeSDK("WatchSimulator.sdk").GetType(), XcodeSDK::WatchSimulator);
   EXPECT_EQ(XcodeSDK("WatchOS.sdk").GetType(), XcodeSDK::watchOS);
+  EXPECT_EQ(XcodeSDK("BridgeOS.sdk").GetType(), XcodeSDK::BridgeOS);
   EXPECT_EQ(XcodeSDK("XRSimulator.sdk").GetType(), XcodeSDK::XRSimulator);
   EXPECT_EQ(XcodeSDK("XROS.sdk").GetType(), XcodeSDK::XROS);
   EXPECT_EQ(XcodeSDK("Linux.sdk").GetType(), XcodeSDK::Linux);
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
index 91dcd5c8..f253e02f 100644
--- a/llvm/docs/DirectX/DXILResources.rst
+++ b/llvm/docs/DirectX/DXILResources.rst
@@ -746,3 +746,92 @@ Examples:
        @llvm.dx.resource.load.cbufferrow.8(
            target("dx.CBuffer", target("dx.Layout", {i16}, 2, 0)) %buffer,
            i32 %index)
+
+Resource dimensions
+-------------------
+
+*relevant types: Textures and Buffer*
+
+The `getDimensions`_ DXIL operation returns the dimensions of a texture or
+buffer resource. It returns a `Dimensions`_ type, which is a struct
+containing four ``i32`` values. The values in the struct represent the size
+of each dimension of the resource, and when aplicable the number of array
+elements or number of samples. The mapping is defined in the
+`getDimensions`_ documentation.
+
+The LLVM IR representation of this operation has several forms
+depending on the resource type and the specific ``getDimensions`` query.
+The intrinsics return a scalar or anonymous struct with up to 4 `i32`
+elements. The intrinsic names include suffixes to indicate the number of
+elements in the return value. The suffix `.x` indicates a single `i32`
+return value, `.xy` indicates a struct with two `i32` values, and `.xyz`
+indicates a struct with three `i32` values.
+
+Intrinsics representing queries on multisampled texture resources include
+`.ms.` in their name and their return value includes an additional `i32` for
+the number of samples.
+
+Intrinsics with `mip_level` argument and `.levels.` in their name are used
+for texture resources with multiple MIP levels. Their return
+struct includes an additional `i32` for the number of levels the resource has.
+
+.. code-block:: llvm
+
+   i32 @llvm.dx.resource.getdimensions.x( target("dx.*") handle )
+   {i32, i32} @llvm.dx.resource.getdimensions.xy( target("dx.*") handle )
+   {i32, i32, i32} @llvm.dx.resource.getdimensions.xyz( target("dx.*") handle )
+   {i32, i32} @llvm.dx.resource.getdimensions.levels.x( target("dx.*") handle, i32 mip_level )
+   {i32, i32, i32} @llvm.dx.resource.getdimensions.levels.xy( target("dx.*") handle, i32 mip_level )
+   {i32, i32, i32, i32} @llvm.dx.resource.getdimensions.levels.xyz( target("dx.*") handle, i32 mip_level )
+   {i32, i32, i32} @llvm.dx.resource.getdimensions.ms.xy( target("dx.*") handle )
+   {i32, i32, i32, i32} @llvm.dx.resource.getdimensions.ms.xyz( target("dx.*") handle )
+
+.. list-table:: ``@llvm.dx.resource.getdimensions.*``
+   :header-rows: 1
+
+   * - Argument
+     -
+     - Type
+     - Description
+   * - Return value
+     -
+     - `i32`, `{i32, i32}`, `{i32, i32, i32}`, or `{i32, i32, i32, i32}`
+     - Width, height, and depth of the resource (based on the specific suffix), and a number of levels or samples where aplicable.
+   * - ``%handle``
+     - 0
+     - ``target(dx.*)``
+     - Resource handle
+   * - ``%mip_level``
+     - 1
+     - ``i32``
+     - MIP level for the requested dimensions.
+
+Examples:
+
+.. code-block:: llvm
+
+  ; RWBuffer<float4>
+  %dim = call i32 @llvm.dx.resource.getdimensions.x(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %handle)
+
+  ; Texture2D
+  %0 = call {i32, i32} @llvm.dx.resource.getdimensions.xy(target("dx.Texture", ...) %tex2d)
+  %tex2d_width = extractvalue {i32, i32} %0, 0
+  %tex2d_height = extractvalue {i32, i32} %0, 1
+
+  ; Texture2DArray with levels
+  %1 = call {i32, i32, i32, i32} @llvm.dx.resource.getdimensions.levels.xyz(
+     target("dx.Texture", ...) %tex2darray, i32 1)
+  %tex2darray_width = extractvalue {i32, i32, i32, i32} %1, 0
+  %tex2darray_height = extractvalue {i32, i32, i32, i32} %1, 1
+  %tex2darray_elem_count = extractvalue {i32, i32, i32, i32} %1, 2
+  %tex2darray_levels_count = extractvalue {i32, i32, i32, i32} %1, 3
+
+  ; Texture2DMS
+  %2 = call {i32, i32, i32} @llvm.dx.resource.getdimensions.ms.xy(
+     target("dx.Texture", ...) %tex2dms)
+  %tex2dms_width = extractvalue {i32, i32, i32} %2, 0
+  %tex2dms_height = extractvalue {i32, i32, i32} %2, 1
+  %tex2dms_samples_count = extractvalue {i32, i32, i32} %2, 2
+
+.. _Dimensions: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#resource-operation-return-types
+.. _getDimensions: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#getdimensions
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 9855444..51318c9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -383,7 +383,8 @@ LLVM_ABI LegalizeMutation changeElementCountTo(unsigned TypeIdx,
 
 /// Keep the same scalar or element type as \p TypeIdx, but take the number of
 /// elements from \p Ty.
-LLVM_ABI LegalizeMutation changeElementCountTo(unsigned TypeIdx, LLT Ty);
+LLVM_ABI LegalizeMutation changeElementCountTo(unsigned TypeIdx,
+                                               ElementCount EC);
 
 /// Change the scalar size or element size to have the same scalar size as type
 /// index \p FromIndex. Unlike changeElementTo, this discards pointer types and
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h
index f9070af..eb71e9a 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h
@@ -32,8 +32,9 @@ public:
   struct SymbolAddrs {
     ExecutorAddr Allocator;
     ExecutorAddr Reserve;
-    ExecutorAddr Finalize;
-    ExecutorAddr Deallocate;
+    ExecutorAddr Initialize;
+    ExecutorAddr Deinitialize;
+    ExecutorAddr Release;
   };
 
   /// Create an EPCGenericJITLinkMemoryManager instance from a given set of
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h
index faec25d..fa48480 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h
@@ -31,8 +31,8 @@ public:
   struct SymbolAddrs {
     ExecutorAddr Instance;
     ExecutorAddr Reserve;
-    ExecutorAddr Finalize;
-    ExecutorAddr Deallocate;
+    ExecutorAddr Initialize;
+    ExecutorAddr Release;
     ExecutorAddr RegisterEHFrame;
     ExecutorAddr DeregisterEHFrame;
   };
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
index 99ba456..d68a689 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
@@ -29,8 +29,9 @@ LLVM_ABI extern const char *SimpleExecutorDylibManagerResolveWrapperName;
 
 LLVM_ABI extern const char *SimpleExecutorMemoryManagerInstanceName;
 LLVM_ABI extern const char *SimpleExecutorMemoryManagerReserveWrapperName;
-LLVM_ABI extern const char *SimpleExecutorMemoryManagerFinalizeWrapperName;
-LLVM_ABI extern const char *SimpleExecutorMemoryManagerDeallocateWrapperName;
+LLVM_ABI extern const char *SimpleExecutorMemoryManagerInitializeWrapperName;
+LLVM_ABI extern const char *SimpleExecutorMemoryManagerDeinitializeWrapperName;
+LLVM_ABI extern const char *SimpleExecutorMemoryManagerReleaseWrapperName;
 
 LLVM_ABI extern const char *ExecutorSharedMemoryMapperServiceInstanceName;
 LLVM_ABI extern const char *ExecutorSharedMemoryMapperServiceReserveWrapperName;
@@ -73,9 +74,12 @@ using SPSSimpleExecutorDylibManagerResolveSignature = shared::SPSExpected<
 using SPSSimpleExecutorMemoryManagerReserveSignature =
     shared::SPSExpected<shared::SPSExecutorAddr>(shared::SPSExecutorAddr,
                                                  uint64_t);
-using SPSSimpleExecutorMemoryManagerFinalizeSignature =
-    shared::SPSError(shared::SPSExecutorAddr, shared::SPSFinalizeRequest);
-using SPSSimpleExecutorMemoryManagerDeallocateSignature = shared::SPSError(
+using SPSSimpleExecutorMemoryManagerInitializeSignature =
+    shared::SPSExpected<shared::SPSExecutorAddr>(shared::SPSExecutorAddr,
+                                                 shared::SPSFinalizeRequest);
+using SPSSimpleExecutorMemoryManagerDeinitializeSignature = shared::SPSError(
+    shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
+using SPSSimpleExecutorMemoryManagerReleaseSignature = shared::SPSError(
     shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
 
 // ExecutorSharedMemoryMapperService
@@ -93,6 +97,18 @@ using SPSExecutorSharedMemoryMapperServiceDeinitializeSignature =
 using SPSExecutorSharedMemoryMapperServiceReleaseSignature = shared::SPSError(
     shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
 
+// SimpleNativeMemoryMap APIs.
+using SPSSimpleRemoteMemoryMapReserveSignature =
+    shared::SPSExpected<shared::SPSExecutorAddr>(shared::SPSExecutorAddr,
+                                                 uint64_t);
+using SPSSimpleRemoteMemoryMapInitializeSignature =
+    shared::SPSExpected<shared::SPSExecutorAddr>(shared::SPSExecutorAddr,
+                                                 shared::SPSFinalizeRequest);
+using SPSSimpleRemoteMemoryMapDeinitializeSignature = shared::SPSError(
+    shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
+using SPSSimpleRemoteMemoryMapReleaseSignature = shared::SPSError(
+    shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
+
 using SPSRunAsMainSignature = int64_t(shared::SPSExecutorAddr,
                                       shared::SPSSequence<shared::SPSString>);
 using SPSRunAsVoidFunctionSignature = int32_t(shared::SPSExecutorAddr);
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h
new file mode 100644
index 0000000..644c4f61
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h
@@ -0,0 +1,87 @@
+//===- SimpleRemoteMemoryMapper.h - Remote memory mapper --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple memory mapper that uses EPC calls to implement reserve, initialize,
+// deinitialize, and release.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEMEMORYMAPPER_H
+#define LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEMEMORYMAPPER_H
+
+#include "llvm/ExecutionEngine/Orc/MemoryMapper.h"
+
+namespace llvm::orc {
+
+/// Manages remote memory by making SPS-based EPC calls.
+class LLVM_ABI SimpleRemoteMemoryMapper final : public MemoryMapper {
+public:
+  struct SymbolAddrs {
+    ExecutorAddr Instance;
+    ExecutorAddr Reserve;
+    ExecutorAddr Initialize;
+    ExecutorAddr Deinitialize;
+    ExecutorAddr Release;
+  };
+
+  SimpleRemoteMemoryMapper(ExecutorProcessControl &EPC, SymbolAddrs SAs);
+
+  static Expected<std::unique_ptr<SimpleRemoteMemoryMapper>>
+  Create(ExecutorProcessControl &EPC, SymbolAddrs SAs) {
+    return std::make_unique<SimpleRemoteMemoryMapper>(EPC, SAs);
+  }
+
+  unsigned int getPageSize() override { return EPC.getPageSize(); }
+
+  /// Reserves memory in the remote process by calling a remote
+  /// SPS-wrapper-function with signature
+  ///
+  ///   SPSExpected<SPSExecutorAddr>(uint64_t Size).
+  ///
+  /// On success, returns the base address of the reserved range.
+  void reserve(size_t NumBytes, OnReservedFunction OnReserved) override;
+
+  char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+                size_t ContentSize) override;
+
+  /// Initializes memory within a previously reserved region (applying
+  /// protections and running any finalization actions) by calling a remote
+  /// SPS-wrapper-function with signature
+  ///
+  ///   SPSExpected<SPSExecutorAddr>(SPSFinalizeRequest)
+  ///
+  /// On success, returns a key that can be used to deinitialize the region.
+  void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override;
+
+  /// Given a series of keys from previous initialize calls, deinitialize
+  /// previously initialized memory regions (running dealloc actions, resetting
+  /// permissions and decommitting if possible) by calling a remote
+  /// SPS-wrapper-function with signature
+  ///
+  ///   SPSError(SPSSequence<SPSExecutorAddr> Keys)
+  ///
+  void deinitialize(ArrayRef<ExecutorAddr> Allocations,
+                    OnDeinitializedFunction OnDeInitialized) override;
+
+  /// Given a sequence of base addresses from previous reserve calls, release
+  /// the underlying ranges (deinitializing any remaining regions within them)
+  /// by calling a remote SPS-wrapper-function with signature
+  ///
+  ///   SPSError(SPSSequence<SPSExecutorAddr> Bases)
+  ///
+  void release(ArrayRef<ExecutorAddr> Reservations,
+               OnReleasedFunction OnRelease) override;
+
+private:
+  ExecutorProcessControl &EPC;
+  SymbolAddrs SAs;
+};
+
+} // namespace llvm::orc
+
+#endif // LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEMEMORYMAPPER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h
index 741f203..6224e92 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h
@@ -34,34 +34,65 @@ class LLVM_ABI SimpleExecutorMemoryManager : public ExecutorBootstrapService {
 public:
   virtual ~SimpleExecutorMemoryManager();
 
-  Expected<ExecutorAddr> allocate(uint64_t Size);
-  Error finalize(tpctypes::FinalizeRequest &FR);
-  Error deallocate(const std::vector<ExecutorAddr> &Bases);
+  Expected<ExecutorAddr> reserve(uint64_t Size);
+  Expected<ExecutorAddr> initialize(tpctypes::FinalizeRequest &FR);
+  Error deinitialize(const std::vector<ExecutorAddr> &InitKeys);
+  Error release(const std::vector<ExecutorAddr> &Bases);
 
   Error shutdown() override;
   void addBootstrapSymbols(StringMap<ExecutorAddr> &M) override;
 
 private:
-  struct Allocation {
+  struct RegionInfo {
     size_t Size = 0;
-    std::vector<shared::WrapperFunctionCall> DeallocationActions;
+    std::vector<shared::WrapperFunctionCall> DeallocActions;
   };
 
-  using AllocationsMap = DenseMap<void *, Allocation>;
+  struct SlabInfo {
+    using RegionMap = std::map<ExecutorAddr, RegionInfo>;
+    size_t Size = 0;
+    RegionMap Regions;
+  };
+
+  using SlabMap = std::map<void *, SlabInfo>;
+
+  /// Get a reference to the slab information for the slab containing the given
+  /// address.
+  Expected<SlabInfo &> getSlabInfo(ExecutorAddr A, StringRef Context);
+
+  /// Get a reference to the slab information for the slab *covering* the given
+  /// range. The given range must be a subrange of e(possibly equal to) the
+  /// range of the slab itself.
+  Expected<SlabInfo &> getSlabInfo(ExecutorAddrRange R, StringRef Context);
 
-  Error deallocateImpl(void *Base, Allocation &A);
+  /// Create a RegionInfo for the given range, which must not overlap any
+  /// existing region.
+  Expected<RegionInfo &> createRegionInfo(ExecutorAddrRange R,
+                                          StringRef Context);
+
+  /// Get a reference to the region information for the given address. This
+  /// address must represent the start of an existing initialized region.
+  Expected<RegionInfo &> getRegionInfo(SlabInfo &Slab, ExecutorAddr A,
+                                       StringRef Context);
+
+  /// Get a reference to the region information for the given address. This
+  /// address must represent the start of an existing initialized region.
+  Expected<RegionInfo &> getRegionInfo(ExecutorAddr A, StringRef Context);
 
   static llvm::orc::shared::CWrapperFunctionResult
   reserveWrapper(const char *ArgData, size_t ArgSize);
 
   static llvm::orc::shared::CWrapperFunctionResult
-  finalizeWrapper(const char *ArgData, size_t ArgSize);
+  initializeWrapper(const char *ArgData, size_t ArgSize);
+
+  static llvm::orc::shared::CWrapperFunctionResult
+  deinitializeWrapper(const char *ArgData, size_t ArgSize);
 
   static llvm::orc::shared::CWrapperFunctionResult
-  deallocateWrapper(const char *ArgData, size_t ArgSize);
+  releaseWrapper(const char *ArgData, size_t ArgSize);
 
   std::mutex M;
-  AllocationsMap Allocations;
+  SlabMap Slabs;
 };
 
 } // end namespace rt_bootstrap
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 570d6bc..3b7077c 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -77,6 +77,9 @@ def int_dx_resource_updatecounter
     : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
                             [IntrInaccessibleMemOrArgMemOnly]>;
 
+def int_dx_resource_getdimensions_x
+    : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty], [IntrReadMem]>;
+
 // Cast between target extension handle types and dxil-style opaque handles
 def int_dx_resource_casthandle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>;
 
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 66e24fa..49a182be 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -167,6 +167,9 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]
       : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
                               [IntrInaccessibleMemOrArgMemOnly]>;
 
+  def int_spv_resource_getdimensions_x
+      : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty], [IntrReadMem]>;
+
   def int_spv_resource_getpointer
       : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_any_ty, llvm_i32_ty],
                               [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index 6183a7e..a8b647c 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -405,17 +405,19 @@ def MIPS16_RET_DF : RuntimeLibcall;
 def MIPS16_RET_SC : RuntimeLibcall;
 def MIPS16_RET_SF : RuntimeLibcall;
 
-multiclass LibmLongDoubleLibCall<string libcall_basename = !toupper(NAME),
-                                 string rtbasename = NAME> {
+multiclass LibmLongDoubleLibCall<string libcall_basename = !toupper(!substr(NAME, 0, !sub(!size(NAME), 1))),
+                                 string rtname = NAME> {
+
+
   def NAME#"_f128"
       : RuntimeLibcallImpl<!cast<RuntimeLibcall>(libcall_basename#"_F128"),
-                           !strconcat(rtbasename, "l")>;
+                           rtname>;
   def NAME#"_ppcf128"
       : RuntimeLibcallImpl<!cast<RuntimeLibcall>(libcall_basename#"_PPCF128"),
-                           !strconcat(rtbasename, "l")>;
+                           rtname>;
   def NAME#"_f80"
       : RuntimeLibcallImpl<!cast<RuntimeLibcall>(libcall_basename#"_F80"),
-                           !strconcat(rtbasename, "l")>;
+                           rtname>;
 }
 
 // AArch64 calls
@@ -765,19 +767,19 @@ def fmodl_ppc128 : RuntimeLibcallImpl<REM_PPCF128, "fmodl">;
 
 def fmaf : RuntimeLibcallImpl<FMA_F32>;
 def fma : RuntimeLibcallImpl<FMA_F64>;
-defm fma : LibmLongDoubleLibCall;
+defm fmal : LibmLongDoubleLibCall;
 
 def sqrtf : RuntimeLibcallImpl<SQRT_F32>;
 def sqrt : RuntimeLibcallImpl<SQRT_F64>;
-defm sqrt : LibmLongDoubleLibCall;
+defm sqrtl : LibmLongDoubleLibCall;
 
 def cbrtf : RuntimeLibcallImpl<CBRT_F32>;
 def cbrt : RuntimeLibcallImpl<CBRT_F64>;
-defm cbrt : LibmLongDoubleLibCall;
+defm cbrtl : LibmLongDoubleLibCall;
 
 def logf : RuntimeLibcallImpl<LOG_F32>;
 def log : RuntimeLibcallImpl<LOG_F64>;
-defm log : LibmLongDoubleLibCall;
+defm logl : LibmLongDoubleLibCall;
 
 def __logf_finite : RuntimeLibcallImpl<LOG_FINITE_F32>;
 def __log_finite : RuntimeLibcallImpl<LOG_FINITE_F64>;
@@ -787,7 +789,7 @@ def __logl_finite_ppcf128 : RuntimeLibcallImpl<LOG_FINITE_PPCF128, "__logl_finit
 
 def log2f : RuntimeLibcallImpl<LOG2_F32>;
 def log2 : RuntimeLibcallImpl<LOG2_F64>;
-defm log2 : LibmLongDoubleLibCall;
+defm log2l : LibmLongDoubleLibCall;
 
 def __log2f_finite : RuntimeLibcallImpl<LOG2_FINITE_F32>;
 def __log2_finite : RuntimeLibcallImpl<LOG2_FINITE_F64>;
@@ -797,7 +799,7 @@ def __log2l_finite_ppcf128 : RuntimeLibcallImpl<LOG2_FINITE_PPCF128, "__log2l_fi
 
 def log10f : RuntimeLibcallImpl<LOG10_F32>;
 def log10 : RuntimeLibcallImpl<LOG10_F64>;
-defm log10 : LibmLongDoubleLibCall;
+defm log10l : LibmLongDoubleLibCall;
 
 def __log10f_finite : RuntimeLibcallImpl<LOG10_FINITE_F32>;
 def __log10_finite : RuntimeLibcallImpl<LOG10_FINITE_F64>;
@@ -807,7 +809,7 @@ def __log10l_finite_ppcf128 : RuntimeLibcallImpl<LOG10_FINITE_PPCF128, "__log10l
 
 def expf : RuntimeLibcallImpl<EXP_F32>;
 def exp : RuntimeLibcallImpl<EXP_F64>;
-defm exp : LibmLongDoubleLibCall<"EXP", "exp">;
+defm expl : LibmLongDoubleLibCall<"EXP">;
 
 def __expf_finite : RuntimeLibcallImpl<EXP_FINITE_F32>;
 def __exp_finite : RuntimeLibcallImpl<EXP_FINITE_F64>;
@@ -817,7 +819,7 @@ def __expl_finite_ppcf128 : RuntimeLibcallImpl<EXP_FINITE_PPCF128, "__expl_finit
 
 def exp2f : RuntimeLibcallImpl<EXP2_F32>;
 def exp2 : RuntimeLibcallImpl<EXP2_F64>;
-defm exp2 : LibmLongDoubleLibCall<"EXP2", "exp2">;
+defm exp2l : LibmLongDoubleLibCall<"EXP2">;
 
 def __exp2f_finite : RuntimeLibcallImpl<EXP2_FINITE_F32>;
 def __exp2_finite : RuntimeLibcallImpl<EXP2_FINITE_F64>;
@@ -827,47 +829,47 @@ def __exp2l_finite_ppcf128 : RuntimeLibcallImpl<EXP2_FINITE_PPCF128, "__exp2l_fi
 
 def sinf : RuntimeLibcallImpl<SIN_F32>;
 def sin : RuntimeLibcallImpl<SIN_F64>;
-defm sin : LibmLongDoubleLibCall;
+defm sinl : LibmLongDoubleLibCall;
 
 def cosf : RuntimeLibcallImpl<COS_F32>;
 def cos : RuntimeLibcallImpl<COS_F64>;
-defm cos : LibmLongDoubleLibCall;
+defm cosl : LibmLongDoubleLibCall;
 
 def tanf : RuntimeLibcallImpl<TAN_F32>;
 def tan : RuntimeLibcallImpl<TAN_F64>;
-defm tan : LibmLongDoubleLibCall;
+defm tanl : LibmLongDoubleLibCall;
 
 def sinhf : RuntimeLibcallImpl<SINH_F32>;
 def sinh : RuntimeLibcallImpl<SINH_F64>;
-defm sinh : LibmLongDoubleLibCall;
+defm sinhl : LibmLongDoubleLibCall;
 
 def coshf : RuntimeLibcallImpl<COSH_F32>;
 def cosh : RuntimeLibcallImpl<COSH_F64>;
-defm cosh : LibmLongDoubleLibCall;
+defm coshl : LibmLongDoubleLibCall;
 
 def tanhf : RuntimeLibcallImpl<TANH_F32>;
 def tanh : RuntimeLibcallImpl<TANH_F64>;
-defm tanh : LibmLongDoubleLibCall;
+defm tanhl : LibmLongDoubleLibCall;
 
 def asinf : RuntimeLibcallImpl<ASIN_F32>;
 def asin : RuntimeLibcallImpl<ASIN_F64>;
-defm asin : LibmLongDoubleLibCall;
+defm asinl : LibmLongDoubleLibCall;
 
 def acosf : RuntimeLibcallImpl<ACOS_F32>;
 def acos : RuntimeLibcallImpl<ACOS_F64>;
-defm acos : LibmLongDoubleLibCall;
+defm acosl : LibmLongDoubleLibCall;
 
 def atanf : RuntimeLibcallImpl<ATAN_F32>;
 def atan : RuntimeLibcallImpl<ATAN_F64>;
-defm atan : LibmLongDoubleLibCall;
+defm atanl : LibmLongDoubleLibCall;
 
 def atan2f : RuntimeLibcallImpl<ATAN2_F32>;
 def atan2 : RuntimeLibcallImpl<ATAN2_F64>;
-defm atan2 : LibmLongDoubleLibCall;
+defm atan2l : LibmLongDoubleLibCall;
 
 def powf : RuntimeLibcallImpl<POW_F32>;
 def pow : RuntimeLibcallImpl<POW_F64>;
-defm pow : LibmLongDoubleLibCall;
+defm powl : LibmLongDoubleLibCall;
 
 def __powf_finite : RuntimeLibcallImpl<POW_FINITE_F32>;
 def __pow_finite : RuntimeLibcallImpl<POW_FINITE_F64>;
@@ -877,91 +879,91 @@ def __powl_finite_ppcf128 : RuntimeLibcallImpl<POW_FINITE_PPCF128, "__powl_finit
 
 def ceilf : RuntimeLibcallImpl<CEIL_F32>;
 def ceil : RuntimeLibcallImpl<CEIL_F64>;
-defm ceil : LibmLongDoubleLibCall;
+defm ceill : LibmLongDoubleLibCall;
 
 def truncf : RuntimeLibcallImpl<TRUNC_F32>;
 def trunc : RuntimeLibcallImpl<TRUNC_F64>;
-defm trunc : LibmLongDoubleLibCall;
+defm truncl : LibmLongDoubleLibCall;
 
 def rintf : RuntimeLibcallImpl<RINT_F32>;
 def rint : RuntimeLibcallImpl<RINT_F64>;
-defm rint : LibmLongDoubleLibCall;
+defm rintl : LibmLongDoubleLibCall;
 
 def nearbyintf : RuntimeLibcallImpl<NEARBYINT_F32>;
 def nearbyint : RuntimeLibcallImpl<NEARBYINT_F64>;
-defm nearbyint : LibmLongDoubleLibCall;
+defm nearbyintl : LibmLongDoubleLibCall;
 
 def roundf : RuntimeLibcallImpl<ROUND_F32>;
 def round : RuntimeLibcallImpl<ROUND_F64>;
-defm round : LibmLongDoubleLibCall;
+defm roundl : LibmLongDoubleLibCall;
 
 def roundevenf : RuntimeLibcallImpl<ROUNDEVEN_F32>;
 def roundeven : RuntimeLibcallImpl<ROUNDEVEN_F64>;
-defm roundeven : LibmLongDoubleLibCall;
+defm roundevenl : LibmLongDoubleLibCall;
 
 def floorf : RuntimeLibcallImpl<FLOOR_F32>;
 def floor : RuntimeLibcallImpl<FLOOR_F64>;
-defm floor : LibmLongDoubleLibCall;
+defm floorl : LibmLongDoubleLibCall;
 
 def copysignf : RuntimeLibcallImpl<COPYSIGN_F32>;
 def copysign : RuntimeLibcallImpl<COPYSIGN_F64>;
-defm copysign : LibmLongDoubleLibCall;
+defm copysignl : LibmLongDoubleLibCall;
 
 def fminf : RuntimeLibcallImpl<FMIN_F32>;
 def fmin : RuntimeLibcallImpl<FMIN_F64>;
-defm fmin : LibmLongDoubleLibCall;
+defm fminl : LibmLongDoubleLibCall;
 
 def fmaxf : RuntimeLibcallImpl<FMAX_F32>;
 def fmax : RuntimeLibcallImpl<FMAX_F64>;
-defm fmax : LibmLongDoubleLibCall;
+defm fmaxl : LibmLongDoubleLibCall;
 
 def fminimumf : RuntimeLibcallImpl<FMINIMUM_F32>;
 def fminimum : RuntimeLibcallImpl<FMINIMUM_F64>;
-defm fminimum : LibmLongDoubleLibCall;
+defm fminimuml : LibmLongDoubleLibCall;
 
 def fmaximumf : RuntimeLibcallImpl<FMAXIMUM_F32>;
 def fmaximum : RuntimeLibcallImpl<FMAXIMUM_F64>;
-defm fmaximum : LibmLongDoubleLibCall;
+defm fmaximuml : LibmLongDoubleLibCall;
 
 def fminimum_numf : RuntimeLibcallImpl<FMINIMUM_NUM_F32>;
 def fminimum_num : RuntimeLibcallImpl<FMINIMUM_NUM_F64>;
-defm fminimum_num : LibmLongDoubleLibCall;
+defm fminimum_numl : LibmLongDoubleLibCall;
 
 def fmaximum_numf : RuntimeLibcallImpl<FMAXIMUM_NUM_F32>;
 def fmaximum_num : RuntimeLibcallImpl<FMAXIMUM_NUM_F64>;
-defm fmaximum_num : LibmLongDoubleLibCall;
+defm fmaximum_numl : LibmLongDoubleLibCall;
 
 def lroundf : RuntimeLibcallImpl<LROUND_F32>;
 def lround : RuntimeLibcallImpl<LROUND_F64>;
-defm lround : LibmLongDoubleLibCall;
+defm lroundl : LibmLongDoubleLibCall;
 
 def llroundf : RuntimeLibcallImpl<LLROUND_F32>;
 def llround : RuntimeLibcallImpl<LLROUND_F64>;
-defm llround : LibmLongDoubleLibCall;
+defm llroundl : LibmLongDoubleLibCall;
 
 def lrintf : RuntimeLibcallImpl<LRINT_F32>;
 def lrint : RuntimeLibcallImpl<LRINT_F64>;
-defm lrint : LibmLongDoubleLibCall;
+defm lrintl : LibmLongDoubleLibCall;
 
 def llrintf : RuntimeLibcallImpl<LLRINT_F32>;
 def llrint : RuntimeLibcallImpl<LLRINT_F64>;
-defm llrint : LibmLongDoubleLibCall;
+defm llrintl : LibmLongDoubleLibCall;
 
 def ldexpf : RuntimeLibcallImpl<LDEXP_F32>;
 def ldexp : RuntimeLibcallImpl<LDEXP_F64>;
-defm ldexp : LibmLongDoubleLibCall;
+defm ldexpl : LibmLongDoubleLibCall;
 
 def frexpf : RuntimeLibcallImpl<FREXP_F32>;
 def frexp : RuntimeLibcallImpl<FREXP_F64>;
-defm frexp : LibmLongDoubleLibCall;
+defm frexpl : LibmLongDoubleLibCall;
 
 def sincospif : RuntimeLibcallImpl<SINCOSPI_F32>;
 def sincospi : RuntimeLibcallImpl<SINCOSPI_F64>;
-defm sincospi : LibmLongDoubleLibCall;
+defm sincospil : LibmLongDoubleLibCall;
 
 def modff : RuntimeLibcallImpl<MODF_F32>;
 def modf : RuntimeLibcallImpl<MODF_F64>;
-defm modf : LibmLongDoubleLibCall;
+defm modfl : LibmLongDoubleLibCall;
 
 // Floating point environment
 def fegetenv : RuntimeLibcallImpl<FEGETENV>;
@@ -1033,7 +1035,7 @@ def __sincos_stret : RuntimeLibcallImpl<SINCOS_STRET_F64>;
 
 def sincosf : RuntimeLibcallImpl<SINCOS_F32>;
 def sincos : RuntimeLibcallImpl<SINCOS_F64>;
-defm sincos : LibmLongDoubleLibCall;
+defm sincosl : LibmLongDoubleLibCall;
 
 def bzero : RuntimeLibcallImpl<BZERO>;
 def __bzero : RuntimeLibcallImpl<BZERO>;
@@ -1198,9 +1200,9 @@ defvar SecurityCheckCookieIfWinMSVC =
 
 defvar LibmHasSinCosF32 = LibcallImpls<(add sincosf), hasSinCos>;
 defvar LibmHasSinCosF64 =  LibcallImpls<(add sincos), hasSinCos>;
-defvar LibmHasSinCosF80 = LibcallImpls<(add sincos_f80), hasSinCos>;
-defvar LibmHasSinCosF128 = LibcallImpls<(add sincos_f128), hasSinCos>;
-defvar LibmHasSinCosPPCF128 = LibcallImpls<(add sincos_ppcf128), hasSinCos>;
+defvar LibmHasSinCosF80 = LibcallImpls<(add sincosl_f80), hasSinCos>;
+defvar LibmHasSinCosF128 = LibcallImpls<(add sincosl_f128), hasSinCos>;
+defvar LibmHasSinCosPPCF128 = LibcallImpls<(add sincosl_ppcf128), hasSinCos>;
 
 defvar LibmHasExp10F32 = LibcallImpls<(add exp10f), hasExp10>;
 defvar LibmHasExp10F64 = LibcallImpls<(add exp10), hasExp10>;
@@ -1214,8 +1216,8 @@ defvar DefaultLibmExp10 = [
 
 
 defvar WindowsMathRemovals = [
-  ldexpf, ldexp_f80, ldexp_f128, ldexp_ppcf128,
-  frexpf, frexp_f80, frexp_f128, frexp_ppcf128
+  ldexpf, ldexpl_f80, ldexpl_f128, ldexpl_ppcf128,
+  frexpf, frexpl_f80, frexpl_f128, frexpl_ppcf128
 ];
 
 defvar MostPowI = !listremove(PowiLibcallImpls, [__powitf2_f128, __powitf2_ppc128]);
@@ -1233,11 +1235,11 @@ defvar WinDefaultLibcallImpls = (add WinDefaultLibcallImplsBaseList,
 defvar LibmHasFrexpF32 = LibcallImpls<(add frexpf), isNotOSWindowsOrIsCygwinMinGW>;
 defvar LibmHasLdexpF32 = LibcallImpls<(add ldexpf), isNotOSWindowsOrIsCygwinMinGW>;
 
-defvar LibmHasFrexpF80 = LibcallImpls<(add frexp_f80), isNotOSWindowsOrIsCygwinMinGW>;
-defvar LibmHasLdexpF80 = LibcallImpls<(add ldexp_f80), isNotOSWindowsOrIsCygwinMinGW>;
+defvar LibmHasFrexpF80 = LibcallImpls<(add frexpl_f80), isNotOSWindowsOrIsCygwinMinGW>;
+defvar LibmHasLdexpF80 = LibcallImpls<(add ldexpl_f80), isNotOSWindowsOrIsCygwinMinGW>;
 
-defvar LibmHasFrexpF128 = LibcallImpls<(add frexp_f128), isNotOSWindowsOrIsCygwinMinGW>;
-defvar LibmHasLdexpF128 = LibcallImpls<(add ldexp_f128), isNotOSWindowsOrIsCygwinMinGW>;
+defvar LibmHasFrexpF128 = LibcallImpls<(add frexpl_f128), isNotOSWindowsOrIsCygwinMinGW>;
+defvar LibmHasLdexpF128 = LibcallImpls<(add ldexpl_f128), isNotOSWindowsOrIsCygwinMinGW>;
 
 defvar has__stack_chk_fail = LibcallImpls<(add __stack_chk_fail), isNotOSOpenBSD>;
 defvar has__stack_chk_guard =
@@ -2459,7 +2461,7 @@ defvar X86CommonLibcalls =
        LibcallImpls<(add __bzero), darwinHas__bzero>,
        LibmHasFrexpF32, LibmHasLdexpF32,
        LibmHasFrexpF80, LibmHasLdexpF80,
-       LibcallImpls<(add frexp_f128, ldexp_f128, exp10l_f128), hasExpFrexplLdexplF128>,
+       LibcallImpls<(add frexpl_f128, ldexpl_f128, exp10l_f128), hasExpFrexplLdexplF128>,
        DefaultRuntimeLibcallImpls_f80,
        LibmHasExp10F32, LibmHasExp10F64, LibmHasExp10F80,
        LibcallImpls<(add MostPowI), isNotOSMSVCRT>,
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
index 25c1db9..ded4df4 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
@@ -55,12 +55,10 @@ LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
 }
 
 LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
-                                                         LLT NewEltTy) {
+                                                         ElementCount EC) {
   return [=](const LegalityQuery &Query) {
     const LLT OldTy = Query.Types[TypeIdx];
-    ElementCount NewEltCount = NewEltTy.isVector() ? NewEltTy.getElementCount()
-                                                   : ElementCount::getFixed(1);
-    return std::make_pair(TypeIdx, OldTy.changeElementCount(NewEltCount));
+    return std::make_pair(TypeIdx, OldTy.changeElementCount(EC));
   };
 }
 
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 055fdc6..ca82857 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -818,8 +818,7 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
   if (!DefMI)
     return false;
 
-  const TargetMachine& TM = DefMI->getMF()->getTarget();
-  if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath)
+  if (DefMI->getFlag(MachineInstr::FmNoNans))
     return true;
 
   // If the value is a constant, we can obviously see if it is a NaN or not.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 0f2b518..cb0038c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3971,8 +3971,14 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) {
 }
 
 void SelectionDAGBuilder::visitPtrToAddr(const User &I) {
-  // FIXME: this is not correct for pointers with addr width != pointer width
-  visitPtrToInt(I);
+  SDValue N = getValue(I.getOperand(0));
+  // By definition the type of the ptrtoaddr must be equal to the address type.
+  const auto &TLI = DAG.getTargetLoweringInfo();
+  EVT AddrVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+  // The address width must be smaller or equal to the pointer representation
+  // width, so we lower ptrtoaddr as a truncate (possibly folded to a no-op).
+  N = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), AddrVT, N);
+  setValue(&I, N);
 }
 
 void SelectionDAGBuilder::visitPtrToInt(const User &I) {
diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
index 0ffe3ae..f343925 100644
--- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -56,6 +56,7 @@ add_llvm_component_library(LLVMOrcJIT
   SectCreate.cpp
   SelfExecutorProcessControl.cpp
   SimpleRemoteEPC.cpp
+  SimpleRemoteMemoryMapper.cpp
   Speculation.cpp
   SpeculateAnalyses.cpp
   ExecutorProcessControl.cpp
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
index 50e6b25..0833af7 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
@@ -57,16 +57,17 @@ public:
     std::swap(FR.Actions, G.allocActions());
 
     Parent.EPC.callSPSWrapperAsync<
-        rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>(
-        Parent.SAs.Finalize,
+        rt::SPSSimpleExecutorMemoryManagerInitializeSignature>(
+        Parent.SAs.Initialize,
         [OnFinalize = std::move(OnFinalize), AllocAddr = this->AllocAddr](
-            Error SerializationErr, Error FinalizeErr) mutable {
+            Error SerializationErr,
+            Expected<ExecutorAddr> InitializeKey) mutable {
           // FIXME: Release abandoned alloc.
           if (SerializationErr) {
-            cantFail(std::move(FinalizeErr));
+            cantFail(InitializeKey.takeError());
             OnFinalize(std::move(SerializationErr));
-          } else if (FinalizeErr)
-            OnFinalize(std::move(FinalizeErr));
+          } else if (!InitializeKey)
+            OnFinalize(InitializeKey.takeError());
           else
             OnFinalize(FinalizedAlloc(AllocAddr));
         },
@@ -76,8 +77,8 @@ public:
   void abandon(OnAbandonedFunction OnAbandoned) override {
     // FIXME: Return memory to pool instead.
     Parent.EPC.callSPSWrapperAsync<
-        rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
-        Parent.SAs.Deallocate,
+        rt::SPSSimpleExecutorMemoryManagerReleaseSignature>(
+        Parent.SAs.Release,
         [OnAbandoned = std::move(OnAbandoned)](Error SerializationErr,
                                                Error DeallocateErr) mutable {
           if (SerializationErr) {
@@ -123,9 +124,8 @@ void EPCGenericJITLinkMemoryManager::allocate(const JITLinkDylib *JD,
 
 void EPCGenericJITLinkMemoryManager::deallocate(
     std::vector<FinalizedAlloc> Allocs, OnDeallocatedFunction OnDeallocated) {
-  EPC.callSPSWrapperAsync<
-      rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
-      SAs.Deallocate,
+  EPC.callSPSWrapperAsync<rt::SPSSimpleExecutorMemoryManagerReleaseSignature>(
+      SAs.Release,
       [OnDeallocated = std::move(OnDeallocated)](Error SerErr,
                                                  Error DeallocErr) mutable {
         if (SerErr) {
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
index fec7062..cc72488 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
@@ -25,9 +25,9 @@ EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols(
   if (auto Err = EPC.getBootstrapSymbols(
           {{SAs.Instance, rt::SimpleExecutorMemoryManagerInstanceName},
            {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
-           {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName},
-           {SAs.Deallocate,
-            rt::SimpleExecutorMemoryManagerDeallocateWrapperName},
+           {SAs.Initialize,
+            rt::SimpleExecutorMemoryManagerInitializeWrapperName},
+           {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName},
            {SAs.RegisterEHFrame, rt::RegisterEHFrameSectionAllocActionName},
            {SAs.DeregisterEHFrame,
             rt::DeregisterEHFrameSectionAllocActionName}}))
@@ -48,7 +48,7 @@ EPCGenericRTDyldMemoryManager::~EPCGenericRTDyldMemoryManager() {
 
   Error Err = Error::success();
   if (auto Err2 = EPC.callSPSWrapper<
-                  rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
+                  rt::SPSSimpleExecutorMemoryManagerReleaseSignature>(
           SAs.Reserve, Err, SAs.Instance, FinalizedAllocs)) {
     // FIXME: Report errors through EPC once that functionality is available.
     logAllUnhandledErrors(std::move(Err2), errs(), "");
@@ -267,10 +267,10 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) {
 
     // We'll also need to make an extra allocation for the eh-frame wrapper call
     // arguments.
-    Error FinalizeErr = Error::success();
+    Expected<ExecutorAddr> InitializeKey((ExecutorAddr()));
     if (auto Err = EPC.callSPSWrapper<
-                   rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>(
-            SAs.Finalize, FinalizeErr, SAs.Instance, std::move(FR))) {
+                   rt::SPSSimpleExecutorMemoryManagerInitializeSignature>(
+            SAs.Initialize, InitializeKey, SAs.Instance, std::move(FR))) {
       std::lock_guard<std::mutex> Lock(M);
       this->ErrMsg = toString(std::move(Err));
       dbgs() << "Serialization error: " << this->ErrMsg << "\n";
@@ -278,9 +278,9 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) {
         *ErrMsg = this->ErrMsg;
       return true;
     }
-    if (FinalizeErr) {
+    if (!InitializeKey) {
       std::lock_guard<std::mutex> Lock(M);
-      this->ErrMsg = toString(std::move(FinalizeErr));
+      this->ErrMsg = toString(InitializeKey.takeError());
       dbgs() << "Finalization error: " << this->ErrMsg << "\n";
       if (ErrMsg)
         *ErrMsg = this->ErrMsg;
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
index 26e8f53..cc99d3c 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
@@ -23,10 +23,12 @@ const char *SimpleExecutorMemoryManagerInstanceName =
     "__llvm_orc_SimpleExecutorMemoryManager_Instance";
 const char *SimpleExecutorMemoryManagerReserveWrapperName =
     "__llvm_orc_SimpleExecutorMemoryManager_reserve_wrapper";
-const char *SimpleExecutorMemoryManagerFinalizeWrapperName =
-    "__llvm_orc_SimpleExecutorMemoryManager_finalize_wrapper";
-const char *SimpleExecutorMemoryManagerDeallocateWrapperName =
-    "__llvm_orc_SimpleExecutorMemoryManager_deallocate_wrapper";
+const char *SimpleExecutorMemoryManagerInitializeWrapperName =
+    "__llvm_orc_SimpleExecutorMemoryManager_initialize_wrapper";
+const char *SimpleExecutorMemoryManagerDeinitializeWrapperName =
+    "__llvm_orc_SimpleExecutorMemoryManager_deinitialize_wrapper";
+const char *SimpleExecutorMemoryManagerReleaseWrapperName =
+    "__llvm_orc_SimpleExecutorMemoryManager_release_wrapper";
 
 const char *ExecutorSharedMemoryMapperServiceInstanceName =
     "__llvm_orc_ExecutorSharedMemoryMapperService_Instance";
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
index 87d7578..dec1df7 100644
--- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
@@ -216,9 +216,9 @@ SimpleRemoteEPC::createDefaultMemoryManager(SimpleRemoteEPC &SREPC) {
   if (auto Err = SREPC.getBootstrapSymbols(
           {{SAs.Allocator, rt::SimpleExecutorMemoryManagerInstanceName},
            {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
-           {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName},
-           {SAs.Deallocate,
-            rt::SimpleExecutorMemoryManagerDeallocateWrapperName}}))
+           {SAs.Initialize,
+            rt::SimpleExecutorMemoryManagerInitializeWrapperName},
+           {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}}))
     return std::move(Err);
 
   return std::make_unique<EPCGenericJITLinkMemoryManager>(SREPC, SAs);
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp
new file mode 100644
index 0000000..b82de3f
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp
@@ -0,0 +1,104 @@
+//===---- SimpleRemoteMemoryMapper.cpp - Remote memory mapper ----*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h"
+
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+
+namespace llvm::orc {
+
+SimpleRemoteMemoryMapper::SimpleRemoteMemoryMapper(ExecutorProcessControl &EPC,
+                                                   SymbolAddrs SAs)
+    : EPC(EPC), SAs(SAs) {}
+
+void SimpleRemoteMemoryMapper::reserve(size_t NumBytes,
+                                       OnReservedFunction OnReserved) {
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReserveSignature>(
+      SAs.Reserve,
+      [NumBytes, OnReserved = std::move(OnReserved)](
+          Error SerializationErr, Expected<ExecutorAddr> Result) mutable {
+        if (SerializationErr) {
+          cantFail(Result.takeError());
+          return OnReserved(std::move(SerializationErr));
+        }
+
+        if (Result)
+          OnReserved(ExecutorAddrRange(*Result, NumBytes));
+        else
+          OnReserved(Result.takeError());
+      },
+      SAs.Instance, static_cast<uint64_t>(NumBytes));
+}
+
+char *SimpleRemoteMemoryMapper::prepare(jitlink::LinkGraph &G,
+                                        ExecutorAddr Addr, size_t ContentSize) {
+  return G.allocateBuffer(ContentSize).data();
+}
+
+void SimpleRemoteMemoryMapper::initialize(MemoryMapper::AllocInfo &AI,
+                                          OnInitializedFunction OnInitialized) {
+
+  tpctypes::FinalizeRequest FR;
+
+  std::swap(FR.Actions, AI.Actions);
+  FR.Segments.reserve(AI.Segments.size());
+
+  for (auto Seg : AI.Segments)
+    FR.Segments.push_back({Seg.AG, AI.MappingBase + Seg.Offset,
+                           Seg.ContentSize + Seg.ZeroFillSize,
+                           ArrayRef<char>(Seg.WorkingMem, Seg.ContentSize)});
+
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapInitializeSignature>(
+      SAs.Initialize,
+      [OnInitialized = std::move(OnInitialized)](
+          Error SerializationErr, Expected<ExecutorAddr> Result) mutable {
+        if (SerializationErr) {
+          cantFail(Result.takeError());
+          return OnInitialized(std::move(SerializationErr));
+        }
+
+        OnInitialized(std::move(Result));
+      },
+      SAs.Instance, std::move(FR));
+}
+
+void SimpleRemoteMemoryMapper::deinitialize(
+    ArrayRef<ExecutorAddr> Allocations,
+    MemoryMapper::OnDeinitializedFunction OnDeinitialized) {
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>(
+      SAs.Deinitialize,
+      [OnDeinitialized = std::move(OnDeinitialized)](Error SerializationErr,
+                                                     Error Result) mutable {
+        if (SerializationErr) {
+          cantFail(std::move(Result));
+          return OnDeinitialized(std::move(SerializationErr));
+        }
+
+        OnDeinitialized(std::move(Result));
+      },
+      SAs.Instance, Allocations);
+}
+
+void SimpleRemoteMemoryMapper::release(ArrayRef<ExecutorAddr> Bases,
+                                       OnReleasedFunction OnReleased) {
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReleaseSignature>(
+      SAs.Release,
+      [OnReleased = std::move(OnReleased)](Error SerializationErr,
+                                           Error Result) mutable {
+        if (SerializationErr) {
+          cantFail(std::move(Result));
+          return OnReleased(std::move(SerializationErr));
+        }
+
+        return OnReleased(std::move(Result));
+      },
+      SAs.Instance, Bases);
+}
+
+} // namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
index 3cdffb8..fe881a1 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h"
 
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 #include "llvm/Support/FormatVariadic.h"
 
@@ -18,166 +19,167 @@ namespace orc {
 namespace rt_bootstrap {
 
 SimpleExecutorMemoryManager::~SimpleExecutorMemoryManager() {
-  assert(Allocations.empty() && "shutdown not called?");
+  assert(Slabs.empty() && "shutdown not called?");
 }
 
-Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) {
+Expected<ExecutorAddr> SimpleExecutorMemoryManager::reserve(uint64_t Size) {
   std::error_code EC;
   auto MB = sys::Memory::allocateMappedMemory(
       Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
   if (EC)
     return errorCodeToError(EC);
   std::lock_guard<std::mutex> Lock(M);
-  assert(!Allocations.count(MB.base()) && "Duplicate allocation addr");
-  Allocations[MB.base()].Size = Size;
+  assert(!Slabs.count(MB.base()) && "Duplicate allocation addr");
+  Slabs[MB.base()].Size = Size;
   return ExecutorAddr::fromPtr(MB.base());
 }
 
-Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
-  ExecutorAddr Base(~0ULL);
+Expected<ExecutorAddr>
+SimpleExecutorMemoryManager::initialize(tpctypes::FinalizeRequest &FR) {
   std::vector<shared::WrapperFunctionCall> DeallocationActions;
-  size_t SuccessfulFinalizationActions = 0;
 
   if (FR.Segments.empty()) {
-    // NOTE: Finalizing nothing is currently a no-op. Should it be an error?
     if (FR.Actions.empty())
-      return Error::success();
+      return make_error<StringError>("Finalization request is empty",
+                                     inconvertibleErrorCode());
     else
       return make_error<StringError>("Finalization actions attached to empty "
                                      "finalization request",
                                      inconvertibleErrorCode());
   }
 
-  for (auto &Seg : FR.Segments)
-    Base = std::min(Base, Seg.Addr);
-
-  for (auto &ActPair : FR.Actions)
-    if (ActPair.Dealloc)
-      DeallocationActions.push_back(ActPair.Dealloc);
-
-  // Get the Allocation for this finalization.
-  size_t AllocSize = 0;
-  {
-    std::lock_guard<std::mutex> Lock(M);
-    auto I = Allocations.find(Base.toPtr<void *>());
-    if (I == Allocations.end())
-      return make_error<StringError>("Attempt to finalize unrecognized "
-                                     "allocation " +
-                                         formatv("{0:x}", Base.getValue()),
-                                     inconvertibleErrorCode());
-    AllocSize = I->second.Size;
-    I->second.DeallocationActions = std::move(DeallocationActions);
-  }
-  ExecutorAddr AllocEnd = Base + ExecutorAddrDiff(AllocSize);
-
-  // Bail-out function: this will run deallocation actions corresponding to any
-  // completed finalization actions, then deallocate memory.
-  auto BailOut = [&](Error Err) {
-    std::pair<void *, Allocation> AllocToDestroy;
-
-    // Get allocation to destroy.
-    {
-      std::lock_guard<std::mutex> Lock(M);
-      auto I = Allocations.find(Base.toPtr<void *>());
-
-      // Check for missing allocation (effective a double free).
-      if (I == Allocations.end())
-        return joinErrors(
-            std::move(Err),
-            make_error<StringError>("No allocation entry found "
-                                    "for " +
-                                        formatv("{0:x}", Base.getValue()),
-                                    inconvertibleErrorCode()));
-      AllocToDestroy = std::move(*I);
-      Allocations.erase(I);
-    }
+  ExecutorAddrRange RR(FR.Segments.front().Addr, FR.Segments.front().Addr);
 
-    // Run deallocation actions for all completed finalization actions.
-    while (SuccessfulFinalizationActions)
-      Err =
-          joinErrors(std::move(Err), FR.Actions[--SuccessfulFinalizationActions]
-                                         .Dealloc.runWithSPSRetErrorMerged());
-
-    // Deallocate memory.
-    sys::MemoryBlock MB(AllocToDestroy.first, AllocToDestroy.second.Size);
-    if (auto EC = sys::Memory::releaseMappedMemory(MB))
-      Err = joinErrors(std::move(Err), errorCodeToError(EC));
-
-    return Err;
-  };
+  std::vector<sys::MemoryBlock> MBsToReset;
+  auto ResetMBs = make_scope_exit([&]() {
+    for (auto &MB : MBsToReset)
+      sys::Memory::protectMappedMemory(MB, sys::Memory::MF_READ |
+                                               sys::Memory::MF_WRITE);
+    sys::Memory::InvalidateInstructionCache(RR.Start.toPtr<void *>(),
+                                            RR.size());
+  });
 
   // Copy content and apply permissions.
   for (auto &Seg : FR.Segments) {
+    RR.Start = std::min(RR.Start, Seg.Addr);
+    RR.End = std::max(RR.End, Seg.Addr + Seg.Size);
 
     // Check segment ranges.
     if (LLVM_UNLIKELY(Seg.Size < Seg.Content.size()))
-      return BailOut(make_error<StringError>(
+      return make_error<StringError>(
           formatv("Segment {0:x} content size ({1:x} bytes) "
                   "exceeds segment size ({2:x} bytes)",
                   Seg.Addr.getValue(), Seg.Content.size(), Seg.Size),
-          inconvertibleErrorCode()));
+          inconvertibleErrorCode());
     ExecutorAddr SegEnd = Seg.Addr + ExecutorAddrDiff(Seg.Size);
-    if (LLVM_UNLIKELY(Seg.Addr < Base || SegEnd > AllocEnd))
-      return BailOut(make_error<StringError>(
+    if (LLVM_UNLIKELY(Seg.Addr < RR.Start || SegEnd > RR.End))
+      return make_error<StringError>(
           formatv("Segment {0:x} -- {1:x} crosses boundary of "
                   "allocation {2:x} -- {3:x}",
-                  Seg.Addr.getValue(), SegEnd.getValue(), Base.getValue(),
-                  AllocEnd.getValue()),
-          inconvertibleErrorCode()));
+                  Seg.Addr, SegEnd, RR.Start, RR.End),
+          inconvertibleErrorCode());
 
     char *Mem = Seg.Addr.toPtr<char *>();
     if (!Seg.Content.empty())
       memcpy(Mem, Seg.Content.data(), Seg.Content.size());
     memset(Mem + Seg.Content.size(), 0, Seg.Size - Seg.Content.size());
     assert(Seg.Size <= std::numeric_limits<size_t>::max());
+
+    sys::MemoryBlock MB(Mem, Seg.Size);
     if (auto EC = sys::Memory::protectMappedMemory(
-            {Mem, static_cast<size_t>(Seg.Size)},
-            toSysMemoryProtectionFlags(Seg.RAG.Prot)))
-      return BailOut(errorCodeToError(EC));
+            MB, toSysMemoryProtectionFlags(Seg.RAG.Prot)))
+      return errorCodeToError(EC);
+
+    MBsToReset.push_back(MB);
+
     if ((Seg.RAG.Prot & MemProt::Exec) == MemProt::Exec)
       sys::Memory::InvalidateInstructionCache(Mem, Seg.Size);
   }
 
-  // Run finalization actions.
-  for (auto &ActPair : FR.Actions) {
-    if (auto Err = ActPair.Finalize.runWithSPSRetErrorMerged())
-      return BailOut(std::move(Err));
-    ++SuccessfulFinalizationActions;
+  auto DeallocActions = runFinalizeActions(FR.Actions);
+  if (!DeallocActions)
+    return DeallocActions.takeError();
+
+  {
+    std::lock_guard<std::mutex> Lock(M);
+    auto Region = createRegionInfo(RR, "In initialize");
+    if (!Region)
+      return Region.takeError();
+    Region->DeallocActions = std::move(*DeallocActions);
   }
 
-  return Error::success();
+  // Successful initialization.
+  ResetMBs.release();
+
+  return RR.Start;
 }
 
-Error SimpleExecutorMemoryManager::deallocate(
-    const std::vector<ExecutorAddr> &Bases) {
-  std::vector<std::pair<void *, Allocation>> AllocPairs;
-  AllocPairs.reserve(Bases.size());
+Error SimpleExecutorMemoryManager::deinitialize(
+    const std::vector<ExecutorAddr> &InitKeys) {
+  Error Err = Error::success();
 
-  // Get allocation to destroy.
+  for (auto &KeyAddr : llvm::reverse(InitKeys)) {
+    std::vector<shared::WrapperFunctionCall> DeallocActions;
+    {
+      std::scoped_lock<std::mutex> Lock(M);
+      auto Slab = getSlabInfo(KeyAddr, "In deinitialize");
+      if (!Slab) {
+        Err = joinErrors(std::move(Err), Slab.takeError());
+        continue;
+      }
+
+      auto RI = getRegionInfo(*Slab, KeyAddr, "In deinitialize");
+      if (!RI) {
+        Err = joinErrors(std::move(Err), RI.takeError());
+        continue;
+      }
+
+      DeallocActions = std::move(RI->DeallocActions);
+    }
+
+    Err = joinErrors(std::move(Err),
+                     runDeallocActions(std::move(DeallocActions)));
+  }
+
+  return Err;
+}
+
+Error SimpleExecutorMemoryManager::release(
+    const std::vector<ExecutorAddr> &Bases) {
   Error Err = Error::success();
-  {
-    std::lock_guard<std::mutex> Lock(M);
-    for (auto &Base : Bases) {
-      auto I = Allocations.find(Base.toPtr<void *>());
-
-      // Check for missing allocation (effective a double free).
-      if (I != Allocations.end()) {
-        AllocPairs.push_back(std::move(*I));
-        Allocations.erase(I);
-      } else
+
+  // TODO: Prohibit new initializations within the slabs being removed?
+  for (auto &Base : llvm::reverse(Bases)) {
+    std::vector<shared::WrapperFunctionCall> DeallocActions;
+    sys::MemoryBlock MB;
+
+    {
+      std::scoped_lock<std::mutex> Lock(M);
+
+      auto SlabI = Slabs.find(Base.toPtr<void *>());
+      if (SlabI == Slabs.end()) {
         Err = joinErrors(
             std::move(Err),
-            make_error<StringError>("No allocation entry found "
-                                    "for " +
-                                        formatv("{0:x}", Base.getValue()),
+            make_error<StringError>("In release, " + formatv("{0:x}", Base) +
+                                        " is not part of any reserved "
+                                        "address range",
                                     inconvertibleErrorCode()));
+        continue;
+      }
+
+      auto &Slab = SlabI->second;
+
+      for (auto &[Addr, Region] : Slab.Regions)
+        llvm::copy(Region.DeallocActions, back_inserter(DeallocActions));
+
+      MB = {Base.toPtr<void *>(), Slab.Size};
+
+      Slabs.erase(SlabI);
     }
-  }
 
-  while (!AllocPairs.empty()) {
-    auto &P = AllocPairs.back();
-    Err = joinErrors(std::move(Err), deallocateImpl(P.first, P.second));
-    AllocPairs.pop_back();
+    Err = joinErrors(std::move(Err), runDeallocActions(DeallocActions));
+    if (auto EC = sys::Memory::releaseMappedMemory(MB))
+      Err = joinErrors(std::move(Err), errorCodeToError(EC));
   }
 
   return Err;
@@ -185,16 +187,15 @@ Error SimpleExecutorMemoryManager::deallocate(
 
 Error SimpleExecutorMemoryManager::shutdown() {
 
-  AllocationsMap AM;
+  // TODO: Prevent new allocations during shutdown.
+  std::vector<ExecutorAddr> Bases;
   {
-    std::lock_guard<std::mutex> Lock(M);
-    AM = std::move(Allocations);
+    std::scoped_lock<std::mutex> Lock(M);
+    for (auto &[Base, Slab] : Slabs)
+      Bases.push_back(ExecutorAddr::fromPtr(Base));
   }
 
-  Error Err = Error::success();
-  for (auto &KV : AM)
-    Err = joinErrors(std::move(Err), deallocateImpl(KV.first, KV.second));
-  return Err;
+  return release(Bases);
 }
 
 void SimpleExecutorMemoryManager::addBootstrapSymbols(
@@ -202,58 +203,150 @@ void SimpleExecutorMemoryManager::addBootstrapSymbols(
   M[rt::SimpleExecutorMemoryManagerInstanceName] = ExecutorAddr::fromPtr(this);
   M[rt::SimpleExecutorMemoryManagerReserveWrapperName] =
       ExecutorAddr::fromPtr(&reserveWrapper);
-  M[rt::SimpleExecutorMemoryManagerFinalizeWrapperName] =
-      ExecutorAddr::fromPtr(&finalizeWrapper);
-  M[rt::SimpleExecutorMemoryManagerDeallocateWrapperName] =
-      ExecutorAddr::fromPtr(&deallocateWrapper);
+  M[rt::SimpleExecutorMemoryManagerInitializeWrapperName] =
+      ExecutorAddr::fromPtr(&initializeWrapper);
+  M[rt::SimpleExecutorMemoryManagerDeinitializeWrapperName] =
+      ExecutorAddr::fromPtr(&deinitializeWrapper);
+  M[rt::SimpleExecutorMemoryManagerReleaseWrapperName] =
+      ExecutorAddr::fromPtr(&releaseWrapper);
 }
 
-Error SimpleExecutorMemoryManager::deallocateImpl(void *Base, Allocation &A) {
-  Error Err = Error::success();
+Expected<SimpleExecutorMemoryManager::SlabInfo &>
+SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddr A, StringRef Context) {
+  auto MakeBadSlabError = [&]() {
+    return make_error<StringError>(
+        Context + ", address " + formatv("{0:x}", A) +
+            " is not part of any reserved address range",
+        inconvertibleErrorCode());
+  };
 
-  while (!A.DeallocationActions.empty()) {
-    Err = joinErrors(std::move(Err),
-                     A.DeallocationActions.back().runWithSPSRetErrorMerged());
-    A.DeallocationActions.pop_back();
+  auto I = Slabs.upper_bound(A.toPtr<void *>());
+  if (I == Slabs.begin())
+    return MakeBadSlabError();
+  --I;
+  if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size)
+           .contains(A))
+    return MakeBadSlabError();
+
+  return I->second;
+}
+
+Expected<SimpleExecutorMemoryManager::SlabInfo &>
+SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddrRange R,
+                                         StringRef Context) {
+  auto MakeBadSlabError = [&]() {
+    return make_error<StringError>(
+        Context + ", range " + formatv("{0:x}", R) +
+            " is not part of any reserved address range",
+        inconvertibleErrorCode());
+  };
+
+  auto I = Slabs.upper_bound(R.Start.toPtr<void *>());
+  if (I == Slabs.begin())
+    return MakeBadSlabError();
+  --I;
+  if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size)
+           .contains(R))
+    return MakeBadSlabError();
+
+  return I->second;
+}
+
+Expected<SimpleExecutorMemoryManager::RegionInfo &>
+SimpleExecutorMemoryManager::createRegionInfo(ExecutorAddrRange R,
+                                              StringRef Context) {
+
+  auto Slab = getSlabInfo(R, Context);
+  if (!Slab)
+    return Slab.takeError();
+
+  auto MakeBadRegionError = [&](ExecutorAddrRange Other, bool Prev) {
+    return make_error<StringError>(Context + ", region " + formatv("{0:x}", R) +
+                                       " overlaps " +
+                                       (Prev ? "previous" : "following") +
+                                       " region " + formatv("{0:x}", Other),
+                                   inconvertibleErrorCode());
+  };
+
+  auto I = Slab->Regions.upper_bound(R.Start);
+  if (I != Slab->Regions.begin()) {
+    auto J = std::prev(I);
+    ExecutorAddrRange PrevRange(J->first, J->second.Size);
+    if (PrevRange.overlaps(R))
+      return MakeBadRegionError(PrevRange, true);
+  }
+  if (I != Slab->Regions.end()) {
+    ExecutorAddrRange NextRange(I->first, I->second.Size);
+    if (NextRange.overlaps(R))
+      return MakeBadRegionError(NextRange, false);
   }
 
-  sys::MemoryBlock MB(Base, A.Size);
-  if (auto EC = sys::Memory::releaseMappedMemory(MB))
-    Err = joinErrors(std::move(Err), errorCodeToError(EC));
+  auto &RInfo = Slab->Regions[R.Start];
+  RInfo.Size = R.size();
+  return RInfo;
+}
 
-  return Err;
+Expected<SimpleExecutorMemoryManager::RegionInfo &>
+SimpleExecutorMemoryManager::getRegionInfo(SlabInfo &Slab, ExecutorAddr A,
+                                           StringRef Context) {
+  auto I = Slab.Regions.find(A);
+  if (I == Slab.Regions.end())
+    return make_error<StringError>(
+        Context + ", address " + formatv("{0:x}", A) +
+            " does not correspond to the start of any initialized region",
+        inconvertibleErrorCode());
+
+  return I->second;
+}
+
+Expected<SimpleExecutorMemoryManager::RegionInfo &>
+SimpleExecutorMemoryManager::getRegionInfo(ExecutorAddr A, StringRef Context) {
+  auto Slab = getSlabInfo(A, Context);
+  if (!Slab)
+    return Slab.takeError();
+
+  return getRegionInfo(*Slab, A, Context);
 }
 
 llvm::orc::shared::CWrapperFunctionResult
 SimpleExecutorMemoryManager::reserveWrapper(const char *ArgData,
                                             size_t ArgSize) {
-  return shared::WrapperFunction<
-             rt::SPSSimpleExecutorMemoryManagerReserveSignature>::
+  return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReserveSignature>::
       handle(ArgData, ArgSize,
              shared::makeMethodWrapperHandler(
-                 &SimpleExecutorMemoryManager::allocate))
+                 &SimpleExecutorMemoryManager::reserve))
+          .release();
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+SimpleExecutorMemoryManager::initializeWrapper(const char *ArgData,
+                                               size_t ArgSize) {
+  return shared::
+      WrapperFunction<rt::SPSSimpleRemoteMemoryMapInitializeSignature>::handle(
+             ArgData, ArgSize,
+             shared::makeMethodWrapperHandler(
+                 &SimpleExecutorMemoryManager::initialize))
           .release();
 }
 
 llvm::orc::shared::CWrapperFunctionResult
-SimpleExecutorMemoryManager::finalizeWrapper(const char *ArgData,
-                                             size_t ArgSize) {
+SimpleExecutorMemoryManager::deinitializeWrapper(const char *ArgData,
+                                                 size_t ArgSize) {
   return shared::WrapperFunction<
-             rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>::
+             rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>::
       handle(ArgData, ArgSize,
              shared::makeMethodWrapperHandler(
-                 &SimpleExecutorMemoryManager::finalize))
+                 &SimpleExecutorMemoryManager::deinitialize))
           .release();
 }
 
 llvm::orc::shared::CWrapperFunctionResult
-SimpleExecutorMemoryManager::deallocateWrapper(const char *ArgData,
-                                               size_t ArgSize) {
-  return shared::WrapperFunction<
-             rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>::
+SimpleExecutorMemoryManager::releaseWrapper(const char *ArgData,
+                                            size_t ArgSize) {
+  return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReleaseSignature>::
       handle(ArgData, ArgSize,
              shared::makeMethodWrapperHandler(
-                 &SimpleExecutorMemoryManager::deallocate))
+                 &SimpleExecutorMemoryManager::release))
           .release();
 }
 
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 228114c..44c4830 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -57,6 +57,7 @@ def ResBindTy : DXILOpParamType;
 def ResPropsTy : DXILOpParamType;
 def SplitDoubleTy : DXILOpParamType;
 def BinaryWithCarryTy : DXILOpParamType;
+def DimensionsTy : DXILOpParamType;
 
 class DXILOpClass;
 
@@ -901,6 +902,13 @@ def CheckAccessFullyMapped : DXILOp<71, checkAccessFullyMapped> {
   let attributes = [Attributes<DXIL1_0, [ReadOnly]>];
 }
 
+def GetDimensions : DXILOp<72, getDimensions> {
+  let Doc = "gets the dimensions of a buffer or texture";
+  let arguments = [HandleTy, Int32Ty];
+  let result = DimensionsTy;
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
 def Barrier : DXILOp<80, barrier> {
   let Doc = "inserts a memory barrier in the shader";
   let intrinsics = [
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
index 1aed8f9..944b2e6 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -261,6 +261,12 @@ static StructType *getBinaryWithCarryType(LLVMContext &Context) {
   return StructType::create({Int32Ty, Int1Ty}, "dx.types.i32c");
 }
 
+static StructType *getDimensionsType(LLVMContext &Ctx) {
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+  return getOrCreateStructType("dx.types.Dimensions",
+                               {Int32Ty, Int32Ty, Int32Ty, Int32Ty}, Ctx);
+}
+
 static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
                                     Type *OverloadTy) {
   switch (Kind) {
@@ -318,6 +324,8 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
     return getSplitDoubleType(Ctx);
   case OpParamType::BinaryWithCarryTy:
     return getBinaryWithCarryType(Ctx);
+  case OpParamType::DimensionsTy:
+    return getDimensionsType(Ctx);
   }
   llvm_unreachable("Invalid parameter kind");
   return nullptr;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 610d8b6..e46a393 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -627,6 +627,28 @@ public:
     });
   }
 
+  [[nodiscard]] bool lowerGetDimensionsX(Function &F) {
+    IRBuilder<> &IRB = OpBuilder.getIRB();
+    Type *Int32Ty = IRB.getInt32Ty();
+
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
+      IRB.SetInsertPoint(CI);
+      Value *Handle =
+          createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
+      Value *Undef = UndefValue::get(Int32Ty);
+
+      Expected<CallInst *> OpCall = OpBuilder.tryCreateOp(
+          OpCode::GetDimensions, {Handle, Undef}, CI->getName(), Int32Ty);
+      if (Error E = OpCall.takeError())
+        return E;
+      Value *Dim = IRB.CreateExtractValue(*OpCall, 0);
+
+      CI->replaceAllUsesWith(Dim);
+      CI->eraseFromParent();
+      return Error::success();
+    });
+  }
+
   [[nodiscard]] bool lowerGetPointer(Function &F) {
     // These should have already been handled in DXILResourceAccess, so we can
     // just clean up the dead prototype.
@@ -934,6 +956,9 @@ public:
       case Intrinsic::dx_resource_updatecounter:
         HasErrors |= lowerUpdateCounter(F);
         break;
+      case Intrinsic::dx_resource_getdimensions_x:
+        HasErrors |= lowerGetDimensionsX(F);
+        break;
       case Intrinsic::ctpop:
         HasErrors |= lowerCtpopToCountBits(F);
         break;
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 5f180d6..3bd6ed4 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -66,6 +66,10 @@ public:
 
   void remapInstruction(MCInst &Instr) const;
 
+  Expected<bool> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
+                               ArrayRef<uint8_t> Bytes,
+                               uint64_t Address) const override;
+
 private:
   bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
                   uint64_t &BytesToSkip, raw_ostream &CS) const;
@@ -567,6 +571,18 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
   return Result;
 }
 
+Expected<bool> HexagonDisassembler::onSymbolStart(SymbolInfoTy &Symbol,
+                                                  uint64_t &Size,
+                                                  ArrayRef<uint8_t> Bytes,
+                                                  uint64_t Address) const {
+  // At the start of a symbol, force a fresh packet by resetting any
+  // in-progress bundle state. This prevents packets from straddling label
+  // boundaries when data (e.g. jump tables) appears in between.
+  Size = 0;
+  resetBundle();
+  return true;
+}
+
 static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
                                         ArrayRef<MCPhysReg> Table) {
   if (RegNo < Table.size()) {
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 662d3f6..b1794b7 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -717,6 +717,18 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
       .clampScalar(0, sXLen, sXLen)
       .lower();
 
+  LegalityPredicate InsertVectorEltPred = [=](const LegalityQuery &Query) {
+    LLT VecTy = Query.Types[0];
+    LLT EltTy = Query.Types[1];
+    return VecTy.getElementType() == EltTy;
+  };
+
+  getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
+      .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+                   InsertVectorEltPred, typeIs(2, sXLen)))
+      .legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST), InsertVectorEltPred,
+                   typeIs(2, sXLen)));
+
   getLegacyLegalizerInfo().computeTables();
   verify(*ST.getInstrInfo());
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 0f6e1ca..eedfdb3 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1771,6 +1771,10 @@ defm RELAXED_DOT_ADD :
             "i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc",
             "i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>;
 
+def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs),
+                                                         (v16i8 V128:$rhs))),
+          (RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>;
+
 //===----------------------------------------------------------------------===//
 // Relaxed BFloat16 dot product
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index d99f1eb..ddb99a5 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -75,8 +75,6 @@ static cl::opt<bool>
                                "expressed as branches by widenable conditions"),
                       cl::init(true));
 
-namespace {
-
 // Get the condition of \p I. It can either be a guard or a conditional branch.
 static Value *getCondition(Instruction *I) {
   if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
@@ -130,6 +128,8 @@ findInsertionPointForWideCondition(Instruction *WCOrGuard) {
   return std::nullopt;
 }
 
+namespace {
+
 class GuardWideningImpl {
   DominatorTree &DT;
   PostDominatorTree *PDT;
@@ -328,7 +328,7 @@ public:
   /// The entry point for this pass.
   bool run();
 };
-}
+} // namespace
 
 static bool isSupportedGuardInstruction(const Instruction *Insn) {
   if (isGuard(Insn))
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 3c14036e..6fb8197 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -26,8 +26,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-
 static cl::opt<unsigned>
     JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden,
                            cl::desc("Only split jump tables with size less or "
@@ -43,8 +41,8 @@ static cl::opt<unsigned> FunctionSizeThreshold(
              "or equal than this threshold."),
     cl::init(50));
 
+namespace llvm {
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
 } // end namespace llvm
 
 #define DEBUG_TYPE "jump-table-to-switch"
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9655173..b2c526b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -116,8 +116,6 @@ STATISTIC(NumIntAssociationsHoisted,
 STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
                                     "reassociated and hoisted out of the loop");
 
-namespace llvm {
-
 /// Memory promotion is enabled by default.
 static cl::opt<bool>
     DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
@@ -156,7 +154,7 @@ static cl::opt<unsigned> IntAssociationUpperLimit(
 // which may not be precise, since optimizeUses is capped. The result is
 // correct, but we may not get as "far up" as possible to get which access is
 // clobbering the one queried.
-cl::opt<unsigned> SetLicmMssaOptCap(
+cl::opt<unsigned> llvm::SetLicmMssaOptCap(
     "licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
     cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
              "for faster compile. Caps the MemorySSA clobbering calls."));
@@ -164,15 +162,15 @@ cl::opt<unsigned> SetLicmMssaOptCap(
 // Experimentally, memory promotion carries less importance than sinking and
 // hoisting. Limit when we do promotion when using MemorySSA, in order to save
 // compile time.
-cl::opt<unsigned> SetLicmMssaNoAccForPromotionCap(
+cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
     "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
     cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
              "effect. When MSSA in LICM is enabled, then this is the maximum "
              "number of accesses allowed to be present in a loop in order to "
              "enable memory promotion."));
 
+namespace llvm {
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
 } // end namespace llvm
 
 static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
@@ -1120,11 +1118,10 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
   return false;
 }
 
-namespace {
 /// Return true if-and-only-if we know how to (mechanically) both hoist and
 /// sink a given instruction out of a loop.  Does not address legality
 /// concerns such as aliasing or speculation safety.
-bool isHoistableAndSinkableInst(Instruction &I) {
+static bool isHoistableAndSinkableInst(Instruction &I) {
   // Only these instructions are hoistable/sinkable.
   return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
           isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) ||
@@ -1136,8 +1133,8 @@ bool isHoistableAndSinkableInst(Instruction &I) {
 }
 
 /// Return true if I is the only Instruction with a MemoryAccess in L.
-bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
-                        const MemorySSAUpdater &MSSAU) {
+static bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
+                               const MemorySSAUpdater &MSSAU) {
   for (auto *BB : L->getBlocks())
     if (auto *Accs = MSSAU.getMemorySSA()->getBlockAccesses(BB)) {
       int NotAPhi = 0;
@@ -1151,7 +1148,6 @@ bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
     }
   return true;
 }
-}
 
 static MemoryAccess *getClobberingMemoryAccess(MemorySSA &MSSA,
                                                BatchAAResults &BAA,
diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
index 73f1942..7706de8 100644
--- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
@@ -21,8 +21,7 @@
 
 #define DEBUG_TYPE "loop-bound-split"
 
-namespace llvm {
-
+using namespace llvm;
 using namespace PatternMatch;
 
 namespace {
@@ -358,8 +357,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
   IRBuilder<> Builder(&PostLoopPreHeader->front());
 
   // Update phi nodes in header of post-loop.
-  bool isExitingLatch =
-      (L.getExitingBlock() == L.getLoopLatch()) ? true : false;
+  bool isExitingLatch = L.getExitingBlock() == L.getLoopLatch();
   Value *ExitingCondLCSSAPhi = nullptr;
   for (PHINode &PN : L.getHeader()->phis()) {
     // Create LCSSA phi node in preheader of post-loop.
@@ -472,8 +470,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
 PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,
                                           LoopStandardAnalysisResults &AR,
                                           LPMUpdater &U) {
-  Function &F = *L.getHeader()->getParent();
-  (void)F;
+  [[maybe_unused]] Function &F = *L.getHeader()->getParent();
 
   LLVM_DEBUG(dbgs() << "Spliting bound of loop in " << F.getName() << ": " << L
                     << "\n");
@@ -486,5 +483,3 @@ PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,
 
   return getLoopPassPreservedAnalyses();
 }
-
-} // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index 26b9d99..8705647 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -206,7 +206,7 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 {
 
 ; global nnan function attribute always forces clamp combine
 
-define float @test_min_max_global_nnan(float %a) #3 {
+define float @test_min_max_global_nnan(float %a) {
 ; GFX10-LABEL: test_min_max_global_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -223,11 +223,11 @@ define float @test_min_max_global_nnan(float %a) #3 {
 ; GFX12-NEXT:    v_max_num_f32_e64 v0, v0, v0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call float @llvm.maxnum.f32(float %a, float 0.0)
-  %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0)
+  %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 1.0)
   ret float %fmed
 }
 
-define float @test_max_min_global_nnan(float %a) #3 {
+define float @test_max_min_global_nnan(float %a) {
 ; GFX10-LABEL: test_max_min_global_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -244,7 +244,7 @@ define float @test_max_min_global_nnan(float %a) #3 {
 ; GFX12-NEXT:    v_max_num_f32_e64 v0, v0, v0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call float @llvm.minnum.f32(float %a, float 1.0)
-  %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0)
+  %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 0.0)
   ret float %fmed
 }
 
@@ -414,5 +414,4 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
 attributes #0 = {"amdgpu-ieee"="true"}
 attributes #1 = {"amdgpu-ieee"="false"}
 attributes #2 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="true"}
-attributes #3 = {"no-nans-fp-math"="true"}
 attributes #4 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="false"}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
index d2c93e7..696a87b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
@@ -232,7 +232,7 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 {
 
 ; global nnan function attribute always forces fmed3 combine
 
-define float @test_min_max_global_nnan(float %a) #2 {
+define float @test_min_max_global_nnan(float %a) {
 ; GFX10-LABEL: test_min_max_global_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -254,12 +254,12 @@ define float @test_min_max_global_nnan(float %a) #2 {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
-  %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
+  %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
   %fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0)
   ret float %fmed
 }
 
-define float @test_max_min_global_nnan(float %a) #2 {
+define float @test_max_min_global_nnan(float %a) {
 ; GFX10-LABEL: test_max_min_global_nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -281,8 +281,8 @@ define float @test_max_min_global_nnan(float %a) #2 {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
-  %minnum = call float @llvm.minnum.f32(float %a, float 4.0)
-  %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
+  %minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0)
+  %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0)
   ret float %fmed
 }
 
@@ -560,4 +560,3 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>)
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
 attributes #0 = {"amdgpu-ieee"="true"}
 attributes #1 = {"amdgpu-ieee"="false"}
-attributes #2 = {"no-nans-fp-math"="true"}
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9e15225..3145a27 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -10,7 +10,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
 
-define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -120,7 +120,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
   ret void
 }
 
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
   ret void
 }
 
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -342,7 +342,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -453,7 +453,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -569,7 +569,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
   ret void
 }
 
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -740,7 +740,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
   ret void
 }
 
-define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -955,14 +955,14 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
   %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
   %a = load float, ptr addrspace(1) %gep0
 
-  %max = call float @llvm.maxnum.f32(float %a, float 2.0)
-  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+  %max = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
+  %med = call nnan float @llvm.minnum.f32(float %max, float 4.0)
 
   store float %med, ptr addrspace(1) %outgep
   ret void
 }
 
-define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1297,10 +1297,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
   %a.fneg = fsub float -0.0, %a
-  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -1487,10 +1487,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
   %b.fneg = fsub float -0.0, %b
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg)
-  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b.fneg)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b.fneg)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b.fneg)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -1677,10 +1677,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
   %c.fneg = fsub float -0.0, %c
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fneg)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -1872,14 +1872,14 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
   %c = load volatile float, ptr addrspace(1) %gep2
 
   %a.fneg = fsub float -0.0, %a
-  %b.fabs = call float @llvm.fabs.f32(float %b)
-  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %b.fabs = call nnan float @llvm.fabs.f32(float %b)
+  %c.fabs = call nnan float @llvm.fabs.f32(float %c)
   %c.fabs.fneg = fsub float -0.0, %c.fabs
 
-  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
-  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
 
   store float %med3, ptr addrspace(1) %outgep
   ret void
@@ -2082,16 +2082,16 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
   %c.fabs = call float @llvm.fabs.f32(float %c)
   %c.fabs.fneg = fsub float -0.0, %c.fabs
 
-  %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
-  %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
 
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
-define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2266,7 +2266,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
   ret void
 }
 
-define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2418,7 +2418,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
   ret void
 }
 
-define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2570,7 +2570,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
   ret void
 }
 
-define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2878,10 +2878,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -3030,10 +3030,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -3220,10 +3220,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
   %a.fneg = fsub float -0.0, %a
-  %tmp0 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
-  %tmp1 = call float @llvm.minnum.f32(float %a.fneg, float %b)
-  %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b)
+  %tmp1 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b)
+  %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -3372,10 +3372,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -3524,10 +3524,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -3676,10 +3676,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -3828,10 +3828,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -3980,10 +3980,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
-  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -4132,10 +4132,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -4284,10 +4284,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -4436,10 +4436,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -4588,10 +4588,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -4740,10 +4740,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -4892,10 +4892,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -5044,10 +5044,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -5196,10 +5196,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
-  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -5348,10 +5348,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
-  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
-  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
-  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -5503,10 +5503,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %tmp0 = call float @llvm.maxnum.f32(float %a, float %b)
-  %tmp1 = call float @llvm.minnum.f32(float %a, float %b)
-  %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %tmp1 = call nnan float @llvm.minnum.f32(float %a, float %b)
+  %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -5515,7 +5515,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
 ; Negative patterns
 ; ---------------------------------------------------------------------
 
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -5717,7 +5717,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -5944,7 +5944,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6146,7 +6146,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6352,7 +6352,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
   ret void
 }
 
-define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6527,7 +6527,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6702,7 +6702,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6877,7 +6877,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -7270,10 +7270,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
   %a.fneg = fsub float -0.0, %a
-  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
-  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %med3, ptr addrspace(1) %outgep
   ret void
 }
@@ -7428,13 +7428,13 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
   %a = load volatile float, ptr addrspace(1) %gep0
   %b = load volatile float, ptr addrspace(1) %gep1
   %c = load volatile float, ptr addrspace(1) %gep2
-  %max = call float @llvm.maxnum.f32(float %a, float %b)
-  %minmax = call float @llvm.minnum.f32(float %max, float %c)
+  %max = call nnan float @llvm.maxnum.f32(float %a, float %b)
+  %minmax = call nnan float @llvm.minnum.f32(float %max, float %c)
   store float %minmax, ptr addrspace(1) %outgep
   ret void
 }
 
-define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -7597,7 +7597,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
   ret void
 }
 
-define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
 ; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -7865,7 +7865,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
   ret void
 }
 
-define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: two_non_inline_constant:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -7998,7 +7998,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
 }
 
 ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants.
-define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: one_non_inline_constant:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -8137,7 +8137,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
   ret void
 }
 
-define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
 ; SI-SDAG-LABEL: two_non_inline_constant_multi_use:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -8343,7 +8343,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
   ret void
 }
 
-define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 {
+define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) {
 ; SI-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8384,7 +8384,7 @@ define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 {
   ret float %med
 }
 
-define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) #1 {
+define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) {
 ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8452,7 +8452,7 @@ define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %
   ret <2 x float> %med
 }
 
-define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) #1 {
+define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) {
 ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8525,7 +8525,7 @@ define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(
   ret { float, float } %ins.1
 }
 
-define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) {
 ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8567,7 +8567,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a)
   ret float %med
 }
 
-define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) {
 ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8609,7 +8609,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a)
   ret float %med
 }
 
-define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) {
 ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8651,7 +8651,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 {
   ret float %med
 }
 
-define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) {
 ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8693,7 +8693,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 {
   ret float %med
 }
 
-define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
+define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) {
 ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8772,7 +8772,7 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
   ret half %med
 }
 
-define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) #1 {
+define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) {
 ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8848,7 +8848,7 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a)
   ret <2 x half> %med
 }
 
-define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) #1 {
+define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) {
 ; SI-LABEL: v_test_fmed3_r_i_i_f64_minimumnum_maximumnum:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8905,5 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0
 declare half @llvm.maxnum.f16(half, half) #0
 
 attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
 attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 56f9c5d..d578d2e 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -612,10 +612,10 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
 ; GFX1250-NEXT:    v_med3_num_f32 v2, v2, v3, v4
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
-  %tmp0 = call float @llvm.minnum.f32(float %x, float %y)
-  %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
-  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
-  %tmp3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minnum.f32(float %x, float %y)
+  %tmp1 = call nnan float @llvm.maxnum.f32(float %x, float %y)
+  %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %z)
+  %tmp3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
   store float %tmp3, ptr addrspace(1) %arg
   ret void
 }
@@ -646,10 +646,10 @@ define void @test_med3_minimumnum_maximumnum_f32(ptr addrspace(1) %arg, float %x
 ; GFX1250-NEXT:    v_med3_num_f32 v2, v2, v3, v4
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
-  %tmp0 = call float @llvm.minimumnum.f32(float %x, float %y)
-  %tmp1 = call float @llvm.maximumnum.f32(float %x, float %y)
-  %tmp2 = call float @llvm.minimumnum.f32(float %tmp1, float %z)
-  %tmp3 = call float @llvm.maximumnum.f32(float %tmp0, float %tmp2)
+  %tmp0 = call nnan float @llvm.minimumnum.f32(float %x, float %y)
+  %tmp1 = call nnan float @llvm.maximumnum.f32(float %x, float %y)
+  %tmp2 = call nnan float @llvm.minimumnum.f32(float %tmp1, float %z)
+  %tmp3 = call nnan float @llvm.maximumnum.f32(float %tmp0, float %tmp2)
   store float %tmp3, ptr addrspace(1) %arg
   ret void
 }
@@ -1280,10 +1280,10 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
 ; GISEL-GFX1250-FAKE16-NEXT:    v_med3_num_f16 v2, v2, v3, v4
 ; GISEL-GFX1250-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GISEL-GFX1250-FAKE16-NEXT:    s_set_pc_i64 s[30:31]
-  %tmp0 = call half @llvm.minnum.f16(half %x, half %y)
-  %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
-  %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
-  %tmp3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
+  %tmp0 = call nnan half @llvm.minnum.f16(half %x, half %y)
+  %tmp1 = call nnan half @llvm.maxnum.f16(half %x, half %y)
+  %tmp2 = call nnan half @llvm.minnum.f16(half %tmp1, half %z)
+  %tmp3 = call nnan half @llvm.maxnum.f16(half %tmp0, half %tmp2)
   store half %tmp3, ptr addrspace(1) %arg
   ret void
 }
diff --git a/llvm/test/CodeGen/DirectX/bufferGetDimensions.ll b/llvm/test/CodeGen/DirectX/bufferGetDimensions.ll
new file mode 100644
index 0000000..ff03bf1
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/bufferGetDimensions.ll
@@ -0,0 +1,16 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+define i32 @test_getdimensions_no_mips() {
+  ; CHECK: %[[HANDLE:.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, 
+  ; CHECK-NEXT: %[[ANNOT_HANDLE:.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[HANDLE]]
+  %handle = call target("dx.TypedBuffer", <4 x float>, 0, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+
+  ; CHECK-NEXT: %[[RETVAL:.*]] = call %dx.types.Dimensions @dx.op.getDimensions(i32 72, %dx.types.Handle %[[ANNOT_HANDLE]], i32 undef)
+  ; CHECK-NEXT: %[[DIM:.*]] = extractvalue %dx.types.Dimensions %[[RETVAL]], 0
+  %1 = call i32 @llvm.dx.resource.getdimensions.x(target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %handle)
+  
+  ; CHECK-NEXT: ret i32 %[[DIM]]
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/Hexagon/swp-many-stores.mir b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir
new file mode 100644
index 0000000..bf14dcf
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir
@@ -0,0 +1,88 @@
+# RUN: llc -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null -pipeliner-max-num-stores=5 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# This loop has six stores, which exceeds the limit set by
+# `pipeliner-max-num-stores`.
+
+# CHECK: Too many stores
+
+--- |
+  target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+  target triple = "hexagon-unknown-linux-musl"
+  
+  define void @f(ptr %a, i32 %n) #0 {
+  entry:
+    %guard = icmp sgt i32 %n, 0
+    %btc = sub nsw i32 %n, 1
+    br i1 %guard, label %loop.preheader, label %exit
+  
+  loop.preheader:                                   ; preds = %entry
+    %0 = add i32 %n, 1
+    %cgep = getelementptr i8, ptr %a, i32 %0
+    br label %loop
+  
+  loop:                                             ; preds = %loop.preheader, %loop
+    %lsr.iv = phi ptr [ %cgep, %loop.preheader ], [ %cgep8, %loop ]
+    %i = phi i32 [ %i.dec, %loop ], [ %btc, %loop.preheader ]
+    %cgep7 = getelementptr i8, ptr %lsr.iv, i32 -2
+    store i8 0, ptr %cgep7, align 1
+    %cgep8 = getelementptr i8, ptr %lsr.iv, i32 -1
+    store i8 1, ptr %cgep8, align 1
+    store i8 2, ptr %lsr.iv, align 1
+    %cgep9 = getelementptr i8, ptr %lsr.iv, i32 1
+    store i8 3, ptr %cgep9, align 1
+    %cgep10 = getelementptr i8, ptr %lsr.iv, i32 2
+    store i8 4, ptr %cgep10, align 1
+    %cgep11 = getelementptr i8, ptr %lsr.iv, i32 3
+    store i8 5, ptr %cgep11, align 1
+    %i.dec = sub i32 %i, 1
+    %ec = icmp eq i32 %i.dec, 0
+    br i1 %ec, label %exit, label %loop
+  
+  exit:                                             ; preds = %loop, %entry
+    ret void
+  }
+  
+  attributes #0 = { "target-cpu"="hexagonv79" }
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.3(0x30000000)
+    liveins: $r0, $r1
+  
+    %7:intregs = COPY $r1
+    %6:intregs = COPY $r0
+    %8:predregs = C2_cmpgti %7, 0
+    J2_jumpf %8, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.loop.preheader:
+    successors: %bb.2(0x80000000)
+  
+    %0:intregs = A2_addi %7, -1
+    %1:intregs = S4_addaddi %7, %6, 1
+    %10:intregs = A2_tfrsi 0
+    %11:intregs = A2_tfrsi 1
+    %14:intregs = COPY %0
+    J2_loop0r %bb.2, %14, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.2.loop (machine-block-address-taken):
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %2:intregs = PHI %1, %bb.1, %4, %bb.2
+    S2_storerb_io %2, -2, %10 :: (store (s8) into %ir.cgep7)
+    %4:intregs = A2_addi %2, -1
+    S2_storerb_io %2, -1, %11 :: (store (s8) into %ir.cgep8)
+    S4_storeirb_io %2, 0, 2 :: (store (s8) into %ir.lsr.iv)
+    S4_storeirb_io %2, 1, 3 :: (store (s8) into %ir.cgep9)
+    S4_storeirb_io %2, 2, 4 :: (store (s8) into %ir.cgep10)
+    S4_storeirb_io %2, 3, 5 :: (store (s8) into %ir.cgep11)
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.3.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
new file mode 100644
index 0000000..e67d031
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
+; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
+; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; The addition of vector `A` with vector of 1s currently uses `vspltisw` to generate vector of 1s followed by add operation.
+
+; Function for the vector type v2i64 `a + {1, 1}`
+define <2 x i64> @test_v2i64(<2 x i64> %a) {
+; CHECK-LABEL: test_v2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vspltisw v3, 1
+; CHECK-NEXT:    vupklsw v3, v3
+; CHECK-NEXT:    vaddudm v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %add = add <2 x i64> %a, splat (i64 1)
+  ret <2 x i64> %add
+}
+
+; Function for the vector type v4i32 `a + {1, 1, 1, 1}`
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: test_v4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vspltisw v3, 1
+; CHECK-NEXT:    vadduwm v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %add = add <4 x i32> %a, splat (i32 1)
+  ret <4 x i32> %add
+}
+
+; Function for the vector type v8i16 `a + {1, 1, 1, 1, 1, 1, 1, 1}`
+define <8 x i16> @test_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: test_v8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vspltish v3, 1
+; CHECK-NEXT:    vadduhm v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %add = add <8 x i16> %a, splat (i16 1)
+  ret <8 x i16> %add
+}
+
+; Function for the vector type v16i8 `a + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}`
+define <16 x i8> @test_16i8(<16 x i8> %a) {
+; CHECK-LABEL: test_16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltib v3, 1
+; CHECK-NEXT:    vaddubm v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %add = add <16 x i8> %a, splat (i8 1)
+  ret <16 x i8> %add
+}
diff --git a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
deleted file mode 100644
index e4c93adc..0000000
--- a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
-
-; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
-; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
-
-; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
-; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
-
-; Currently the generated code uses `vspltisw` to generate vector of 1s followed by add operation.
-; This pattern is expected to be optimized in a future patch by using `xxleqv` to generate vector of -1s
-; followed by subtraction operation.
-define dso_local noundef <4 x i32> @test1(<4 x i32> %a) {
-; CHECK-LABEL: test1:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vspltisw v3, 1
-; CHECK-NEXT:    vadduwm v2, v2, v3
-; CHECK-NEXT:    blr
-entry:
-  %add = add <4 x i32> %a, splat (i32 1)
-  ret <4 x i32> %add
-}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
index 2e500d5..da7546e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
@@ -689,8 +689,8 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_INSERT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_EXTRACT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir
new file mode 100644
index 0000000..d7c0e80
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir
@@ -0,0 +1,1742 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name:            insertelement_nxv1i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 1 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv1i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 1 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv1i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv1i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s32) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s32)
+    %1:_(s32) = COPY $x11
+    %4:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    %3:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32)
+    $v0 = COPY %3(<vscale x 1 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv2i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s32) = G_CONSTANT i32 1
+    %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 2 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv2i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 2 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv2i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv2i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s32) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s32)
+    %1:_(s32) = COPY $x11
+    %4:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    %3:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32)
+    $v0 = COPY %3(<vscale x 2 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv4i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s32) = G_CONSTANT i32 2
+    %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv4i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv4i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i1_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(s32) = COPY $x10
+    %0:_(s1) = G_TRUNC %1(s32)
+    %3:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %3, %0(s1), %4(s32)
+    $v0 = COPY %2(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv8i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 8 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv8i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 8 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv8i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv8i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s32) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s32)
+    %1:_(s32) = COPY $x11
+    %4:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    %3:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32)
+    $v0 = COPY %3(<vscale x 8 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv16i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s32) = G_CONSTANT i32 15
+    %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv16i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+    $v0 = COPY %0(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv16i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv16i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s32) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s32)
+    %1:_(s32) = COPY $x11
+    %4:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %3:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32)
+    $v0 = COPY %3(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv4i1_3
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $v0, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i1_3
+    ; CHECK: liveins: $v0, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s1), [[C]](s32)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %0:_(<vscale x 4 x s1>) = COPY $v0
+    %2:_(s32) = COPY $x10
+    %1:_(s1) = G_TRUNC %2(s32)
+    %4:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %0, %1(s1), %4(s32)
+    $v0 = COPY %3(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv1i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s32)
+    %3:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+    $v8 = COPY %2(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s32)
+    %3:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+    $v8 = COPY %2(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s32)
+    %3:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+    $v8 = COPY %2(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv8i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv8i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8 = COPY %0(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv8i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s32)
+    %3:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+    $v8 = COPY %2(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv16i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8m2 = COPY %0(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv16i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+    $v8m2 = COPY %0(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv16i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11, $x12
+
+    ; CHECK-LABEL: name: insertelement_nxv16i8_2
+    ; CHECK: liveins: $x10, $x11, $x12
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[COPY1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %2:_(s32) = COPY $x10
+    %0:_(s8) = G_TRUNC %2(s32)
+    %3:_(s32) = COPY $x11
+    %4:_(s32) = COPY $x12
+    %1:_(s64) = G_MERGE_VALUES %3(s32), %4(s32)
+    %6:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %7:_(s32) = G_TRUNC %1(s64)
+    %5:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %6, %0(s8), %7(s32)
+    $v8m2 = COPY %5(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv4i8_3
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i8_3
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s8), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(<vscale x 4 x s8>) = COPY $v8
+    %2:_(s32) = COPY $x10
+    %1:_(s8) = G_TRUNC %2(s32)
+    %4:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %0, %1(s8), %4(s32)
+    $v8 = COPY %3(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8 = COPY %2(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 1
+    %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8 = COPY %2(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8 = COPY %0(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8 = COPY %2(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv8i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8m2 = COPY %0(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv8i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8m2 = COPY %0(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv8i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8m2 = COPY %2(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv16i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8m4 = COPY %0(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv16i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+    $v8m4 = COPY %0(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv16i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv16i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(s32) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s32)
+    %3:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %4:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+    $v8m4 = COPY %2(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv4i16
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i16
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s16), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(<vscale x 4 x s16>) = COPY $v8
+    %2:_(s32) = COPY $x10
+    %1:_(s16) = G_TRUNC %2(s32)
+    %4:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %0, %1(s16), %4(s32)
+    $v8 = COPY %3(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+    $v8 = COPY %0(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(s32) = COPY $x10
+    %2:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %3:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    $v8 = COPY %1(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+    $v8 = COPY %0(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+    $v8 = COPY %0(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(s32) = COPY $x10
+    %2:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %3:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    $v8 = COPY %1(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+    $v8m2 = COPY %0(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv4i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+    $v8m2 = COPY %0(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv4i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(s32) = COPY $x10
+    %2:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    %3:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    $v8m2 = COPY %1(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv8i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+    $v8m4 = COPY %0(<vscale x 8 x s32>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv8i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+    $v8m4 = COPY %0(<vscale x 8 x s32>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv8i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %0:_(s32) = COPY $x10
+    %2:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    %3:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    $v8m4 = COPY %1(<vscale x 8 x s32>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv16i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+    $v8m8 = COPY %0(<vscale x 16 x s32>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv16i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+    $v8m8 = COPY %0(<vscale x 16 x s32>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv16i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv16i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %0:_(s32) = COPY $x10
+    %2:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    %3:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    $v8m8 = COPY %1(<vscale x 16 x s32>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv4i32
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: insertelement_nxv4i32
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[C]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(<vscale x 4 x s32>) = COPY $v8m2
+    %1:_(s32) = COPY $x10
+    %3:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %0, %1(s32), %3(s32)
+    $v8m2 = COPY %2(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv1i64_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i64_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s64>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i64_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i64_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+    $v8 = COPY %0(<vscale x 1 x s64>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i64_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv1i64_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = COPY $x10
+    %2:_(s32) = COPY $x11
+    %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    %5:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32)
+    $v8 = COPY %3(<vscale x 1 x s64>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i64_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i64_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+    $v8m2 = COPY %0(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv2i64_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i64_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+    $v8m2 = COPY %0(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv2i64_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv2i64_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(s32) = COPY $x10
+    %2:_(s32) = COPY $x11
+    %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    %5:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32)
+    $v8m2 = COPY %3(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv4i64_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i64_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+    $v8m4 = COPY %0(<vscale x 4 x s64>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv4i64_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i64_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+    $v8m4 = COPY %0(<vscale x 4 x s64>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv4i64_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv4i64_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(s32) = COPY $x10
+    %2:_(s32) = COPY $x11
+    %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    %5:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32)
+    $v8m4 = COPY %3(<vscale x 4 x s64>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv8i64_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i64_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 0
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+    $v8m8 = COPY %0(<vscale x 8 x s64>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv8i64_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i64_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 -1
+    %3:_(s32) = G_CONSTANT i32 0
+    %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+    $v8m8 = COPY %0(<vscale x 8 x s64>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv8i64_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv8i64_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(s32) = COPY $x10
+    %2:_(s32) = COPY $x11
+    %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    %5:_(s32) = G_CONSTANT i32 0
+    %3:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32)
+    $v8m8 = COPY %3(<vscale x 8 x s64>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv4i64
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11, $v8m4
+
+    ; CHECK-LABEL: name: insertelement_nxv4i64
+    ; CHECK: liveins: $x10, $x11, $v8m4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s64>) = COPY $v8m4
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x11
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[MV]](s64), [[C]](s32)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %0:_(<vscale x 4 x s64>) = COPY $v8m4
+    %2:_(s32) = COPY $x10
+    %3:_(s32) = COPY $x11
+    %1:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+    %5:_(s32) = G_CONSTANT i32 0
+    %4:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %0, %1(s64), %5(s32)
+    $v8m4 = COPY %4(<vscale x 4 x s64>)
+    PseudoRET implicit $v8m4
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir
new file mode 100644
index 0000000..4c33ddc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir
@@ -0,0 +1,1731 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name:            insertelement_nxv1i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 1 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv1i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 1 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv1i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv1i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s64) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s64)
+    %3:_(s64) = COPY $x11
+    %1:_(s32) = G_TRUNC %3(s64)
+    %5:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    %6:_(s64) = G_ZEXT %1(s32)
+    %4:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+    $v0 = COPY %4(<vscale x 1 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv2i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s64) = G_CONSTANT i64 1
+    %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 2 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv2i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 2 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv2i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv2i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s64) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s64)
+    %3:_(s64) = COPY $x11
+    %1:_(s32) = G_TRUNC %3(s64)
+    %5:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    %6:_(s64) = G_ZEXT %1(s32)
+    %4:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+    $v0 = COPY %4(<vscale x 2 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv4i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s64) = G_CONSTANT i64 2
+    %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv4i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv4i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i1_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(s64) = COPY $x10
+    %0:_(s1) = G_TRUNC %1(s64)
+    %3:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %3, %0(s1), %4(s64)
+    $v0 = COPY %2(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv8i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 8 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv8i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 8 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv8i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv8i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s64) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s64)
+    %3:_(s64) = COPY $x11
+    %1:_(s32) = G_TRUNC %3(s64)
+    %5:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+    %6:_(s64) = G_ZEXT %1(s32)
+    %4:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+    $v0 = COPY %4(<vscale x 8 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv16i1_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i1_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 false
+    %3:_(s64) = G_CONSTANT i64 15
+    %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv16i1_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i1_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %2:_(s1) = G_CONSTANT i1 true
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+    $v0 = COPY %0(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv16i1_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv16i1_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %2:_(s64) = COPY $x10
+    %0:_(s1) = G_TRUNC %2(s64)
+    %3:_(s64) = COPY $x11
+    %1:_(s32) = G_TRUNC %3(s64)
+    %5:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    %6:_(s64) = G_ZEXT %1(s32)
+    %4:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+    $v0 = COPY %4(<vscale x 16 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv4i1_3
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $v0, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i1_3
+    ; CHECK: liveins: $v0, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s1), [[C]](s64)
+    ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v0
+    %0:_(<vscale x 4 x s1>) = COPY $v0
+    %2:_(s64) = COPY $x10
+    %1:_(s1) = G_TRUNC %2(s64)
+    %4:_(s64) = G_CONSTANT i64 0
+    %3:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %0, %1(s1), %4(s64)
+    $v0 = COPY %3(<vscale x 4 x s1>)
+    PseudoRET implicit $v0
+...
+---
+name:            insertelement_nxv1i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s64)
+    %3:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+    $v8 = COPY %2(<vscale x 1 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s64)
+    %3:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+    $v8 = COPY %2(<vscale x 2 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s64)
+    %3:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+    $v8 = COPY %2(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv8i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv8i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8 = COPY %0(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv8i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i8_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %1(s64)
+    %3:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+    $v8 = COPY %2(<vscale x 8 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv16i8_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i8_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8m2 = COPY %0(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv16i8_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i8_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %2:_(s8) = G_CONSTANT i8 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+    $v8m2 = COPY %0(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv16i8_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: insertelement_nxv16i8_2
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[COPY1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %2:_(s64) = COPY $x10
+    %0:_(s8) = G_TRUNC %2(s64)
+    %1:_(s64) = COPY $x11
+    %4:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+    %3:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %4, %0(s8), %1(s64)
+    $v8m2 = COPY %3(<vscale x 16 x s8>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv4i8_3
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i8_3
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s8), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(<vscale x 4 x s8>) = COPY $v8
+    %2:_(s64) = COPY $x10
+    %1:_(s8) = G_TRUNC %2(s64)
+    %4:_(s64) = G_CONSTANT i64 0
+    %3:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %0, %1(s8), %4(s64)
+    $v8 = COPY %3(<vscale x 4 x s8>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64)
+    %3:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8 = COPY %2(<vscale x 1 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s64) = G_CONSTANT i64 1
+    %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64)
+    %3:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8 = COPY %2(<vscale x 2 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8 = COPY %0(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64)
+    %3:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8 = COPY %2(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv8i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8m2 = COPY %0(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv8i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8m2 = COPY %0(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv8i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64)
+    %3:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8m2 = COPY %2(<vscale x 8 x s16>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv16i16_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i16_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8m4 = COPY %0(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv16i16_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i16_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %2:_(s16) = G_CONSTANT i16 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+    $v8m4 = COPY %0(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv16i16_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv16i16_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(s64) = COPY $x10
+    %0:_(s16) = G_TRUNC %1(s64)
+    %3:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+    $v8m4 = COPY %2(<vscale x 16 x s16>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv4i16
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $v8, $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i16
+    ; CHECK: liveins: $v8, $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s16), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(<vscale x 4 x s16>) = COPY $v8
+    %2:_(s64) = COPY $x10
+    %1:_(s16) = G_TRUNC %2(s64)
+    %4:_(s64) = G_CONSTANT i64 0
+    %3:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %0, %1(s16), %4(s64)
+    $v8 = COPY %3(<vscale x 4 x s16>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s32) = G_TRUNC %1(s64)
+    %3:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+    $v8 = COPY %2(<vscale x 1 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8 = COPY %0(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s64) = COPY $x10
+    %0:_(s32) = G_TRUNC %1(s64)
+    %3:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+    $v8 = COPY %2(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv4i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8m2 = COPY %0(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv4i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8m2 = COPY %0(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv4i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(s64) = COPY $x10
+    %0:_(s32) = G_TRUNC %1(s64)
+    %3:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+    $v8m2 = COPY %2(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv8i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8m4 = COPY %0(<vscale x 8 x s32>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv8i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8m4 = COPY %0(<vscale x 8 x s32>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv8i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(s64) = COPY $x10
+    %0:_(s32) = G_TRUNC %1(s64)
+    %3:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+    $v8m4 = COPY %2(<vscale x 8 x s32>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv16i32_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i32_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 0
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8m8 = COPY %0(<vscale x 16 x s32>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv16i32_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv16i32_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    %2:_(s32) = G_CONSTANT i32 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+    $v8m8 = COPY %0(<vscale x 16 x s32>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv16i32_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv16i32_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(s64) = COPY $x10
+    %0:_(s32) = G_TRUNC %1(s64)
+    %3:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+    %4:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+    $v8m8 = COPY %2(<vscale x 16 x s32>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv4i32
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10, $v8m2
+
+    ; CHECK-LABEL: name: insertelement_nxv4i32
+    ; CHECK: liveins: $x10, $v8m2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s32), [[C]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(<vscale x 4 x s32>) = COPY $v8m2
+    %2:_(s64) = COPY $x10
+    %1:_(s32) = G_TRUNC %2(s64)
+    %4:_(s64) = G_CONSTANT i64 0
+    %3:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %0, %1(s32), %4(s64)
+    $v8m2 = COPY %3(<vscale x 4 x s32>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv1i64_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i64_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64)
+    $v8 = COPY %0(<vscale x 1 x s64>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i64_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv1i64_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64)
+    $v8 = COPY %0(<vscale x 1 x s64>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv1i64_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv1i64_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64)
+    ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:_(s64) = COPY $x10
+    %2:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+    %3:_(s64) = G_CONSTANT i64 0
+    %1:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64)
+    $v8 = COPY %1(<vscale x 1 x s64>)
+    PseudoRET implicit $v8
+...
+---
+name:            insertelement_nxv2i64_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i64_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64)
+    $v8m2 = COPY %0(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv2i64_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv2i64_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64)
+    $v8m2 = COPY %0(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv2i64_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv2i64_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64)
+    ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(s64) = COPY $x10
+    %2:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    %3:_(s64) = G_CONSTANT i64 0
+    %1:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64)
+    $v8m2 = COPY %1(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+...
+---
+name:            insertelement_nxv4i64_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i64_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64)
+    $v8m4 = COPY %0(<vscale x 4 x s64>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv4i64_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv4i64_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64)
+    $v8m4 = COPY %0(<vscale x 4 x s64>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv4i64_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv4i64_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64)
+    ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m4
+    %0:_(s64) = COPY $x10
+    %2:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+    %3:_(s64) = G_CONSTANT i64 0
+    %1:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64)
+    $v8m4 = COPY %1(<vscale x 4 x s64>)
+    PseudoRET implicit $v8m4
+...
+---
+name:            insertelement_nxv8i64_0
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i64_0
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64)
+    $v8m8 = COPY %0(<vscale x 8 x s64>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv8i64_1
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: insertelement_nxv8i64_1
+    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    %2:_(s64) = G_CONSTANT i64 -1
+    %3:_(s64) = G_CONSTANT i64 0
+    %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64)
+    $v8m8 = COPY %0(<vscale x 8 x s64>)
+    PseudoRET implicit $v8m8
+...
+---
+name:            insertelement_nxv8i64_2
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: insertelement_nxv8i64_2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64)
+    ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %0:_(s64) = COPY $x10
+    %2:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+    %3:_(s64) = G_CONSTANT i64 0
+    %1:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64)
+    $v8m8 = COPY %1(<vscale x 8 x s64>)
+    PseudoRET implicit $v8m8
+...
diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
index 04a2268..314e1b4 100644
--- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
 ; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
+; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
 
 target triple = "wasm32"
 
@@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -47,6 +52,109 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i32_mac_u8_s8:
+; CHECK: loop
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i32x4.mul
+; CHECK: i32x4.add
+
+; MAX-BANDWIDTH: loop
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: loop
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+entry:
+  %cmp7.not = icmp eq i32 %N, 0
+  br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %res.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %0 to i32
+  %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
+  %1 = load i8, ptr %arrayidx1, align 1
+  %conv2 = zext i8 %1 to i32
+  %mul = mul nsw i32 %conv2, %conv
+  %add = add nsw i32 %mul, %res.08
+  %inc = add nuw i32 %i.09, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
 define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i32_mac_s16:
 ; CHECK:    i32x4.load16x4_s 0:p2align=1
@@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: v128.load
 ; MAX-BANDWIDTH: v128.load
 ; MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.add
 
 entry:
   %cmp7.not = icmp eq i32 %N, 0
@@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
 ; MAX-BANDWIDTH: i64x2.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
 ; MAX-BANDWIDTH: i64x2.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
 entry:
   %cmp6.not = icmp eq i32 %N, 0
   br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
@@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
 ; MAX-BANDWIDTH: i64x2.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
+
 entry:
   %cmp8.not = icmp eq i32 %N, 0
   br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
@@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
 ; MAX-BANDWIDTH: i64x2.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
 entry:
   %cmp6.not = icmp eq i32 %N, 0
   br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
index 279d4e8..83623fd 100644
--- a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
+++ b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;CHECK-LABEL: @foo(
 ;CHECK: icmp eq <4 x i32>
 ;CHECK: select <4 x i1>
-;CHECK: ret i32
-define i32 @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp {
+;CHECK: ret void
+define void @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp {
 entry:
   %cmp10 = icmp sgt i32 %x, 0
   br i1 %cmp10, label %for.body, label %for.end
@@ -35,5 +35,5 @@ if.end:                                           ; preds = %for.body, %if.then
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %if.end, %entry
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index 596e42e..d0c1194 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -36,7 +36,7 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %exit, label %for.body
 }
 
-define i32 @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
+define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
 
 ; CHECK-COST-2: LV: Found an estimated cost of 0 for VF 1 For instruction:   %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ]
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.032
@@ -70,7 +70,7 @@ for.cond.cleanup.loopexit:                        ; preds = %if.end
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  ret i32 undef
+  ret void
 
 for.body:                                         ; preds = %for.body.preheader, %if.end
   %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
index 9e20586..44fb8cb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ; CHECK-LABEL: @read_mod_write_single_ptr(
 ; CHECK: load <8 x float>
-; CHECK: ret i32
-define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -23,15 +23,15 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
 
 
 ; CHECK-LABEL: @read_mod_i64(
 ; SLOWMEM32: load <2 x i64>
 ; FASTMEM32: load <4 x i64>
-; CHECK: ret i32
-define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -47,6 +47,6 @@ define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 6d2cda4..0287645 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
+define void @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
 ; CHECK-LABEL: @conversion_cost1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[ITER_CHECK:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -37,7 +37,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[IND_END5:%.*]] = add i64 3, [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -58,7 +58,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT9]] = add <4 x i8> [[VEC_IND8]], splat (i8 4)
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -73,11 +73,11 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       ._crit_edge.loopexit:
 ; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
 ; CHECK:       ._crit_edge:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
   %1 = icmp sgt i32 %n, 3
   br i1 %1, label %.lr.ph, label %._crit_edge
@@ -93,10 +93,10 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
 
-define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
+define void @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
 ; CHECK-LABEL: @conversion_cost2(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 9
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -136,7 +136,7 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_3]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -152,11 +152,11 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       ._crit_edge.loopexit:
 ; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
 ; CHECK:       ._crit_edge:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
   %1 = icmp sgt i32 %n, 9
   br i1 %1, label %.lr.ph, label %._crit_edge
@@ -173,5 +173,5 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
index af5c921..fa3b4a66 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux"
 ;CHECK-LABEL: func1x6(
 ;CHECK: <4 x i32>
 ;CHECK: ret
-define i32 @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
+define void @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
 entry:
   br label %for.body
 
@@ -40,14 +40,14 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
-  ret i32 undef
+  ret void
 }
 
 ; We are vectorizing with 12 runtime checks.
 ;CHECK-LABEL: func2x6(
 ;CHECK: <4 x i32>
 ;CHECK: ret
-define i32 @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
+define void @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
 entry:
   br label %for.body
 
@@ -85,5 +85,5 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
index 8971dfe..47355e7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK-NOUNRL: store <4 x i32>
 ;CHECK-NOUNRL-NOT: store <4 x i32>
 ;CHECK-NOUNRL: ret
-define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
+define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -27,5 +27,5 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
index f64255f..b7aa958 100644
--- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
+++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; When scalarizing stores we need to preserve the original order.
 ; Make sure that we are extracting in the correct order (0101, and not 0011).
 
-define i32 @foo(ptr nocapture %A) {
+define void @foo(ptr nocapture %A) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
@@ -39,7 +39,7 @@ define i32 @foo(ptr nocapture %A) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %for.body
@@ -55,7 +55,7 @@ for.body:
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:
-  ret i32 undef
+  ret void
 }
 
 
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 1588d02..51255b2 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -3,7 +3,7 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
+define void @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -73,7 +73,7 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp26 = icmp sgt i32 %n, 0
@@ -106,11 +106,11 @@ if.end14:
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:
-  ret i32 undef
+  ret void
 }
 
 ; As above but with multiple variables set per block.
-define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
+define void @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
 ; CHECK-LABEL: @multi_variable_if_nest(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -188,7 +188,7 @@ define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp26 = icmp sgt i32 %n, 0
@@ -224,5 +224,5 @@ if.end14:
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
index 8a7f4a3..a88a9b14 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
@@ -17,8 +17,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;  }
 ;}
 
-define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
-; CHECK-LABEL: define i32 @function0(
+define void @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
+; CHECK-LABEL: define void @function0(
 ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[START:%.*]], i32 [[END:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CMP16:%.*]] = icmp slt i32 [[START]], [[END]]
@@ -94,7 +94,7 @@ define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end)
 ; CHECK:       [[FOR_END_LOOPEXIT]]:
 ; CHECK-NEXT:    br label %[[FOR_END]]
 ; CHECK:       [[FOR_END]]:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp16 = icmp slt i32 %start, %end
@@ -127,7 +127,7 @@ if.end:
   br i1 %cmp, label %for.body, label %for.end
 
 for.end:
-  ret i32 undef
+  ret void
 }
 
 
@@ -237,6 +237,8 @@ for.end:                                          ; preds = %for.inc, %entry
 ; Handle PHI with single incoming value having a full mask.
 ; PR34523
 
+; NOTE: Changing PHI inputs from undef to poison leads to change in
+; behaviour of the test. Left as undef for now.
 define void @PR34523() {
 ; CHECK-LABEL: define void @PR34523() {
 ; CHECK-NEXT:  [[BB1:.*:]]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 742ee64..eea2237 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -337,7 +337,7 @@ for.end:                                          ; preds = %for.body
 ;    }
 ;  }
 
-define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
+define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
 ; CHECK-LABEL: @multiple_uniform_stores(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -429,7 +429,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
 ; CHECK:       for.end10.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END10]]
 ; CHECK:       for.end10:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp20 = icmp eq i32 %itr, 0
@@ -469,12 +469,12 @@ for.inc8:                                         ; preds = %for.body3, %for.con
   br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
 
 for.end10:                                        ; preds = %for.inc8, %entry
-  ret i32 undef
+  ret void
 }
 
 ; second uniform store to the same address is conditional.
 ; we do not vectorize this.
-define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
+define void @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
 ; CHECK-LABEL: @multiple_uniform_stores_conditional(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -520,7 +520,7 @@ define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocaptu
 ; CHECK:       for.end10.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END10]]
 ; CHECK:       for.end10:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp20 = icmp eq i32 %itr, 0
@@ -567,7 +567,7 @@ for.inc8:                                         ; preds = %for.body3, %for.con
   br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
 
 for.end10:                                        ; preds = %for.inc8, %entry
-  ret i32 undef
+  ret void
 }
 
 ; cannot vectorize loop with unsafe dependency between uniform load (%i10) and store
diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll
index b891b43..d9d9eec 100644
--- a/llvm/test/Transforms/LoopVectorize/memdep.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -132,7 +132,7 @@ for.end:
 ; CHECK-LABEL: @f6
 ; CHECK-NOT: <2 x i32>
 
-define i32 @f6(ptr %a, i32 %tmp) {
+define void @f6(ptr %a, i32 %tmp) {
 entry:
   br label %for.body
 
@@ -149,7 +149,7 @@ for.body:
   br i1 %exitcond, label %for.body, label %for.end
 
 for.end:
-  ret i32 undef
+  ret void
 }
 
 ; Don't vectorize true loop carried dependencies that are not a multiple of the
diff --git a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
index d700d48..f5e480c 100644
--- a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
+++ b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
@@ -10,7 +10,7 @@
 ; CHECK: store i64 %indvars.outer, ptr %O2, align 4
 
 
-define i64 @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) {
+define void @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) {
 entry:
   %cmp = icmp sgt i64 %n, 0
   br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
@@ -50,5 +50,5 @@ for.end.outer.loopexit:                           ; preds = %for.end.inner
   br label %for.end.outer
 
 for.end.outer:                                    ; preds = %for.end.outer.loopexit, %entry
-  ret i64 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/pr28541.ll b/llvm/test/Transforms/LoopVectorize/pr28541.ll
index ad7f6e7..0a9c8c1 100644
--- a/llvm/test/Transforms/LoopVectorize/pr28541.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr28541.ll
@@ -28,7 +28,7 @@
 ; CHECK-NOT: vectorized loop
 ; CHECK-LABEL: fn1
 
-define i32 @fn1() {
+define void @fn1() {
 entry:
   %tmp2 = load i32, ptr @b, align 4
   %dec3 = add nsw i32 %tmp2, -1
@@ -67,5 +67,5 @@ while.cond.while.end_crit_edge:                   ; preds = %while.cond
   br label %while.end
 
 while.end:                                        ; preds = %while.cond.while.end_crit_edge, %entry
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
index f87be5a..6ea227f 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;     a[i] = b[i] * 3;
 ; }
 
-define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp {
+define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64, !dbg [[DBG4:![0-9]+]]
@@ -58,7 +58,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]], !dbg [[DBG14:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 undef, !dbg [[DBG14]]
+; CHECK-NEXT:    ret void, !dbg [[DBG14]]
 ;
 ; FORCED_OPTSIZE-LABEL: @foo(
 ; FORCED_OPTSIZE-NEXT:  entry:
@@ -80,7 +80,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp
 ; FORCED_OPTSIZE:       for.end.loopexit:
 ; FORCED_OPTSIZE-NEXT:    br label [[FOR_END]], !dbg [[DBG10:![0-9]+]]
 ; FORCED_OPTSIZE:       for.end:
-; FORCED_OPTSIZE-NEXT:    ret i32 undef, !dbg [[DBG10]]
+; FORCED_OPTSIZE-NEXT:    ret void, !dbg [[DBG10]]
 ;
 entry:
   %cmp6 = icmp sgt i32 %n, 0, !dbg !6
@@ -99,7 +99,7 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond, label %for.end, label %for.body, !dbg !7
 
 for.end:                                          ; preds = %for.body, %entry
-  ret i32 undef, !dbg !8
+  ret void, !dbg !8
 }
 
 ; Make sure that we try to vectorize loops with a runtime check if the
@@ -505,11 +505,11 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[OFFSET_IDX]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[OUT]], i64 [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]]
-; CHECK-NEXT:    store i32 0, ptr [[IN]], align 4, !alias.scope [[META40]]
+; CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META36:![0-9]+]], !noalias [[META39:![0-9]+]]
+; CHECK-NEXT:    store i32 0, ptr [[IN]], align 4, !alias.scope [[META39]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -524,7 +524,7 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    store i32 0, ptr [[IN]], align 4
 ; CHECK-NEXT:    [[CMP7_NOT:%.*]] = icmp sgt i32 [[LEN]], [[IV_NEXT]]
-; CHECK-NEXT:    br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP43:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP42:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/write-only.ll b/llvm/test/Transforms/LoopVectorize/write-only.ll
index cc21b94..8df71e83 100644
--- a/llvm/test/Transforms/LoopVectorize/write-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/write-only.ll
@@ -4,8 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ;CHECK-LABEL: @read_mod_write_single_ptr(
 ;CHECK: load <4 x float>
-;CHECK: ret i32
-define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+;CHECK: ret void
+define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -21,14 +21,14 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
 
 ; Ensure that volatile stores are not vectorized.
 ; CHECK-LABEL: @read_mod_write_single_ptr_volatile_store(
 ; CHECK-NOT: store <4 x float>
-; CHECK: ret i32
-define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -44,5 +44,5 @@ define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) n
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/tools/llvm-objdump/ELF/Hexagon/packet-reset-on-label.s b/llvm/test/tools/llvm-objdump/ELF/Hexagon/packet-reset-on-label.s
new file mode 100644
index 0000000..02a52bb
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/Hexagon/packet-reset-on-label.s
@@ -0,0 +1,23 @@
+// RUN: llvm-mc -triple=hexagon -mcpu=hexagonv75 -filetype=obj %s \
+// RUN:   | llvm-objdump -d - \
+// RUN:   | FileCheck %s
+
+foo:
+  { nop }
+  /// a nop without end-of-packet bits set to simulate data that is
+  /// not a proper packet end.
+  .long 0x7f004000
+bar:
+  { nop
+    nop
+  }
+
+// CHECK-LABEL: <foo>:
+// CHECK: { nop }
+// CHECK-NEXT: { nop
+
+/// The instruction starting after <bar> should start in a new packet.
+// CHECK-LABEL: <bar>:
+// CHECK: { nop
+// CHECK-NEXT: nop }
+
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index b7f898f..79216e8 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -40,6 +40,7 @@
 #include "llvm/ExecutionEngine/Orc/SectCreate.h"
 #include "llvm/ExecutionEngine/Orc/SelfExecutorProcessControl.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.h"
@@ -312,10 +313,19 @@ static cl::opt<bool>
                                cl::desc("Show FailedToMaterialize errors"),
                                cl::init(false), cl::cat(JITLinkCategory));
 
-static cl::opt<bool> UseSharedMemory(
-    "use-shared-memory",
-    cl::desc("Use shared memory to transfer generated code and data"),
-    cl::init(false), cl::cat(JITLinkCategory));
+enum class MemMgr { Default, Generic, SimpleRemote, Shared };
+
+static cl::opt<MemMgr> UseMemMgr(
+    "use-memmgr", cl::desc("Choose memory manager"), cl::init(MemMgr::Generic),
+    cl::values(clEnumValN(MemMgr::Default, "default",
+                          "Use setup default (InProcess or EPCGeneric)"),
+               clEnumValN(MemMgr::Generic, "generic",
+                          "Generic remote memory manager"),
+               clEnumValN(MemMgr::SimpleRemote, "simple-remote",
+                          "Mapper memory manager with simple-remote backend"),
+               clEnumValN(MemMgr::Shared, "shared",
+                          "Mapper memory manager with shared-memory manager")),
+    cl::cat(JITLinkCategory));
 
 static cl::opt<std::string>
     OverrideTriple("triple", cl::desc("Override target triple detection"),
@@ -718,6 +728,27 @@ static std::unique_ptr<JITLinkMemoryManager> createInProcessMemoryManager() {
 }
 
 Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>
+createSimpleRemoteMemoryManager(SimpleRemoteEPC &SREPC) {
+  SimpleRemoteMemoryMapper::SymbolAddrs SAs;
+  if (auto Err = SREPC.getBootstrapSymbols(
+          {{SAs.Instance, rt::SimpleExecutorMemoryManagerInstanceName},
+           {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
+           {SAs.Initialize,
+            rt::SimpleExecutorMemoryManagerInitializeWrapperName},
+           {SAs.Deinitialize,
+            rt::SimpleExecutorMemoryManagerDeinitializeWrapperName},
+           {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}}))
+    return std::move(Err);
+#ifdef _WIN32
+  size_t SlabSize = 1024 * 1024;
+#else
+  size_t SlabSize = 1024 * 1024 * 1024;
+#endif
+  return MapperJITLinkMemoryManager::CreateWithMapper<SimpleRemoteMemoryMapper>(
+      SlabSize, SREPC, SAs);
+}
+
+Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>
 createSharedMemoryManager(SimpleRemoteEPC &SREPC) {
   SharedMemoryMapper::SymbolAddrs SAs;
   if (auto Err = SREPC.getBootstrapSymbols(
@@ -745,6 +776,19 @@ createSharedMemoryManager(SimpleRemoteEPC &SREPC) {
       SlabSize, SREPC, SAs);
 }
 
+static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) {
+  switch (UseMemMgr) {
+  case MemMgr::Default:
+  case MemMgr::Generic:
+    break;
+  case MemMgr::SimpleRemote:
+    S.CreateMemoryManager = createSimpleRemoteMemoryManager;
+    break;
+  case MemMgr::Shared:
+    S.CreateMemoryManager = createSharedMemoryManager;
+    break;
+  }
+}
 
 static Expected<MaterializationUnit::Interface>
 getTestObjectFileInterface(Session &S, MemoryBufferRef O) {
@@ -904,8 +948,7 @@ static Expected<std::unique_ptr<ExecutorProcessControl>> launchExecutor() {
   close(FromExecutor[WriteEnd]);
 
   auto S = SimpleRemoteEPC::Setup();
-  if (UseSharedMemory)
-    S.CreateMemoryManager = createSharedMemoryManager;
+  setupEPCRemoteMemoryManager(S);
 
   return SimpleRemoteEPC::Create<FDSimpleRemoteEPCTransport>(
       std::make_unique<DynamicThreadPoolTaskDispatcher>(MaterializationThreads),
@@ -994,8 +1037,7 @@ static Expected<std::unique_ptr<ExecutorProcessControl>> connectToExecutor() {
     return SockFD.takeError();
 
   auto S = SimpleRemoteEPC::Setup();
-  if (UseSharedMemory)
-    S.CreateMemoryManager = createSharedMemoryManager;
+  setupEPCRemoteMemoryManager(S);
 
   return SimpleRemoteEPC::Create<FDSimpleRemoteEPCTransport>(
       std::make_unique<DynamicThreadPoolTaskDispatcher>(std::nullopt),
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 46be539d..3ec644a 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -728,11 +728,17 @@ public:
     } while (!Comments.empty());
     FOS.flush();
   }
+
+  // Hook invoked when starting to disassemble a symbol at the current position.
+  // Default is no-op.
+  virtual void onSymbolStart() {}
 };
 PrettyPrinter PrettyPrinterInst;
 
 class HexagonPrettyPrinter : public PrettyPrinter {
 public:
+  void onSymbolStart() override { reset(); }
+
   void printLead(ArrayRef<uint8_t> Bytes, uint64_t Address,
                  formatted_raw_ostream &OS) {
     if (LeadingAddr)
@@ -2228,6 +2234,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
         Start += Size;
         break;
       }
+      // Allow targets to reset any per-symbol state.
+      DT->Printer->onSymbolStart();
       formatted_raw_ostream FOS(OS);
       Index = Start;
       if (SectionAddr < StartAddress)
diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
index 7340f56..04cd66c 100644
--- a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
@@ -420,12 +420,14 @@ TEST(LegalizerInfoTest, RuleSets) {
 
     // Raw type form
     LI.getActionDefinitionsBuilder(G_ADD)
-      .fewerElementsIf(typeIs(0, v4s32), changeElementCountTo(0, v2s32))
-      .fewerElementsIf(typeIs(0, v8s32), changeElementCountTo(0, s32))
-      .fewerElementsIf(typeIs(0, LLT::scalable_vector(4, 16)),
-                       changeElementCountTo(0, LLT::scalable_vector(2, 16)))
-      .fewerElementsIf(typeIs(0, LLT::scalable_vector(8, 16)),
-                       changeElementCountTo(0, s16));
+        .fewerElementsIf(typeIs(0, v4s32),
+                         changeElementCountTo(0, ElementCount::getFixed(2)))
+        .fewerElementsIf(typeIs(0, v8s32),
+                         changeElementCountTo(0, ElementCount::getFixed(1)))
+        .fewerElementsIf(typeIs(0, LLT::scalable_vector(4, s16)),
+                         changeElementCountTo(0, ElementCount::getScalable(2)))
+        .fewerElementsIf(typeIs(0, LLT::scalable_vector(8, s16)),
+                         changeElementCountTo(0, ElementCount::getFixed(1)));
 
     LegacyInfo.computeTables();
 
diff --git a/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp
index d4b45ea..2c6650d 100644
--- a/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp
@@ -39,8 +39,11 @@ public:
     return ExecutorAddr::fromPtr(MB.base());
   }
 
-  Error finalize(tpctypes::FinalizeRequest FR) {
+  Expected<ExecutorAddr> initialize(tpctypes::FinalizeRequest FR) {
+    assert(!FR.Segments.empty());
+    ExecutorAddr Base = FR.Segments[0].Addr;
     for (auto &Seg : FR.Segments) {
+      Base = std::min(Base, Seg.Addr);
       char *Mem = Seg.Addr.toPtr<char *>();
       memcpy(Mem, Seg.Content.data(), Seg.Content.size());
       memset(Mem + Seg.Content.size(), 0, Seg.Size - Seg.Content.size());
@@ -52,10 +55,10 @@ public:
       if ((Seg.RAG.Prot & MemProt::Exec) != MemProt::Exec)
         sys::Memory::InvalidateInstructionCache(Mem, Seg.Size);
     }
-    return Error::success();
+    return Base;
   }
 
-  Error deallocate(std::vector<ExecutorAddr> &Bases) {
+  Error release(std::vector<ExecutorAddr> &Bases) {
     Error Err = Error::success();
     for (auto &Base : Bases) {
       auto I = Blocks.find(Base.toPtr<void *>());
@@ -86,18 +89,18 @@ CWrapperFunctionResult testReserve(const char *ArgData, size_t ArgSize) {
           .release();
 }
 
-CWrapperFunctionResult testFinalize(const char *ArgData, size_t ArgSize) {
-  return WrapperFunction<rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>::
+CWrapperFunctionResult testInitialize(const char *ArgData, size_t ArgSize) {
+  return WrapperFunction<
+             rt::SPSSimpleExecutorMemoryManagerInitializeSignature>::
       handle(ArgData, ArgSize,
-             makeMethodWrapperHandler(&SimpleAllocator::finalize))
+             makeMethodWrapperHandler(&SimpleAllocator::initialize))
           .release();
 }
 
-CWrapperFunctionResult testDeallocate(const char *ArgData, size_t ArgSize) {
-  return WrapperFunction<
-             rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>::
+CWrapperFunctionResult testRelease(const char *ArgData, size_t ArgSize) {
+  return WrapperFunction<rt::SPSSimpleExecutorMemoryManagerReleaseSignature>::
       handle(ArgData, ArgSize,
-             makeMethodWrapperHandler(&SimpleAllocator::deallocate))
+             makeMethodWrapperHandler(&SimpleAllocator::release))
           .release();
 }
 
@@ -108,8 +111,8 @@ TEST(EPCGenericJITLinkMemoryManagerTest, AllocFinalizeFree) {
   EPCGenericJITLinkMemoryManager::SymbolAddrs SAs;
   SAs.Allocator = ExecutorAddr::fromPtr(&SA);
   SAs.Reserve = ExecutorAddr::fromPtr(&testReserve);
-  SAs.Finalize = ExecutorAddr::fromPtr(&testFinalize);
-  SAs.Deallocate = ExecutorAddr::fromPtr(&testDeallocate);
+  SAs.Initialize = ExecutorAddr::fromPtr(&testInitialize);
+  SAs.Release = ExecutorAddr::fromPtr(&testRelease);
 
   auto MemMgr = std::make_unique<EPCGenericJITLinkMemoryManager>(*SelfEPC, SAs);
   StringRef Hello = "hello";
diff --git a/llvm/unittests/ExecutionEngine/Orc/SimpleExecutorMemoryManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/SimpleExecutorMemoryManagerTest.cpp
index 6e9b0b2..9c6f19c 100644
--- a/llvm/unittests/ExecutionEngine/Orc/SimpleExecutorMemoryManagerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/SimpleExecutorMemoryManagerTest.cpp
@@ -34,12 +34,12 @@ TEST(SimpleExecutorMemoryManagerTest, AllocFinalizeFree) {
   SimpleExecutorMemoryManager MemMgr;
 
   constexpr unsigned AllocSize = 16384;
-  auto Mem = MemMgr.allocate(AllocSize);
+  auto Mem = MemMgr.reserve(AllocSize);
   EXPECT_THAT_ERROR(Mem.takeError(), Succeeded());
 
   std::string HW = "Hello, world!";
 
-  int FinalizeCounter = 0;
+  int InitializeCounter = 0;
   int DeallocateCounter = 0;
 
   tpctypes::FinalizeRequest FR;
@@ -52,27 +52,27 @@ TEST(SimpleExecutorMemoryManagerTest, AllocFinalizeFree) {
       {/* Finalize: */
        cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddr>>(
            ExecutorAddr::fromPtr(incrementWrapper),
-           ExecutorAddr::fromPtr(&FinalizeCounter))),
+           ExecutorAddr::fromPtr(&InitializeCounter))),
        /*  Deallocate: */
        cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddr>>(
            ExecutorAddr::fromPtr(incrementWrapper),
            ExecutorAddr::fromPtr(&DeallocateCounter)))});
 
-  EXPECT_EQ(FinalizeCounter, 0);
+  EXPECT_EQ(InitializeCounter, 0);
   EXPECT_EQ(DeallocateCounter, 0);
 
-  auto FinalizeErr = MemMgr.finalize(FR);
-  EXPECT_THAT_ERROR(std::move(FinalizeErr), Succeeded());
+  auto InitializeErr = MemMgr.initialize(FR);
+  EXPECT_THAT_EXPECTED(std::move(InitializeErr), Succeeded());
 
-  EXPECT_EQ(FinalizeCounter, 1);
+  EXPECT_EQ(InitializeCounter, 1);
   EXPECT_EQ(DeallocateCounter, 0);
 
   EXPECT_EQ(HW, std::string(Mem->toPtr<const char *>()));
 
-  auto DeallocateErr = MemMgr.deallocate({*Mem});
-  EXPECT_THAT_ERROR(std::move(DeallocateErr), Succeeded());
+  auto ReleaseErr = MemMgr.release({*Mem});
+  EXPECT_THAT_ERROR(std::move(ReleaseErr), Succeeded());
 
-  EXPECT_EQ(FinalizeCounter, 1);
+  EXPECT_EQ(InitializeCounter, 1);
   EXPECT_EQ(DeallocateCounter, 1);
 }
 
diff --git a/llvm/unittests/IR/RuntimeLibcallsTest.cpp b/llvm/unittests/IR/RuntimeLibcallsTest.cpp
index 26cb7e3..8925d2b 100644
--- a/llvm/unittests/IR/RuntimeLibcallsTest.cpp
+++ b/llvm/unittests/IR/RuntimeLibcallsTest.cpp
@@ -44,9 +44,9 @@ TEST(RuntimeLibcallsTest, LibcallImplByName) {
         RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("sqrtl");
     ASSERT_EQ(size(SquirtleSquad), 3);
     auto I = SquirtleSquad.begin();
-    EXPECT_EQ(*I++, RTLIB::impl_sqrt_f128);
-    EXPECT_EQ(*I++, RTLIB::impl_sqrt_f80);
-    EXPECT_EQ(*I++, RTLIB::impl_sqrt_ppcf128);
+    EXPECT_EQ(*I++, RTLIB::impl_sqrtl_f128);
+    EXPECT_EQ(*I++, RTLIB::impl_sqrtl_f80);
+    EXPECT_EQ(*I++, RTLIB::impl_sqrtl_ppcf128);
   }
 
   // Last libcall
@@ -54,9 +54,9 @@ TEST(RuntimeLibcallsTest, LibcallImplByName) {
     auto Truncs = RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("truncl");
     ASSERT_EQ(size(Truncs), 3);
     auto I = Truncs.begin();
-    EXPECT_EQ(*I++, RTLIB::impl_trunc_f128);
-    EXPECT_EQ(*I++, RTLIB::impl_trunc_f80);
-    EXPECT_EQ(*I++, RTLIB::impl_trunc_ppcf128);
+    EXPECT_EQ(*I++, RTLIB::impl_truncl_f128);
+    EXPECT_EQ(*I++, RTLIB::impl_truncl_f80);
+    EXPECT_EQ(*I++, RTLIB::impl_truncl_ppcf128);
   }
 }
 
diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index e9fd132..406a728 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -391,7 +391,7 @@ You can test this locally with the following command:
             return None
 
         # Use git to find files that have had a change in the number of undefs
-        regex = "([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)"
+        regex = "([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)"
         cmd = ["git", "diff", "-U0", "--pickaxe-regex", "-S", regex]
 
         if args.start_rev and args.end_rev:
diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
index 9b69a44..8438421 100644
--- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
@@ -68,6 +68,7 @@ static_library("Orc") {
     "SectCreate.cpp",
     "SelfExecutorProcessControl.cpp",
     "SimpleRemoteEPC.cpp",
+    "SimpleRemoteMemoryMapper.cpp",
     "SpeculateAnalyses.cpp",
     "Speculation.cpp",
     "TaskDispatch.cpp",
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TargetEnv.h b/mlir/include/mlir/Dialect/Tosa/IR/TargetEnv.h
index 10491f6..4ecf03c 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TargetEnv.h
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TargetEnv.h
@@ -50,28 +50,63 @@ TargetEnvAttr getDefaultTargetEnv(MLIRContext *context);
 /// returned by getDefaultTargetEnv() if not provided.
 TargetEnvAttr lookupTargetEnvOrDefault(Operation *op);
 
+/// A thin wrapper around the SpecificationVersion enum to represent
+/// and provide utilities around the TOSA specification version.
+class TosaSpecificationVersion {
+public:
+  TosaSpecificationVersion(uint32_t major, uint32_t minor)
+      : majorVersion(major), minorVersion(minor) {}
+  TosaSpecificationVersion(SpecificationVersion version)
+      : TosaSpecificationVersion(fromVersionEnum(version)) {}
+
+  bool isBackwardsCompatibleWith(TosaSpecificationVersion baseVersion) const {
+    return this->majorVersion == baseVersion.majorVersion &&
+           this->minorVersion >= baseVersion.minorVersion;
+  }
+
+  uint32_t getMajor() const { return majorVersion; }
+  uint32_t getMinor() const { return minorVersion; }
+
+private:
+  uint32_t majorVersion = 0;
+  uint32_t minorVersion = 0;
+
+  static TosaSpecificationVersion
+  fromVersionEnum(SpecificationVersion version) {
+    switch (version) {
+    case SpecificationVersion::V_1_0:
+      return TosaSpecificationVersion(1, 0);
+    case SpecificationVersion::V_1_1_DRAFT:
+      return TosaSpecificationVersion(1, 1);
+    }
+    llvm_unreachable("Unknown TOSA version");
+  }
+};
+
+llvm::SmallString<4> stringifyVersion(TosaSpecificationVersion version);
+
 /// This class represents the capability enabled in the target implementation
 /// such as profile, extension, and level. It's a wrapper class around
 /// tosa::TargetEnvAttr.
 class TargetEnv {
 public:
   TargetEnv() {}
-  explicit TargetEnv(Level level, const ArrayRef<Profile> &profiles,
+  explicit TargetEnv(SpecificationVersion specificationVersion, Level level,
+                     const ArrayRef<Profile> &profiles,
                      const ArrayRef<Extension> &extensions)
-      : level(level) {
+      : specificationVersion(specificationVersion), level(level) {
     enabledProfiles.insert_range(profiles);
     enabledExtensions.insert_range(extensions);
   }
 
   explicit TargetEnv(TargetEnvAttr targetAttr)
-      : TargetEnv(targetAttr.getLevel(), targetAttr.getProfiles(),
-                  targetAttr.getExtensions()) {}
+      : TargetEnv(targetAttr.getSpecificationVersion(), targetAttr.getLevel(),
+                  targetAttr.getProfiles(), targetAttr.getExtensions()) {}
 
   void addProfile(Profile p) { enabledProfiles.insert(p); }
   void addExtension(Extension e) { enabledExtensions.insert(e); }
 
-  // TODO implement the following utilities.
-  // Version getSpecVersion() const;
+  SpecificationVersion getSpecVersion() const { return specificationVersion; }
 
   TosaLevel getLevel() const {
     if (level == Level::eightK)
@@ -105,6 +140,7 @@ public:
   }
 
 private:
+  SpecificationVersion specificationVersion;
   Level level;
   llvm::SmallSet<Profile, 3> enabledProfiles;
   llvm::SmallSet<Extension, 13> enabledExtensions;
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
index 1f718ac..c1b5e78 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
@@ -2,441 +2,779 @@
 // `tools/genspec.py` in https://git.mlplatform.org/tosa/specification.git
 profileComplianceMap = {
     {"tosa.argmax",
-     {{{Profile::pro_int}, {{i8T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, i32T}, {fp32T, i32T}}}}},
+     {{{Profile::pro_int}, {{{i8T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, i32T}, SpecificationVersion::V_1_0},
+        {{fp32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.avg_pool2d",
-     {{{Profile::pro_int}, {{i8T, i8T, i8T, i32T, i8T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i8T, i32T, i8T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp32T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.conv2d",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T, i8T, i8T, i32T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T, i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.conv3d",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T, i8T, i8T, i32T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T, i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.depthwise_conv2d",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T, i8T, i8T, i32T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T, i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.matmul",
-     {{{Profile::pro_int}, {{i8T, i8T, i8T, i8T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i8T, i8T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp32T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.max_pool2d",
-     {{{Profile::pro_int}, {{i8T, i8T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T, i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.transpose_conv2d",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T, i8T, i8T, i32T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T, i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.clamp",
-     {{{Profile::pro_int}, {{i8T, i8T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.erf", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.sigmoid", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.tanh", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T, i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.erf",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sigmoid",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.tanh",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.add",
-     {{{Profile::pro_int, Profile::pro_fp}, {{i32T, i32T, i32T}}, anyOf},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.arithmetic_right_shift",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.bitwise_and",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.bitwise_or",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.bitwise_xor",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.intdiv",
-     {{{Profile::pro_int, Profile::pro_fp}, {{i32T, i32T, i32T}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.logical_and",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.logical_left_shift",
      {{{Profile::pro_int, Profile::pro_fp},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
        anyOf}}},
     {"tosa.logical_right_shift",
      {{{Profile::pro_int, Profile::pro_fp},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
        anyOf}}},
     {"tosa.logical_or",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.logical_xor",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.maximum",
-     {{{Profile::pro_int}, {{i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.minimum",
-     {{{Profile::pro_int}, {{i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.mul",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T}, {i16T, i16T, i32T}}},
-      {{Profile::pro_int, Profile::pro_fp}, {{i32T, i32T, i32T}}, anyOf},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_int, Profile::pro_fp},
+       {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.pow",
-     {{{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.sub",
-     {{{Profile::pro_int, Profile::pro_fp}, {{i32T, i32T, i32T}}, anyOf},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
-    {"tosa.table", {{{Profile::pro_int}, {{i8T, i8T, i8T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.table",
+     {{{Profile::pro_int}, {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.abs",
-     {{{Profile::pro_int}, {{i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.bitwise_not",
-     {{{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}}}},
-    {"tosa.ceil", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.clz", {{{Profile::pro_int}, {{i32T, i32T}}}}},
-    {"tosa.cos", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.exp", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.floor", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.log", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.ceil",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.clz",
+     {{{Profile::pro_int}, {{{i32T, i32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.cos",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.exp",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.floor",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.log",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.logical_not",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.negate",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T, i8T},
-        {i16T, i16T, i16T, i16T},
-        {i32T, i32T, i32T, i32T}}},
+       {{{i8T, i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reciprocal",
-     {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.rsqrt", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.sin", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.rsqrt",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sin",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.select",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
       {{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.equal",
-     {{{Profile::pro_int}, {{i32T, i32T, boolT}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, boolT}, {fp32T, fp32T, boolT}}}}},
+     {{{Profile::pro_int},
+       {{{i32T, i32T, boolT}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, boolT}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, boolT}, SpecificationVersion::V_1_0}}}}},
     {"tosa.greater",
-     {{{Profile::pro_int}, {{i32T, i32T, boolT}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, boolT}, {fp32T, fp32T, boolT}}}}},
+     {{{Profile::pro_int},
+       {{{i32T, i32T, boolT}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, boolT}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, boolT}, SpecificationVersion::V_1_0}}}}},
     {"tosa.greater_equal",
-     {{{Profile::pro_int}, {{i32T, i32T, boolT}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, boolT}, {fp32T, fp32T, boolT}}}}},
+     {{{Profile::pro_int},
+       {{{i32T, i32T, boolT}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, boolT}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, boolT}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reduce_all",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.reduce_any",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.reduce_max",
-     {{{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reduce_min",
-     {{{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reduce_product",
-     {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reduce_sum",
-     {{{Profile::pro_int}, {{i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.concat",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.pad",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
       {{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reshape",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reverse",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.slice",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.tile",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.transpose",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.gather",
      {{{Profile::pro_int},
-       {{i8T, i32T, i8T}, {i16T, i32T, i16T}, {i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, i32T, fp16T}, {fp32T, i32T, fp32T}}}}},
+       {{{i8T, i32T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i32T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, i32T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, i32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.scatter",
      {{{Profile::pro_int},
-       {{i8T, i32T, i8T, i8T},
-        {i16T, i32T, i16T, i16T},
-        {i32T, i32T, i32T, i32T}}},
+       {{{i8T, i32T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i32T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, i32T, fp16T, fp16T}, {fp32T, i32T, fp32T, fp32T}}}}},
+       {{{fp16T, i32T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, i32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.resize",
-     {{{Profile::pro_int}, {{i8T, i32T}, {i8T, i8T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i32T}, SpecificationVersion::V_1_0},
+        {{i8T, i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.cast",
      {{{Profile::pro_int},
-       {{boolT, i8T},
-        {boolT, i16T},
-        {boolT, i32T},
-        {i8T, boolT},
-        {i8T, i16T},
-        {i8T, i32T},
-        {i16T, boolT},
-        {i16T, i8T},
-        {i16T, i32T},
-        {i32T, boolT},
-        {i32T, i8T},
-        {i32T, i16T}}},
-      {{Profile::pro_fp},
-       {{i8T, fp16T},
-        {i8T, fp32T},
-        {i16T, fp16T},
-        {i16T, fp32T},
-        {i32T, fp16T},
-        {i32T, fp32T},
-        {fp16T, i8T},
-        {fp16T, i16T},
-        {fp16T, i32T},
-        {fp16T, fp32T},
-        {fp32T, i8T},
-        {fp32T, i16T},
-        {fp32T, i32T},
-        {fp32T, fp16T}}}}},
+       {{{boolT, i8T}, SpecificationVersion::V_1_0},
+        {{boolT, i16T}, SpecificationVersion::V_1_0},
+        {{boolT, i32T}, SpecificationVersion::V_1_0},
+        {{i8T, boolT}, SpecificationVersion::V_1_0},
+        {{i8T, i16T}, SpecificationVersion::V_1_0},
+        {{i8T, i32T}, SpecificationVersion::V_1_0},
+        {{i16T, boolT}, SpecificationVersion::V_1_0},
+        {{i16T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i32T}, SpecificationVersion::V_1_0},
+        {{i32T, boolT}, SpecificationVersion::V_1_0},
+        {{i32T, i8T}, SpecificationVersion::V_1_0},
+        {{i32T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{i8T, fp16T}, SpecificationVersion::V_1_0},
+        {{i8T, fp32T}, SpecificationVersion::V_1_0},
+        {{i16T, fp16T}, SpecificationVersion::V_1_0},
+        {{i16T, fp32T}, SpecificationVersion::V_1_0},
+        {{i32T, fp16T}, SpecificationVersion::V_1_0},
+        {{i32T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp16T, i8T}, SpecificationVersion::V_1_0},
+        {{fp16T, i16T}, SpecificationVersion::V_1_0},
+        {{fp16T, i32T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp32T, i8T}, SpecificationVersion::V_1_0},
+        {{fp32T, i16T}, SpecificationVersion::V_1_0},
+        {{fp32T, i32T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.rescale",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T, i8T},
-        {i8T, i8T, i16T, i16T},
-        {i8T, i8T, i32T, i32T},
-        {i16T, i16T, i8T, i8T},
-        {i16T, i16T, i16T, i16T},
-        {i16T, i16T, i32T, i32T},
-        {i32T, i32T, i8T, i8T},
-        {i32T, i32T, i16T, i16T},
-        {i32T, i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i8T, i8T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i32T, i32T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.const",
      {{{Profile::pro_int, Profile::pro_fp},
-       {{boolT}, {i8T}, {i16T}, {i32T}},
+       {{{boolT}, SpecificationVersion::V_1_0},
+        {{i8T}, SpecificationVersion::V_1_0},
+        {{i16T}, SpecificationVersion::V_1_0},
+        {{i32T}, SpecificationVersion::V_1_0}},
        anyOf},
-      {{Profile::pro_fp}, {{fp16T}, {fp32T}}}}},
+      {{Profile::pro_fp},
+       {{{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.identity",
      {{{Profile::pro_int, Profile::pro_fp},
-       {{boolT, boolT}, {i8T, i8T}, {i16T, i16T}, {i32T, i32T}},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0},
+        {{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}},
        anyOf},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable",
-     {{{Profile::pro_int}, {{i8T}}}, {{Profile::pro_fp}, {{fp16T}, {fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable_write",
-     {{{Profile::pro_int}, {{i8T}}}, {{Profile::pro_fp}, {{fp16T}, {fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable_read",
-     {{{Profile::pro_int}, {{i8T}}}, {{Profile::pro_fp}, {{fp16T}, {fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
 };
 
 extensionComplianceMap = {
     {"tosa.argmax",
-     {{{Extension::int16}, {{i16T, i32T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, i32T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, i32T}}},
-      {{Extension::bf16}, {{bf16T, i32T}}}}},
+     {{{Extension::int16}, {{{i16T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3}, {{{fp8e4m3T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2}, {{{fp8e5m2T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.avg_pool2d",
-     {{{Extension::int16}, {{i16T, i16T, i16T, i32T, i16T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
+     {{{Extension::int16},
+       {{{i16T, i16T, i16T, i32T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T},
+         SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T},
+         SpecificationVersion::V_1_0}}},
+      {{Extension::bf16},
+       {{{bf16T, bf16T, bf16T, fp32T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.conv2d",
-     {{{Extension::int4}, {{i8T, i4T, i32T, i8T, i4T, i32T, i32T}}},
-      {{Extension::int16}, {{i16T, i8T, i48T, i16T, i8T, i48T, i48T}}},
+     {{{Extension::int4},
+       {{{i8T, i4T, i32T, i8T, i4T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16},
+       {{{i16T, i8T, i48T, i16T, i8T, i48T, i48T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::bf16},
-       {{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
+       {{{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.conv3d",
-     {{{Extension::int4}, {{i8T, i4T, i32T, i8T, i4T, i32T, i32T}}},
-      {{Extension::int16}, {{i16T, i8T, i48T, i16T, i8T, i48T, i48T}}},
+     {{{Extension::int4},
+       {{{i8T, i4T, i32T, i8T, i4T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16},
+       {{{i16T, i8T, i48T, i16T, i8T, i48T, i48T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::bf16},
-       {{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
+       {{{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.depthwise_conv2d",
-     {{{Extension::int4}, {{i8T, i4T, i32T, i8T, i4T, i32T, i32T}}},
-      {{Extension::int16}, {{i16T, i8T, i48T, i16T, i8T, i48T, i48T}}},
+     {{{Extension::int4},
+       {{{i8T, i4T, i32T, i8T, i4T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16},
+       {{{i16T, i8T, i48T, i16T, i8T, i48T, i48T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::bf16},
-       {{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
-    {"tosa.fft2d", {{{Extension::fft}, {{fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T},
+         SpecificationVersion::V_1_0}}}}},
+    {"tosa.fft2d",
+     {{{Extension::fft},
+       {{{fp32T, fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.matmul",
-     {{{Extension::int16}, {{i16T, i16T, i16T, i16T, i48T}}},
+     {{{Extension::int16},
+       {{{i16T, i16T, i16T, i16T, i48T}, SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp8e4m3T, fp16T},
-        {fp8e4m3T, fp8e4m3T, fp8e4m3T, fp8e4m3T, fp32T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp8e4m3T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp8e4m3T, fp32T},
+         SpecificationVersion::V_1_1_DRAFT}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp8e5m2T, fp16T},
-        {fp8e5m2T, fp8e5m2T, fp8e5m2T, fp8e5m2T, fp32T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp8e5m2T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp8e5m2T, fp32T},
+         SpecificationVersion::V_1_1_DRAFT}}},
       {{Extension::fp8e4m3, Extension::fp8e5m2},
-       {{fp8e4m3T, fp8e5m2T, fp8e4m3T, fp8e5m2T, fp16T},
-        {fp8e4m3T, fp8e5m2T, fp8e4m3T, fp8e5m2T, fp32T},
-        {fp8e5m2T, fp8e4m3T, fp8e5m2T, fp8e4m3T, fp16T},
-        {fp8e5m2T, fp8e4m3T, fp8e5m2T, fp8e4m3T, fp32T}},
+       {{{fp8e4m3T, fp8e5m2T, fp8e4m3T, fp8e5m2T, fp16T},
+         SpecificationVersion::V_1_1_DRAFT},
+        {{fp8e4m3T, fp8e5m2T, fp8e4m3T, fp8e5m2T, fp32T},
+         SpecificationVersion::V_1_1_DRAFT},
+        {{fp8e5m2T, fp8e4m3T, fp8e5m2T, fp8e4m3T, fp16T},
+         SpecificationVersion::V_1_1_DRAFT},
+        {{fp8e5m2T, fp8e4m3T, fp8e5m2T, fp8e4m3T, fp32T},
+         SpecificationVersion::V_1_1_DRAFT}},
        allOf},
-      {{Extension::bf16}, {{bf16T, bf16T, bf16T, bf16T, fp32T}}}}},
+      {{Extension::bf16},
+       {{{bf16T, bf16T, bf16T, bf16T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.max_pool2d",
-     {{{Extension::int16}, {{i16T, i16T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.rfft2d", {{{Extension::fft}, {{fp32T, fp32T, fp32T}}}}},
+     {{{Extension::int16}, {{{i16T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.rfft2d",
+     {{{Extension::fft},
+       {{{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.transpose_conv2d",
-     {{{Extension::int4}, {{i8T, i4T, i32T, i8T, i4T, i32T, i32T}}},
-      {{Extension::int16}, {{i16T, i8T, i48T, i16T, i8T, i48T, i48T}}},
+     {{{Extension::int4},
+       {{{i8T, i4T, i32T, i8T, i4T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16},
+       {{{i16T, i8T, i48T, i16T, i8T, i48T, i48T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::bf16},
-       {{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
+       {{{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.clamp",
-     {{{Extension::int16}, {{i16T, i16T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.erf", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.sigmoid", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.tanh", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.add", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.maximum", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.minimum", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.mul", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.pow", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.sub", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.table", {{{Extension::int16}, {{i16T, i16T, i32T}}}}},
-    {"tosa.abs", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.ceil", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.cos", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.exp", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.floor", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.log", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.negate", {{{Extension::bf16}, {{bf16T, bf16T, bf16T, bf16T}}}}},
-    {"tosa.reciprocal", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.rsqrt", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.sin", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.select", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.equal", {{{Extension::bf16}, {{bf16T, bf16T, boolT}}}}},
-    {"tosa.greater", {{{Extension::bf16}, {{bf16T, bf16T, boolT}}}}},
-    {"tosa.greater_equal", {{{Extension::bf16}, {{bf16T, bf16T, boolT}}}}},
-    {"tosa.reduce_max", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.reduce_min", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.reduce_product", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.reduce_sum", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::int16}, {{{i16T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.erf",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sigmoid",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.tanh",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.add",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.maximum",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.minimum",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.mul",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.pow",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sub",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.table",
+     {{{Extension::int16},
+       {{{i16T, i16T, i32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.abs",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.ceil",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.cos",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.exp",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.floor",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.log",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.negate",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reciprocal",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.rsqrt",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sin",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.select",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.equal",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, boolT}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.greater",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, boolT}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.greater_equal",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, boolT}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reduce_max",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reduce_min",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reduce_product",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reduce_sum",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.concat",
-     {{{Extension::int16}, {{i16T, i16T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::int16}, {{{i16T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.pad",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reshape",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reverse",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.slice",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.tile",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.transpose",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.gather",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, i32T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, i32T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, i32T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, i32T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, i32T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16},
+       {{{bf16T, i32T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.scatter",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, i32T, fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, i32T, fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, i32T, bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, i32T, fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, i32T, fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16},
+       {{{bf16T, i32T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.resize",
-     {{{Extension::int16}, {{i16T, i48T}, {i16T, i16T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::int16},
+       {{{i16T, i48T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.cast",
      {{{Extension::bf16},
-       {{i8T, bf16T},
-        {i16T, bf16T},
-        {i32T, bf16T},
-        {bf16T, i8T},
-        {bf16T, i16T},
-        {bf16T, i32T},
-        {bf16T, fp32T},
-        {fp32T, bf16T}}},
+       {{{i8T, bf16T}, SpecificationVersion::V_1_0},
+        {{i16T, bf16T}, SpecificationVersion::V_1_0},
+        {{i32T, bf16T}, SpecificationVersion::V_1_0},
+        {{bf16T, i8T}, SpecificationVersion::V_1_0},
+        {{bf16T, i16T}, SpecificationVersion::V_1_0},
+        {{bf16T, i32T}, SpecificationVersion::V_1_0},
+        {{bf16T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp32T, bf16T}, SpecificationVersion::V_1_0}}},
       {{Extension::bf16, Extension::fp8e4m3},
-       {{bf16T, fp8e4m3T}, {fp8e4m3T, bf16T}},
+       {{{bf16T, fp8e4m3T}, SpecificationVersion::V_1_0},
+        {{fp8e4m3T, bf16T}, SpecificationVersion::V_1_0}},
        allOf},
       {{Extension::bf16, Extension::fp8e5m2},
-       {{bf16T, fp8e5m2T}, {fp8e5m2T, bf16T}},
+       {{{bf16T, fp8e5m2T}, SpecificationVersion::V_1_0},
+        {{fp8e5m2T, bf16T}, SpecificationVersion::V_1_0}},
        allOf},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp16T},
-        {fp8e4m3T, fp32T},
-        {fp16T, fp8e4m3T},
-        {fp32T, fp8e4m3T}}},
+       {{{fp8e4m3T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp8e4m3T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp8e4m3T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp16T},
-        {fp8e5m2T, fp32T},
-        {fp16T, fp8e5m2T},
-        {fp32T, fp8e5m2T}}}}},
+       {{{fp8e5m2T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp8e5m2T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp8e5m2T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp8e5m2T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.rescale",
      {{{Extension::int16},
-       {{i48T, i48T, i8T, i8T},
-        {i48T, i48T, i16T, i16T},
-        {i48T, i48T, i32T, i32T}}}}},
+       {{{i48T, i48T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i48T, i48T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i48T, i48T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.const",
-     {{{Extension::int4}, {{i4T}}},
-      {{Extension::int16}, {{i48T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T}}}}},
+     {{{Extension::int4}, {{{i4T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16}, {{{i48T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3}, {{{fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2}, {{{fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.identity",
-     {{{Extension::int4}, {{i4T, i4T}}},
-      {{Extension::int16}, {{i48T, i48T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.variable", {{{Extension::variable}, {{i8T}, {fp16T}, {fp32T}}}}},
+     {{{Extension::int4}, {{{i4T, i4T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16}, {{{i48T, i48T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.variable",
+     {{{Extension::variable},
+       {{{i8T}, SpecificationVersion::V_1_0},
+        {{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable_write",
-     {{{Extension::variable}, {{i8T}, {fp16T}, {fp32T}}}}},
+     {{{Extension::variable},
+       {{{i8T}, SpecificationVersion::V_1_0},
+        {{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable_read",
-     {{{Extension::variable}, {{i8T}, {fp16T}, {fp32T}}}}},
+     {{{Extension::variable},
+       {{{i8T}, SpecificationVersion::V_1_0},
+        {{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
 };
+
 // End of auto-generated metadata
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
index 38cb293..8376a4c 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
@@ -221,7 +221,7 @@ class Tosa_I32EnumAttr<string name, string description, string mnemonic,
 }
 
 //===----------------------------------------------------------------------===//
-// TOSA Spec Section 1.5.
+// TOSA Profiles and extensions
 //
 // Profile:
 // INT : Integer Inference. Integer operations, primarily 8 and 32-bit values.
@@ -293,12 +293,6 @@ def Tosa_ExtensionAttr
 def Tosa_ExtensionArrayAttr
     : TypedArrayAttrBase<Tosa_ExtensionAttr, "TOSA extension array attribute">;
 
-def Tosa_LVL_NONE : I32EnumAttrCase<"none", 0>;
-def Tosa_LVL_8K   : I32EnumAttrCase<"eightK", 1, "8k">;
-
-def Tosa_LevelAttr
-    : Tosa_I32EnumAttr<"Level", "supported TOSA levels", "level", [Tosa_LVL_NONE, Tosa_LVL_8K]>;
-
 // The base class for defining op availability dimensions.
 class Availability {
   // The following are fields for controlling the generated C++ OpInterface.
@@ -405,17 +399,40 @@ class Extension<list<I32EnumAttrCase> extensions> : Availability {
 }
 
 //===----------------------------------------------------------------------===//
+// TOSA Levels
+//===----------------------------------------------------------------------===//
+
+def Tosa_LVL_NONE : I32EnumAttrCase<"none", 0>;
+def Tosa_LVL_8K   : I32EnumAttrCase<"eightK", 1, "8k">;
+
+def Tosa_LevelAttr
+    : Tosa_I32EnumAttr<"Level", "supported TOSA levels", "level", [Tosa_LVL_NONE, Tosa_LVL_8K]>;
+
+//===----------------------------------------------------------------------===//
+// TOSA Specification versions
+//===----------------------------------------------------------------------===//
+
+def Tosa_V_1_0 : I32EnumAttrCase<"V_1_0", 0, "1.0">;
+def Tosa_V_1_1_DRAFT : I32EnumAttrCase<"V_1_1_DRAFT", 1, "1.1.draft">;
+
+def Tosa_SpecificationVersion : Tosa_I32EnumAttr<
+      "SpecificationVersion", "TOSA specification version", "specification_version",
+      [Tosa_V_1_0, Tosa_V_1_1_DRAFT]>;
+
+//===----------------------------------------------------------------------===//
 // TOSA target environment.
 //===----------------------------------------------------------------------===//
 def Tosa_TargetEnv : Tosa_Attr<"TargetEnv", "target_env"> {
   let summary = "Target environment information.";
   let parameters = ( ins
+    "SpecificationVersion": $specification_version,
     "Level": $level,
     ArrayRefParameter<"Profile">: $profiles,
     ArrayRefParameter<"Extension">: $extensions
   );
 
-  let assemblyFormat = "`<` `level` `=` $level `,` `profiles` `=` `[` $profiles `]` `,` "
+  let assemblyFormat = "`<` `specification_version` `=` $specification_version `,` "
+                       "`level` `=` $level `,` `profiles` `=` `[` $profiles `]` `,` "
                        "`extensions` `=` `[` $extensions `]` `>`";
 }
 
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
index 8f5c72b..7b946ad 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
@@ -36,12 +36,15 @@ enum CheckCondition {
   allOf
 };
 
+using VersionedTypeInfo =
+    std::pair<SmallVector<TypeInfo>, SpecificationVersion>;
+
 template <typename T>
 struct OpComplianceInfo {
   // Certain operations require multiple modes enabled.
   // e.g. cast bf16 to fp8e4m3 requires EXT-BF16 and EXT-FP8E4M3.
   SmallVector<T> mode;
-  SmallVector<SmallVector<TypeInfo>> operandTypeInfoSet;
+  SmallVector<VersionedTypeInfo> operandTypeInfoSet;
   CheckCondition condition = CheckCondition::anyOf;
 };
 
@@ -130,9 +133,8 @@ public:
   // Find the required profiles or extensions from the compliance info according
   // to the operand type combination.
   template <typename T>
-  SmallVector<T> findMatchedProfile(Operation *op,
-                                    SmallVector<OpComplianceInfo<T>> compInfo,
-                                    CheckCondition &condition);
+  OpComplianceInfo<T>
+  findMatchedEntry(Operation *op, SmallVector<OpComplianceInfo<T>> compInfo);
 
   SmallVector<Profile> getCooperativeProfiles(Extension ext) {
     switch (ext) {
@@ -168,8 +170,7 @@ public:
 
 private:
   template <typename T>
-  FailureOr<SmallVector<T>> getOperatorDefinition(Operation *op,
-                                                  CheckCondition &condition);
+  FailureOr<OpComplianceInfo<T>> getOperatorDefinition(Operation *op);
 
   OperationProfileComplianceMap profileComplianceMap;
   OperationExtensionComplianceMap extensionComplianceMap;
diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
index 6ae19d8..14b00b0 100644
--- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
@@ -137,6 +137,13 @@ def TosaAttachTarget : Pass<"tosa-attach-target", "ModuleOp"> {
   ];
 
   let options = [
+    Option<"specificationVersion", "specification_version", "mlir::tosa::SpecificationVersion",
+              /*default=*/"mlir::tosa::SpecificationVersion::V_1_0",
+              "The specification version that TOSA operators should conform to.",
+              [{::llvm::cl::values(
+                clEnumValN(mlir::tosa::SpecificationVersion::V_1_0, "1.0", "TOSA Specification version 1.0"),
+                clEnumValN(mlir::tosa::SpecificationVersion::V_1_1_DRAFT, "1.1.draft", "TOSA Specification version 1.1.draft")
+              )}]>,
     Option<"level", "level", "mlir::tosa::Level",
               /*default=*/"mlir::tosa::Level::eightK",
               "The TOSA level that operators should conform to. A TOSA level defines "
diff --git a/mlir/include/mlir/TableGen/CodeGenHelpers.h b/mlir/include/mlir/TableGen/CodeGenHelpers.h
index 252da21..997aef2 100644
--- a/mlir/include/mlir/TableGen/CodeGenHelpers.h
+++ b/mlir/include/mlir/TableGen/CodeGenHelpers.h
@@ -88,7 +88,7 @@ public:
   ///
   /// Constraints that do not meet the restriction that they can only reference
   /// `$_self` and `$_op` are not uniqued.
-  void emitOpConstraints(ArrayRef<const llvm::Record *> opDefs);
+  void emitOpConstraints();
 
   /// Unique all compatible type and attribute constraints from a pattern file
   /// and emit them at the top of the generated file.
diff --git a/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp b/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp
index 5aad671..1cba1bb 100644
--- a/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Tosa/IR/TargetEnv.h"
+#include "llvm/Support/FormatVariadic.h"
 
 namespace mlir {
 namespace tosa {
@@ -27,7 +28,7 @@ TargetEnvAttr lookupTargetEnv(Operation *op) {
 }
 
 TargetEnvAttr getDefaultTargetEnv(MLIRContext *context) {
-  return TargetEnvAttr::get(context, Level::eightK,
+  return TargetEnvAttr::get(context, SpecificationVersion::V_1_0, Level::eightK,
                             {Profile::pro_int, Profile::pro_fp}, {});
 }
 
@@ -38,5 +39,9 @@ TargetEnvAttr lookupTargetEnvOrDefault(Operation *op) {
   return getDefaultTargetEnv(op->getContext());
 }
 
+llvm::SmallString<4> stringifyVersion(TosaSpecificationVersion version) {
+  return llvm::formatv("{0}.{1}", version.getMajor(), version.getMinor());
+}
+
 } // namespace tosa
 } // namespace mlir
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp
index bcb880a..a0661e4 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp
@@ -61,8 +61,8 @@ public:
 
     ModuleOp mod = getOperation();
     MLIRContext *ctx = &getContext();
-    const auto targetEnvAttr =
-        TargetEnvAttr::get(ctx, level, selectedProfiles, selectedExtensions);
+    const auto targetEnvAttr = TargetEnvAttr::get(
+        ctx, specificationVersion, level, selectedProfiles, selectedExtensions);
     mod->setAttr(TargetEnvAttr::name, targetEnvAttr);
   }
 
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
index 20f9333..f072e3e 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
@@ -335,16 +335,15 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) {
 //===----------------------------------------------------------------------===//
 
 template <typename T>
-FailureOr<SmallVector<T>>
-TosaProfileCompliance::getOperatorDefinition(Operation *op,
-                                             CheckCondition &condition) {
+FailureOr<OpComplianceInfo<T>>
+TosaProfileCompliance::getOperatorDefinition(Operation *op) {
   const std::string opName = op->getName().getStringRef().str();
   const auto complianceMap = getProfileComplianceMap<T>();
   const auto it = complianceMap.find(opName);
   if (it == complianceMap.end())
     return {};
 
-  return findMatchedProfile<T>(op, it->second, condition);
+  return findMatchedEntry<T>(op, it->second);
 }
 
 template <typename T>
@@ -356,22 +355,21 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension(
   if (specRequiredModeSet.size() == 0)
     return success();
 
-  CheckCondition condition = CheckCondition::invalid;
-  const auto maybeOpRequiredMode = getOperatorDefinition<T>(op, condition);
-  if (failed(maybeOpRequiredMode)) {
+  const auto maybeOpDefinition = getOperatorDefinition<T>(op);
+  if (failed(maybeOpDefinition)) {
     // Operators such as control-flow and shape ops do not have an operand type
     // restriction. When the profile compliance information of operation is not
     // found, confirm if the target have enabled the profile required from the
     // specification.
-    int mode_count = 0;
+    int modeCount = 0;
     for (const auto &cands : specRequiredModeSet) {
       if (targetEnv.allowsAnyOf(cands))
         return success();
-      mode_count += cands.size();
+      modeCount += cands.size();
     }
 
     op->emitOpError() << "illegal: requires"
-                      << (mode_count > 1 ? " any of " : " ") << "["
+                      << (modeCount > 1 ? " any of " : " ") << "["
                       << llvm::join(stringifyProfile<T>(specRequiredModeSet),
                                     ", ")
                       << "] but not enabled in target\n";
@@ -381,7 +379,10 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension(
 
   // Find the required profiles or extensions according to the operand type
   // combination.
-  const auto opRequiredMode = maybeOpRequiredMode.value();
+  const auto opDefinition = maybeOpDefinition.value();
+  const SmallVector<T> opRequiredMode = opDefinition.mode;
+  const CheckCondition condition = opDefinition.condition;
+
   if (opRequiredMode.size() == 0) {
     // No matched restriction found.
     return success();
@@ -437,6 +438,21 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension(
     }
   }
 
+  // Ensure the matched op compliance version does not exceed the target
+  // specification version.
+  const VersionedTypeInfo versionedTypeInfo =
+      opDefinition.operandTypeInfoSet[0];
+  const TosaSpecificationVersion complianceVersion{versionedTypeInfo.second};
+  const TosaSpecificationVersion targetVersion{targetEnv.getSpecVersion()};
+  if (!targetVersion.isBackwardsCompatibleWith(complianceVersion)) {
+    op->emitOpError() << "illegal: the target specification version ("
+                      << stringifyVersion(targetVersion)
+                      << ") is not backwards compatible with the op compliance "
+                         "specification version ("
+                      << stringifyVersion(complianceVersion) << ")\n";
+    return failure();
+  }
+
   return success();
 }
 
@@ -461,14 +477,14 @@ TosaProfileCompliance::checkExtension(Operation *op,
 }
 
 LogicalResult TosaProfileCompliance::checkInvalid(Operation *op) {
-  CheckCondition condition = CheckCondition::invalid;
-  const auto maybeProfDef = getOperatorDefinition<Profile>(op, condition);
-  const auto maybeExtDef = getOperatorDefinition<Extension>(op, condition);
+  const auto maybeProfDef = getOperatorDefinition<Profile>(op);
+  const auto maybeExtDef = getOperatorDefinition<Extension>(op);
   if (failed(maybeProfDef) && failed(maybeExtDef))
     return success();
 
-  const bool hasEntry = (succeeded(maybeProfDef) && !maybeProfDef->empty()) ||
-                        (succeeded(maybeExtDef) && !maybeExtDef->empty());
+  const bool hasEntry =
+      (succeeded(maybeProfDef) && !maybeProfDef->mode.empty()) ||
+      (succeeded(maybeExtDef) && !maybeExtDef->mode.empty());
   if (!hasEntry) {
     std::string message;
     llvm::raw_string_ostream os(message);
@@ -488,7 +504,9 @@ LogicalResult TosaProfileCompliance::checkInvalid(Operation *op) {
     SmallVector<TypeInfo> bestTypeInfo;
     const auto searchBestMatch = [&](auto map) {
       for (const auto &complianceInfos : map[opName]) {
-        for (const auto &typeInfos : complianceInfos.operandTypeInfoSet) {
+        for (const auto &versionedTypeInfos :
+             complianceInfos.operandTypeInfoSet) {
+          const SmallVector<TypeInfo> typeInfos = versionedTypeInfos.first;
           const int matches = llvm::count_if(
               llvm::zip_equal(current, typeInfos), [&](const auto zipType) {
                 return isSameTypeInfo(std::get<0>(zipType),
@@ -520,9 +538,8 @@ LogicalResult TosaProfileCompliance::checkInvalid(Operation *op) {
 // Find the profiles or extensions requirement according to the signature of
 // type of the operand list.
 template <typename T>
-SmallVector<T> TosaProfileCompliance::findMatchedProfile(
-    Operation *op, SmallVector<OpComplianceInfo<T>> compInfo,
-    CheckCondition &condition) {
+OpComplianceInfo<T> TosaProfileCompliance::findMatchedEntry(
+    Operation *op, SmallVector<OpComplianceInfo<T>> compInfo) {
   assert(compInfo.size() != 0 &&
          "profile-based compliance information is empty");
 
@@ -533,27 +550,30 @@ SmallVector<T> TosaProfileCompliance::findMatchedProfile(
     return {};
 
   for (size_t i = 0; i < compInfo.size(); i++) {
-    SmallVector<SmallVector<TypeInfo>> sets = compInfo[i].operandTypeInfoSet;
-    for (SmallVector<TypeInfo> expected : sets) {
+    SmallVector<VersionedTypeInfo> sets = compInfo[i].operandTypeInfoSet;
+    for (const auto &set : sets) {
+      SmallVector<TypeInfo> expected = set.first;
       assert(present.size() == expected.size() &&
              "the entries for profile-based compliance do not match between "
              "the generated metadata and the type definition retrieved from "
              " the operation");
 
-      bool is_found = true;
+      bool isFound = true;
       // Compare the type signature between the given operation and the
       // compliance metadata.
       for (size_t j = 0; j < expected.size(); j++) {
         if (!isSameTypeInfo(present[j], expected[j])) {
           // Verify the next mode set from the list.
-          is_found = false;
+          isFound = false;
           break;
         }
       }
 
-      if (is_found == true) {
-        condition = compInfo[i].condition;
-        return compInfo[i].mode;
+      if (isFound == true) {
+        SmallVector<VersionedTypeInfo> typeInfoSet{set};
+        OpComplianceInfo<T> info{compInfo[i].mode, typeInfoSet,
+                                 compInfo[i].condition};
+        return info;
       }
     }
   }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 81b5788..e0a8ac4 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -20,8 +20,8 @@
 
 #define DEBUG_TYPE "xegpu"
 
-namespace mlir {
-namespace xegpu {
+using namespace mlir;
+using namespace mlir::xegpu;
 
 static bool isSharedMemory(const MemRefType &memrefTy) {
   Attribute attr = memrefTy.getMemorySpace();
@@ -1133,9 +1133,6 @@ LogicalResult MemDescSubviewOp::verify() {
   return success();
 }
 
-} // namespace xegpu
-} // namespace mlir
-
 namespace mlir {
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.cpp.inc>
 } // namespace mlir
diff --git a/mlir/lib/TableGen/CodeGenHelpers.cpp b/mlir/lib/TableGen/CodeGenHelpers.cpp
index cb90ef8..d52d5e7 100644
--- a/mlir/lib/TableGen/CodeGenHelpers.cpp
+++ b/mlir/lib/TableGen/CodeGenHelpers.cpp
@@ -49,9 +49,7 @@ StaticVerifierFunctionEmitter::StaticVerifierFunctionEmitter(
     raw_ostream &os, const RecordKeeper &records, StringRef tag)
     : os(os), uniqueOutputLabel(getUniqueOutputLabel(records, tag)) {}
 
-void StaticVerifierFunctionEmitter::emitOpConstraints(
-    ArrayRef<const Record *> opDefs) {
-  NamespaceEmitter namespaceEmitter(os, Operator(*opDefs[0]).getCppNamespace());
+void StaticVerifierFunctionEmitter::emitOpConstraints() {
   emitTypeConstraints();
   emitAttrConstraints();
   emitPropConstraints();
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
index 35f520a..93a0336 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
@@ -1,5 +1,9 @@
 // RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.dot
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: contraction_dot
 func.func @contraction_dot(%A: memref<1584xf32>, %B: memref<1584xf32>, %C: memref<f32>) {
 
@@ -20,6 +24,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.matvec
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: contraction_matvec
 func.func @contraction_matvec(%A: memref<1584x1584xf32>, %B: memref<1584xf32>, %C: memref<1584xf32>) {
 
@@ -41,6 +49,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.matmul
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: contraction_matmul
 func.func @contraction_matmul(%A: memref<1584x1584xf32>, %B: memref<1584x1584xf32>, %C: memref<1584x1584xf32>) {
 // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<1584x1584x1584xf32>
@@ -138,6 +150,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.batch_matmul
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: contraction_batch_matmul
 func.func @contraction_batch_matmul(%A: memref<1584x1584x1584xf32>, %B: memref<1584x1584x1584xf32>, %C: memref<1584x1584x1584xf32>) {
 // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<1584x1584x1584x1584xf32>
@@ -159,6 +175,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.cantract
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: @matmul_as_contract
 // CHECK-SAME: %[[A:.*]]: tensor<24x12xf32>
 // CHECK-SAME: %[[B:.*]]: tensor<12x25xf32>
@@ -220,6 +240,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.fill
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: func @test_vectorize_fill
 func.func @test_vectorize_fill(%A : memref<8x16xf32>, %arg0 : f32) {
   //       CHECK: %[[V:.*]] = vector.broadcast {{.*}} : f32 to vector<8x16xf32>
@@ -259,70 +283,14 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-LABEL: func @test_vectorize_copy
-func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) {
-  //       CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32>
-  //       CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
-  memref.copy %A, %B :  memref<8x16xf32> to memref<8x16xf32>
-  return
-}
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.pack
+///----------------------------------------------------------------------------------------
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
+// Note, see a similar test in:
+//  * vectorization.mlir.
 
-// -----
-
-// CHECK-LABEL: func @test_vectorize_copy_0d
-func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) {
-  //  CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>)
-  //       CHECK:   %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32>
-  //       CHECK:   %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32>
-  //       CHECK:   %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
-  //       CHECK:   vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32>
-  memref.copy %A, %B :  memref<f32> to memref<f32>
-  return
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-// CHECK-LABEL: func @test_vectorize_copy_complex
-// CHECK-NOT: vector<
-func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) {
-  memref.copy %A, %B :  memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>>
-  return
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-// Input identical as the test in vectorization.mlir. Output is different -
-// vector sizes are inferred (rather than user-specified) and hence _no_
-// masking was used.
-
-func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
+func.func @pack_no_padding(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
   %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
   return %pack : tensor<4x1x32x16x2xf32>
 }
@@ -336,7 +304,7 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
-// CHECK-LABEL:   func.func @test_vectorize_pack(
+// CHECK-LABEL:   func.func @pack_no_padding(
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<32x8x16xf32>,
 // CHECK-SAME:      %[[VAL_1:.*]]: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
 // CHECK-DAG:       %[[VAL_2:.*]] = ub.poison : f32
@@ -349,13 +317,16 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+// Note, see a similar test in:
+//  * vectorization.mlir.
+
+func.func @pack_with_padding(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
   %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
 }
 
-// CHECK-LABEL:   func.func @test_vectorize_padded_pack(
+// CHECK-LABEL:   func.func @pack_with_padding(
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<32x7x15xf32>,
 // CHECK-SAME:      %[[VAL_1:.*]]: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
 // CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
@@ -377,6 +348,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.map
+///----------------------------------------------------------------------------------------
+
 func.func @vectorize_map(%arg0: memref<64xf32>,
     %arg1: memref<64xf32>, %arg2: memref<64xf32>) {
   linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>)
@@ -403,6 +378,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.transpose
+///----------------------------------------------------------------------------------------
+
 func.func @vectorize_transpose(%arg0: memref<16x32x64xf32>,
                                %arg1: memref<32x64x16xf32>) {
   linalg.transpose ins(%arg0 : memref<16x32x64xf32>)
@@ -424,6 +403,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.reduce
+///----------------------------------------------------------------------------------------
+
 func.func @vectorize_reduce(%arg0: memref<16x32x64xf32>,
                   %arg1: memref<16x64xf32>) {
   linalg.reduce ins(%arg0 : memref<16x32x64xf32>)
@@ -449,6 +432,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.generic
+///----------------------------------------------------------------------------------------
+
 #matmul_trait = {
   indexing_maps = [
     affine_map<(m, n, k) -> (m, k)>,
@@ -1446,6 +1433,8 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// TODO: Two Linalg Ops in one tests - either split or document "why".
+
 // CHECK-DAG: #[[$M6:.*]] = affine_map<(d0, d1) -> (d0, 0)>
 
 // CHECK-LABEL:   func @fused_broadcast_red_2d
@@ -1896,3 +1885,65 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
+// -----
+
+///----------------------------------------------------------------------------------------
+/// Tests for memref.copy
+///----------------------------------------------------------------------------------------
+
+// CHECK-LABEL: func @test_vectorize_copy
+func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) {
+  //       CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32>
+  //       CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+  memref.copy %A, %B :  memref<8x16xf32> to memref<8x16xf32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_copy_0d
+func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) {
+  //  CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>)
+  //       CHECK:   %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32>
+  //       CHECK:   %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32>
+  //       CHECK:   %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
+  //       CHECK:   vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32>
+  memref.copy %A, %B :  memref<f32> to memref<f32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_copy_complex
+// CHECK-NOT: vector<
+func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) {
+  memref.copy %A, %B :  memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index 11bea8d..1304a90 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -1307,14 +1307,17 @@ func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf
 /// Tests for linalg.pack
 ///----------------------------------------------------------------------------------------
 
-// Input identical as the test in vectorization-with-patterns.mlir. Output is
-// different - vector sizes are inferred (rather than user-specified) and hence
-// masking was used.
+// This packing requires no padding, so no out-of-bounds read/write vector Ops.
 
-// CHECK-LABEL: func @test_vectorize_pack
+// Note, see a similar test in:
+//  * vectorization-with-patterns.mlir
+// The output is identical (the input vector sizes == the inferred vector
+// sizes, i.e. the tensor sizes).
+
+// CHECK-LABEL: func @pack_no_padding
 // CHECK-SAME:      %[[SRC:.*]]: tensor<32x8x16xf32>,
 // CHECK-SAME:      %[[DEST:.*]]: tensor<4x1x32x16x2xf32>
-func.func @test_vectorize_pack(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
+func.func @pack_no_padding(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
   %pack = linalg.pack %src outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
   return %pack : tensor<4x1x32x16x2xf32>
 }
@@ -1325,9 +1328,9 @@ func.func @test_vectorize_pack(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x1
 //      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
 //      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32>
 //  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32>
-//      CHECK: return %[[write]] : tensor<4x1x32x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<4x1x32x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%src: !transform.any_op {transform.readonly}) {
@@ -1339,14 +1342,18 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// Input identical as the test in vectorization-with-patterns.mlir. Output is
-// different - vector sizes are inferred (rather than user-specified) and hence
-// masking was used.
+// This packing does require padding, so there are out-of-bounds read/write
+// vector Ops.
+
+// Note, see a similar test in:
+//  * vectorization-with-patterns.mlir.
+// The output is different (the input vector sizes != inferred vector sizes,
+// i.e. the tensor sizes).
 
-// CHECK-LABEL: func @test_vectorize_padded_pack
+// CHECK-LABEL: func @pack_with_padding
 // CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
 // CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
-func.func @test_vectorize_padded_pack(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+func.func @pack_with_padding(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
   %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
@@ -1364,9 +1371,9 @@ func.func @test_vectorize_padded_pack(%src: tensor<32x7x15xf32>, %dest: tensor<3
 //      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
 //      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
 //  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
-//      CHECK: return %[[write]] : tensor<32x4x1x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
@@ -1378,10 +1385,46 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-LABEL: func @test_vectorize_dynamic_pack
+// This packing does require padding, so there are out-of-bounds read/write
+// vector Ops.
+
+// Note, see a similar test in:
+//  * vectorization-with-patterns.mlir.
+// The output is identical (in both cases the vector sizes are inferred).
+
+// CHECK-LABEL: func @pack_with_padding_no_vector_sizes
+// CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
+func.func @pack_with_padding_no_vector_sizes(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+  %pad = arith.constant 0.000000e+00 : f32
+  %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+  return %pack : tensor<32x4x1x16x2xf32>
+}
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//      CHECK: %[[READ:.*]] =  vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
+// CHECK-SAME:   {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
+// CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @pack_with_dynamic_dims
 // CHECK-SAME:      %[[SRC:.*]]: tensor<?x?xf32>,
 // CHECK-SAME:      %[[DEST:.*]]: tensor<?x?x16x2xf32>
-func.func @test_vectorize_dynamic_pack(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> {
+func.func @pack_with_dynamic_dims(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> {
   %pack = linalg.pack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?xf32> -> tensor<?x?x16x2xf32>
   return %pack : tensor<?x?x16x2xf32>
 }
@@ -1418,64 +1461,6 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
-// -----
-
-// CHECK-LABEL: func @test_vectorize_pack_no_vector_sizes
-// CHECK-SAME:      %[[SRC:.*]]: tensor<64x4xf32>,
-// CHECK-SAME:      %[[DEST:.*]]: tensor<2x4x16x2xf32>
-func.func @test_vectorize_pack_no_vector_sizes(%src: tensor<64x4xf32>, %dest: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
-  %pack = linalg.pack %src outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %dest : tensor<64x4xf32> -> tensor<2x4x16x2xf32>
-  return %pack : tensor<2x4x16x2xf32>
-}
-//  CHECK-DAG: %[[CST:.*]] = ub.poison : f32
-//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-//      CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST]]
-// CHECK-SAME:    {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
-//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<64x4xf32> to vector<4x16x2x2xf32>
-//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
-//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
-// CHECK-SAME:   {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
-//      CHECK: return %[[WRITE]] : tensor<2x4x16x2xf32>
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 : !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-// CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes
-// CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
-// CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
-func.func @test_vectorize_padded_pack_no_vector_sizes(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
-  %pad = arith.constant 0.000000e+00 : f32
-  %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
-  return %pack : tensor<32x4x1x16x2xf32>
-}
-//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-//      CHECK: %[[READ:.*]] =  vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
-// CHECK-SAME:   {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
-//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
-//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
-// CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
-//      CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32>
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 : !transform.any_op
-    transform.yield
-  }
-}
-
-
 ///----------------------------------------------------------------------------------------
 /// Tests for other Ops
 ///----------------------------------------------------------------------------------------
diff --git a/mlir/test/Dialect/Tosa/tosa-attach-target.mlir b/mlir/test/Dialect/Tosa/tosa-attach-target.mlir
index d6c886c..a0c59c0 100644
--- a/mlir/test/Dialect/Tosa/tosa-attach-target.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-attach-target.mlir
@@ -1,12 +1,14 @@
 // RUN: mlir-opt %s -split-input-file -tosa-attach-target="profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround,dynamic level=none" | FileCheck %s --check-prefix=CHECK-ALL
 // RUN: mlir-opt %s -split-input-file -tosa-attach-target="level=8k" | FileCheck %s --check-prefix=CHECK-LVL-8K
 // RUN: mlir-opt %s -split-input-file -tosa-attach-target | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: mlir-opt %s -split-input-file -tosa-attach-target="specification_version=1.1.draft" | FileCheck %s --check-prefix=CHECK-VERSION-1P1
 
 // -----
 
-// CHECK-ALL: module attributes {tosa.target_env = #tosa.target_env<level = none, profiles = [pro_int, pro_fp], extensions = [int16, int4, bf16, fp8e4m3, fp8e5m2, fft, variable, controlflow, doubleround, inexactround, dynamic]>}
-// CHECK-LVL-8K: module attributes {tosa.target_env = #tosa.target_env<level = "8k", profiles = [], extensions = []>}
-// CHECK-DEFAULT: module attributes {tosa.target_env = #tosa.target_env<level = "8k", profiles = [], extensions = []>}
+// CHECK-ALL: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.0", level = none, profiles = [pro_int, pro_fp], extensions = [int16, int4, bf16, fp8e4m3, fp8e5m2, fft, variable, controlflow, doubleround, inexactround, dynamic]>}
+// CHECK-LVL-8K: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.0", level = "8k", profiles = [], extensions = []>}
+// CHECK-DEFAULT: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.0", level = "8k", profiles = [], extensions = []>}
+// CHECK-VERSION-1P1: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.1.draft", level = "8k", profiles = [], extensions = []>}
 // CHECK-LABEL: test_simple
 func.func @test_simple(%arg0 : tensor<1x1x1x1xf32>, %arg1 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> {
   %1 = tosa.add %arg0, %arg1 : (tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p0-invalid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p0-invalid.mlir
new file mode 100644
index 0000000..51089df
--- /dev/null
+++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p0-invalid.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="specification_version=1.0 profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,dynamic,doubleround,inexactround" -tosa-validate="strict-op-spec-alignment"
+
+// -----
+
+func.func @test_matmul_fp8_mixed_precision_operands(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E5M2>) -> tensor<1x14x28xf16> {
+  %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E5M2>}> : () -> tensor<1xf8E5M2>
+  // expected-error@+1 {{'tosa.matmul' op illegal: the target specification version (1.0) is not backwards compatible with the op compliance specification version (1.1)}}
+  %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E5M2>, tensor<1xf8E4M3FN>, tensor<1xf8E5M2>)  -> tensor<1x14x28xf16>
+  return %0 : tensor<1x14x28xf16>
+}
+
+// -----
+
+func.func @test_matmul_fp8_input_fp32_acc_type(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E4M3FN>) -> tensor<1x14x28xf32> {
+  %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  // expected-error@+1 {{'tosa.matmul' op illegal: the target specification version (1.0) is not backwards compatible with the op compliance specification version (1.1)}}
+  %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E4M3FN>, tensor<1xf8E4M3FN>, tensor<1xf8E4M3FN>)  -> tensor<1x14x28xf32>
+  return %0 : tensor<1x14x28xf32>
+}
diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
new file mode 100644
index 0000000..8164509
--- /dev/null
+++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
@@ -0,0 +1,20 @@
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="specification_version=1.1.draft profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround" -tosa-validate="strict-op-spec-alignment" | FileCheck %s
+
+// -----
+
+func.func @test_matmul_fp8_mixed_precision_operands(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E5M2>) -> tensor<1x14x28xf16> {
+  %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E5M2>}> : () -> tensor<1xf8E5M2>
+  %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E5M2>, tensor<1xf8E4M3FN>, tensor<1xf8E5M2>)  -> tensor<1x14x28xf16>
+  return %0 : tensor<1x14x28xf16>
+}
+
+// -----
+
+// CHECK-LABEL: test_matmul_fp8_input_fp32_acc_type
+func.func @test_matmul_fp8_input_fp32_acc_type(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E4M3FN>) -> tensor<1x14x28xf32> {
+  %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E4M3FN>, tensor<1xf8E4M3FN>, tensor<1xf8E4M3FN>)  -> tensor<1x14x28xf32>
+  return %0 : tensor<1x14x28xf32>
+}
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index daae3c7..3718648 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -4896,7 +4896,7 @@ static void emitOpClassDefs(const RecordKeeper &records,
                                                       constraintPrefix);
   os << formatv(opCommentHeader, "Local Utility Method", "Definitions");
   staticVerifierEmitter.collectOpConstraints(defs);
-  staticVerifierEmitter.emitOpConstraints(defs);
+  staticVerifierEmitter.emitOpConstraints();
 
   // Emit the classes.
   emitOpClasses(records, defs, os, staticVerifierEmitter,
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
new file mode 100644
index 0000000..05fcbf7
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
@@ -0,0 +1,28 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+)
+
+licenses(["notice"])
+
+cc_library(
+    name = "lib",
+    srcs = glob(["*.cpp"]),
+    hdrs = glob(["*.h"]),
+    deps = [
+        "//clang:ast",
+        "//clang:ast_matchers",
+        "//clang:ast_matchers_dynamic",
+        "//clang:basic",
+        "//clang:frontend",
+        "//clang:serialization",
+        "//llvm:LineEditor",
+        "//llvm:Support",
+    ],
+)
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
index 2808288..baad2cf 100644
--- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
@@ -23,6 +23,11 @@ bool_flag(
     build_setting_default = True,
 )
 
+bool_flag(
+    name = "enable_custom_checks",
+    build_setting_default = True,
+)
+
 config_setting(
     name = "static_analyzer_enabled",
     flag_values = {
@@ -30,13 +35,25 @@ config_setting(
     },
 )
 
+config_setting(
+    name = "custom_checks_enabled",
+    flag_values = {
+        ":enable_custom_checks": "true",
+    },
+)
+
 expand_template(
     name = "config",
     out = "clang-tidy-config.h",
     substitutions =
-        {
-            "#cmakedefine01 CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS": "#define CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS 0",
-        } | select({
+        select({
+            ":custom_checks_enabled": {
+                "#cmakedefine01 CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS": "#define CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS 1",
+            },
+            "//conditions:default": {
+                "#cmakedefine01 CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS": "#define CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS 0",
+            },
+        }) | select({
             ":static_analyzer_enabled": {
                 "#cmakedefine01 CLANG_TIDY_ENABLE_STATIC_ANALYZER": "#define CLANG_TIDY_ENABLE_STATIC_ANALYZER 1",
             },
@@ -209,6 +226,15 @@ clang_tidy_library(
 )
 
 clang_tidy_library(
+    name = "custom",
+    deps = [
+        ":lib",
+        "//clang:ast_matchers_dynamic",
+        "//clang-tools-extra/clang-query:lib",
+    ],
+)
+
+clang_tidy_library(
     name = "concurrency",
     deps = [":lib"],
 )
@@ -365,6 +391,9 @@ CHECKS = [
 ] + select({
     ":static_analyzer_enabled": [":mpi"],
     "//conditions:default": [],
+}) + select({
+    ":custom_checks_enabled": [":custom"],
+    "//conditions:default": [],
 })
 
 cc_library(
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 98154e1..ac58e39 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -14225,6 +14225,7 @@ cc_library(
         ":TransformUtils",
         ":VectorDialect",
         ":XeGPUDialect",
+        ":XeGPUUtils",
         ":XeVMDialect",
         "//llvm:Support",
     ],