-rw-r--r--  clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp | 7
-rw-r--r--  clang/docs/ReleaseNotes.rst | 5
-rw-r--r--  clang/include/clang/AST/TypeBase.h | 4
-rw-r--r--  clang/include/clang/Sema/Sema.h | 15
-rw-r--r--  clang/lib/CodeGen/CGDebugInfo.cpp | 11
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 7
-rw-r--r--  clang/lib/Driver/ToolChains/Darwin.cpp | 7
-rw-r--r--  clang/lib/Driver/ToolChains/HIPAMD.cpp | 9
-rw-r--r--  clang/lib/Driver/ToolChains/HIPSPV.cpp | 15
-rw-r--r--  clang/lib/Driver/ToolChains/HIPUtility.cpp | 11
-rw-r--r--  clang/lib/Driver/ToolChains/HIPUtility.h | 2
-rw-r--r--  clang/lib/Sema/SemaExpr.cpp | 63
-rw-r--r--  clang/lib/Sema/SemaOverload.cpp | 32
-rw-r--r--  clang/lib/StaticAnalyzer/Checkers/VAListChecker.cpp | 2
-rw-r--r--  clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c | 344
-rw-r--r--  clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c | 344
-rw-r--r--  clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ld1.c | 192
-rw-r--r--  clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ldnt1.c | 192
-rw-r--r--  clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_st1.c | 192
-rw-r--r--  clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_stnt1.c | 192
-rw-r--r--  clang/test/DebugInfo/AArch64/sve-vector-types.c | 16
-rw-r--r--  clang/test/Driver/hip-toolchain-no-rdc.hip | 2
-rw-r--r--  clang/test/Driver/spirv-amd-toolchain.c | 4
-rw-r--r--  clang/test/OpenMP/reduction_complex.c | 17
-rw-r--r--  clang/test/Sema/incompatible-function-pointer-types-extinfo.c | 56
-rw-r--r--  compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp | 10
-rw-r--r--  compiler-rt/lib/tsan/rtl/tsan_flags.cpp | 37
-rw-r--r--  compiler-rt/lib/tsan/rtl/tsan_flags.h | 8
-rw-r--r--  compiler-rt/lib/tsan/rtl/tsan_flags.inc | 12
-rw-r--r--  compiler-rt/lib/tsan/rtl/tsan_interceptors.h | 10
-rw-r--r--  compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp | 11
-rw-r--r--  compiler-rt/lib/tsan/rtl/tsan_rtl.cpp | 14
-rw-r--r--  compiler-rt/lib/tsan/rtl/tsan_rtl.h | 4
-rw-r--r--  compiler-rt/test/tsan/Darwin/write-interpose.c | 50
-rw-r--r--  flang/include/flang/Lower/AbstractConverter.h | 4
-rw-r--r--  flang/lib/Lower/Bridge.cpp | 2
-rw-r--r--  flang/lib/Lower/OpenACC.cpp | 318
-rw-r--r--  flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 | 601
-rw-r--r--  flang/test/Lower/OpenACC/acc-firstprivate-derived-allocatable-component.f90 | 26
-rw-r--r--  flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 | 26
-rw-r--r--  flang/test/Lower/OpenACC/acc-firstprivate-derived-user-assign.f90 | 21
-rw-r--r--  flang/test/Lower/OpenACC/acc-firstprivate-derived.f90 | 21
-rw-r--r--  flang/test/Lower/OpenACC/acc-loop-exit.f90 | 2
-rw-r--r--  flang/test/Lower/OpenACC/acc-private.f90 | 2
-rw-r--r--  flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 | 30
-rw-r--r--  libcxx/include/__algorithm/find.h | 2
-rw-r--r--  libcxx/include/__hash_table | 66
-rw-r--r--  libcxx/include/__memory/shared_ptr.h | 93
-rw-r--r--  libcxx/include/__memory/uninitialized_algorithms.h | 158
-rw-r--r--  libcxx/include/__utility/scope_guard.h | 2
-rw-r--r--  libcxx/include/__vector/vector_bool.h | 23
-rw-r--r--  libcxx/include/deque | 39
-rw-r--r--  libcxx/include/forward_list | 31
-rw-r--r--  libcxx/include/future | 14
-rw-r--r--  libcxx/include/list | 61
-rw-r--r--  libcxx/include/string | 31
-rw-r--r--  libcxx/include/valarray | 152
-rw-r--r--  libcxx/src/filesystem/error.h | 26
-rw-r--r--  libcxx/src/filesystem/format_string.h | 14
-rw-r--r--  libcxx/src/locale.cpp | 260
-rw-r--r--  libcxx/test/std/localization/codecvt_unicode.pass.cpp | 3
-rw-r--r--  libcxx/test/std/ranges/range.access/include.iterator.pass.cpp | 3
-rw-r--r--  libcxx/utils/ci/BOT_OWNERS.txt | 5
-rw-r--r--  libcxx/utils/ci/buildkite-pipeline.yml | 2
-rw-r--r--  libcxx/utils/libcxx/test/dsl.py | 2
-rw-r--r--  libcxx/utils/libcxx/test/features.py | 8
-rw-r--r--  libcxxabi/test/forced_unwind4.pass.cpp | 6
-rw-r--r--  libcxxabi/test/native/AArch64/ra_sign_state.pass.cpp | 2
-rw-r--r--  libunwind/test/aarch64_vg_unwind.pass.cpp | 5
-rw-r--r--  libunwind/test/aix_signal_unwind.pass.sh.S | 3
-rw-r--r--  libunwind/test/bad_unwind_info.pass.cpp | 5
-rw-r--r--  libunwind/test/eh_frame_fde_pc_range.pass.cpp | 4
-rw-r--r--  libunwind/test/floatregister.pass.cpp | 2
-rw-r--r--  libunwind/test/forceunwind.pass.cpp | 2
-rw-r--r--  libunwind/test/unw_resume.pass.cpp | 2
-rw-r--r--  libunwind/test/unwind_scalable_vectors.pass.cpp | 7
-rw-r--r--  lldb/source/Target/StructuredDataPlugin.cpp | 2
-rw-r--r--  lldb/test/API/commands/expression/import-std-module/deque-basic/TestDequeFromStdModule.py | 3
-rw-r--r--  lldb/test/API/commands/expression/import-std-module/deque-dbg-info-content/TestDbgInfoContentDequeFromStdModule.py | 3
-rw-r--r--  lldb/test/API/commands/expression/import-std-module/forward_list/TestForwardListFromStdModule.py | 3
-rw-r--r--  lldb/test/API/commands/expression/import-std-module/list-dbg-info-content/TestDbgInfoContentListFromStdModule.py | 3
-rw-r--r--  lldb/test/API/commands/expression/import-std-module/list/TestListFromStdModule.py | 3
-rw-r--r--  llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 3
-rw-r--r--  llvm/include/llvm/IR/DIBuilder.h | 3
-rw-r--r--  llvm/include/llvm/IR/DiagnosticInfo.h | 2
-rw-r--r--  llvm/include/llvm/IR/Intrinsics.td | 86
-rwxr-xr-x  llvm/lib/Analysis/ConstantFolding.cpp | 25
-rw-r--r--  llvm/lib/Analysis/Loads.cpp | 4
-rw-r--r--  llvm/lib/Analysis/ScalarEvolution.cpp | 46
-rw-r--r--  llvm/lib/CodeGen/MachinePipeliner.cpp | 25
-rw-r--r--  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 11
-rw-r--r--  llvm/lib/IR/DIBuilder.cpp | 19
-rw-r--r--  llvm/lib/IR/DiagnosticInfo.cpp | 7
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp | 526
-rw-r--r--  llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h | 15
-rw-r--r--  llvm/lib/Target/Hexagon/Hexagon.td | 3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonPatterns.td | 7
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonSubtarget.h | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 7
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 8
-rw-r--r--  llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp | 6
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp | 7
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp | 34
-rw-r--r--  llvm/lib/Transforms/IPO/PartialInlining.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 13
-rw-r--r--  llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 16
-rw-r--r--  llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll | 4
-rw-r--r--  llvm/test/CodeGen/Hexagon/fmul-v67.ll | 8
-rw-r--r--  llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll | 149
-rw-r--r--  llvm/test/CodeGen/NVPTX/tcgen05-commit.ll | 152
-rw-r--r--  llvm/test/CodeGen/NVPTX/tcgen05-cp.ll | 450
-rw-r--r--  llvm/test/CodeGen/NVPTX/tcgen05-shift.ll | 21
-rw-r--r--  llvm/test/CodeGen/RISCV/rv64zbkb.ll | 6
-rw-r--r--  llvm/test/Instrumentation/AddressSanitizer/alloca-offset-lifetime.ll | 27
-rw-r--r--  llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll | 18
-rw-r--r--  llvm/test/Instrumentation/AddressSanitizer/calls-only.ll | 42
-rw-r--r--  llvm/test/Instrumentation/SanitizerCoverage/missing_dbg.ll | 92
-rw-r--r--  llvm/test/Transforms/Coroutines/coro-elide-safe.ll (renamed from llvm/test/Transforms/Coroutines/coro-transform-must-elide.ll) | 28
-rw-r--r--  llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll | 32
-rw-r--r--  llvm/test/Transforms/DFAJumpThreading/max-path-length.ll | 6
-rw-r--r--  llvm/test/Transforms/GVN/assume-equal.ll | 49
-rw-r--r--  llvm/test/Transforms/InstCombine/ptrtoaddr.ll | 77
-rw-r--r--  llvm/test/Transforms/InstSimplify/ptr_diff.ll | 59
-rw-r--r--  llvm/test/Transforms/LICM/vector-intrinsics.ll | 176
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/veclib-function-calls.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/single_early_exit.ll | 47
-rw-r--r--  llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll | 1
-rw-r--r--  llvm/test/tools/llvm-exegesis/AArch64/no-aliasing-ld-str.s | 4
-rw-r--r--  llvm/utils/lit/tests/xunit-output-report-failures-only.py | 2
-rw-r--r--  mlir/include/mlir-c/Pass.h | 12
-rw-r--r--  mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 28
-rw-r--r--  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 63
-rw-r--r--  mlir/lib/Bindings/Python/Globals.h | 25
-rw-r--r--  mlir/lib/Bindings/Python/Pass.cpp | 20
-rw-r--r--  mlir/lib/CAPI/IR/Pass.cpp | 15
-rw-r--r--  mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 167
-rw-r--r--  mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir | 8
-rw-r--r--  mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir | 44
-rw-r--r--  mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 42
-rw-r--r--  mlir/test/python/pass_manager.py | 20
-rw-r--r--  offload/plugins-nextgen/amdgpu/src/rtl.cpp | 12
-rw-r--r--  offload/plugins-nextgen/common/include/PluginInterface.h | 10
-rw-r--r--  offload/plugins-nextgen/common/src/PluginInterface.cpp | 21
-rw-r--r--  offload/plugins-nextgen/cuda/src/rtl.cpp | 17
-rw-r--r--  offload/plugins-nextgen/host/src/rtl.cpp | 6
-rw-r--r--  utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 1
148 files changed, 4228 insertions, 2623 deletions
diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp
index f2bc6f1..4a586c8 100644
--- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp
+++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp
@@ -39,8 +39,11 @@ static FixItHint generateFixItHint(const ObjCPropertyDecl *Decl,
auto NewName = Decl->getName().str();
size_t Index = 0;
if (Style == CategoryProperty) {
- Index = Name.find_first_of('_') + 1;
- NewName.replace(0, Index - 1, Name.substr(0, Index - 1).lower());
+ size_t UnderScorePos = Name.find_first_of('_');
+ if (UnderScorePos != llvm::StringRef::npos) {
+ Index = UnderScorePos + 1;
+ NewName.replace(0, Index - 1, Name.substr(0, Index - 1).lower());
+ }
}
if (Index < Name.size()) {
NewName[Index] = tolower(NewName[Index]);
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9a0d69c..ca7e133 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -357,6 +357,11 @@ Improvements to Clang's diagnostics
properly being rejected when used at compile-time. It was not implemented
and caused assertion failures before (#GH158471).
+- Closed a loophole in the diagnosis of function pointer conversions changing
+  extended function type information in C mode (#GH41465). Function conversions
+  that were previously accepted when other, irrelevant conditions happened to
+  hold are now consistently diagnosed, matching C++ mode.
+
Improvements to Clang's time-trace
----------------------------------
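A minimal C sketch of the kind of conversion this note covers. The declarations below are illustrative, not taken from the patch, and the diagnostic group is assumed to be Clang's usual -Wincompatible-function-pointer-types:

/* Sketch only: stripping "noreturn" from the pointee is a legal function
   conversion; adding it changes extended function type information and is
   now diagnosed in C mode just as it is in C++ mode. */
__attribute__((noreturn)) void fatal(int code);
void ordinary(int code);

void (*drops_attr)(int) = fatal;                              /* accepted  */
void (*adds_attr)(int) __attribute__((noreturn)) = ordinary;  /* diagnosed */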
diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h
index 6786b2f..625cc77 100644
--- a/clang/include/clang/AST/TypeBase.h
+++ b/clang/include/clang/AST/TypeBase.h
@@ -7082,10 +7082,6 @@ public:
class SubstTemplateTypeParmPackType : public SubstPackType {
friend class ASTContext;
- /// A pointer to the set of template arguments that this
- /// parameter pack is instantiated with.
- const TemplateArgument *Arguments;
-
llvm::PointerIntPair<Decl *, 1, bool> AssociatedDeclAndFinal;
SubstTemplateTypeParmPackType(QualType Canon, Decl *AssociatedDecl,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 265462a..37598f8 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -10208,15 +10208,12 @@ public:
bool CStyle, bool &ObjCLifetimeConversion);
/// Determine whether the conversion from FromType to ToType is a valid
- /// conversion that strips "noexcept" or "noreturn" or "cfi_unchecked_callee"
- /// off the nested function type. This also checks if "cfi_unchecked_callee"
- /// was added to the function type. If "cfi_unchecked_callee" is added and
- /// `AddingCFIUncheckedCallee` is provided, it will be set to true. The same
- /// thing applies for `DiscardingCFIUncheckedCallee` if the attribute is
- /// discarded.
- bool IsFunctionConversion(QualType FromType, QualType ToType,
- bool *DiscardingCFIUncheckedCallee = nullptr,
- bool *AddingCFIUncheckedCallee = nullptr) const;
+ /// conversion of ExtInfo/ExtProtoInfo on the nested function type.
+ /// More precisely, this method checks whether FromType can be transformed
+ /// into an exact match for ToType, by transforming its extended function
+ /// type information in a legal manner (e.g. by strictly stripping "noreturn"
+ /// or "noexcept", or by stripping "noescape" for arguments).
+ bool IsFunctionConversion(QualType FromType, QualType ToType) const;
/// Same as `IsFunctionConversion`, but if this would return true, it sets
/// `ResultTy` to `ToType`.
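A purely illustrative C sketch of the "stripping noescape for arguments" case the comment names, assuming Clang's __attribute__((noescape)) extension (none of these declarations come from the patch):

/* Sketch only: the callee's noescape is a promise to its caller, so a
   pointer type without the promise may receive it; the reverse direction
   would add a guarantee the function never made. */
void keeps(__attribute__((noescape)) int *p);
void leaks(int *p);

void (*drops_noescape)(int *) = keeps;                           /* accepted */
void (*adds_noescape)(__attribute__((noescape)) int *) = leaks;  /* invalid  */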
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index b91cb36..9fe9a13 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -900,10 +900,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
assert((BT->getKind() != BuiltinType::SveCount || Info.NumVectors == 1) &&
"Unsupported number of vectors for svcount_t");
- // Debuggers can't extract 1bit from a vector, so will display a
- // bitpattern for predicates instead.
unsigned NumElems = Info.EC.getKnownMinValue() * Info.NumVectors;
- if (Info.ElementType == CGM.getContext().BoolTy) {
+ llvm::Metadata *BitStride = nullptr;
+ if (BT->getKind() == BuiltinType::SveBool) {
+ Info.ElementType = CGM.getContext().UnsignedCharTy;
+ BitStride = llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned(
+ llvm::Type::getInt64Ty(CGM.getLLVMContext()), 1));
+ } else if (BT->getKind() == BuiltinType::SveCount) {
NumElems /= 8;
Info.ElementType = CGM.getContext().UnsignedCharTy;
}
@@ -929,7 +932,7 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
getOrCreateType(Info.ElementType, TheCU->getFile());
auto Align = getTypeAlignIfRequired(BT, CGM.getContext());
return DBuilder.createVectorType(/*Size*/ 0, Align, ElemTy,
- SubscriptArray);
+ SubscriptArray, BitStride);
}
// It doesn't make sense to generate debug info for PowerPC MMA vector types.
// So we return a safe type here to avoid generating an error.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 4272d8b..3613b6a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -869,6 +869,8 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
CGM.getLangOpts().OpenMPOffloadMandatory,
/*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
+ Config.setDefaultTargetAS(
+ CGM.getContext().getTargetInfo().getTargetAddressSpace(LangAS::Default));
OMPBuilder.setConfig(Config);
if (!CGM.getLangOpts().OpenMPIsTargetDevice)
@@ -1243,7 +1245,10 @@ void CGOpenMPRuntimeGPU::emitParallelCall(
llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
if (WFn)
ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
- llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
+ llvm::Type *FnPtrTy = llvm::PointerType::get(
+ CGF.getLLVMContext(), CGM.getDataLayout().getProgramAddressSpace());
+
+ llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, FnPtrTy);
// Create a private scope that will globalize the arguments
// passed from the outside of the target region.
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 234683f..d2356eb 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1609,7 +1609,12 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args,
if (Sanitize.needsFuzzer() && !Args.hasArg(options::OPT_dynamiclib)) {
AddLinkSanitizerLibArgs(Args, CmdArgs, "fuzzer", /*shared=*/false);
- // Libfuzzer is written in C++ and requires libcxx.
+ // Libfuzzer is written in C++ and requires libcxx.
+ // Since darwin::Linker::ConstructJob already adds -lc++ for clang++
+ // by default if ShouldLinkCXXStdlib(Args), we only add the option if
+ // !ShouldLinkCXXStdlib(Args). This avoids duplicate library errors
+ // on Darwin.
+ if (!ShouldLinkCXXStdlib(Args))
AddCXXStdlibLibArgs(Args, CmdArgs);
}
if (Sanitize.needsStatsRt()) {
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 5f3fbea..c0c8afe 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -168,9 +168,12 @@ void AMDGCN::Linker::constructLinkAndEmitSpirvCommand(
const InputInfo &Output, const llvm::opt::ArgList &Args) const {
assert(!Inputs.empty() && "Must have at least one input.");
- constructLlvmLinkCommand(C, JA, Inputs, Output, Args);
+ std::string LinkedBCFilePrefix(
+ Twine(llvm::sys::path::stem(Output.getFilename()), "-linked").str());
+ const char *LinkedBCFilePath = HIP::getTempFile(C, LinkedBCFilePrefix, "bc");
+ InputInfo LinkedBCFile(&JA, LinkedBCFilePath, Output.getBaseInput());
- // Linked BC is now in Output
+ constructLlvmLinkCommand(C, JA, Inputs, LinkedBCFile, Args);
// Emit SPIR-V binary.
llvm::opt::ArgStringList TrArgs{
@@ -180,7 +183,7 @@ void AMDGCN::Linker::constructLinkAndEmitSpirvCommand(
"--spirv-lower-const-expr",
"--spirv-preserve-auxdata",
"--spirv-debug-info-version=nonsemantic-shader-200"};
- SPIRV::constructTranslateCommand(C, *this, JA, Output, Output, TrArgs);
+ SPIRV::constructTranslateCommand(C, *this, JA, Output, LinkedBCFile, TrArgs);
}
// For amdgcn the inputs of the linker job are device bitcode and output is
diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
index 62bca04..bce7f46 100644
--- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
+++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
@@ -22,17 +22,6 @@ using namespace clang::driver::tools;
using namespace clang;
using namespace llvm::opt;
-// Convenience function for creating temporary file for both modes of
-// isSaveTempsEnabled().
-static const char *getTempFile(Compilation &C, StringRef Prefix,
- StringRef Extension) {
- if (C.getDriver().isSaveTempsEnabled()) {
- return C.getArgs().MakeArgString(Prefix + "." + Extension);
- }
- auto TmpFile = C.getDriver().GetTemporaryPath(Prefix, Extension);
- return C.addTempFile(C.getArgs().MakeArgString(TmpFile));
-}
-
// Locates HIP pass plugin.
static std::string findPassPlugin(const Driver &D,
const llvm::opt::ArgList &Args) {
@@ -65,7 +54,7 @@ void HIPSPV::Linker::constructLinkAndEmitSpirvCommand(
assert(!Inputs.empty() && "Must have at least one input.");
std::string Name = std::string(llvm::sys::path::stem(Output.getFilename()));
- const char *TempFile = getTempFile(C, Name + "-link", "bc");
+ const char *TempFile = HIP::getTempFile(C, Name + "-link", "bc");
// Link LLVM bitcode.
ArgStringList LinkArgs{};
@@ -93,7 +82,7 @@ void HIPSPV::Linker::constructLinkAndEmitSpirvCommand(
auto PassPluginPath = findPassPlugin(C.getDriver(), Args);
if (!PassPluginPath.empty()) {
const char *PassPathCStr = C.getArgs().MakeArgString(PassPluginPath);
- const char *OptOutput = getTempFile(C, Name + "-lower", "bc");
+ const char *OptOutput = HIP::getTempFile(C, Name + "-lower", "bc");
ArgStringList OptArgs{TempFile, "-load-pass-plugin",
PassPathCStr, "-passes=hip-post-link-passes",
"-o", OptOutput};
diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index cb061ff..732403e 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -472,3 +472,14 @@ void HIP::constructGenerateObjFileFromHIPFatBinary(
D.getClangProgramPath(), ClangArgs,
Inputs, Output, D.getPrependArg()));
}
+
+// Convenience function for creating temporary file for both modes of
+// isSaveTempsEnabled().
+const char *HIP::getTempFile(Compilation &C, StringRef Prefix,
+ StringRef Extension) {
+ if (C.getDriver().isSaveTempsEnabled()) {
+ return C.getArgs().MakeArgString(Prefix + "." + Extension);
+ }
+ auto TmpFile = C.getDriver().GetTemporaryPath(Prefix, Extension);
+ return C.addTempFile(C.getArgs().MakeArgString(TmpFile));
+}
diff --git a/clang/lib/Driver/ToolChains/HIPUtility.h b/clang/lib/Driver/ToolChains/HIPUtility.h
index 29e5a92..55c155e 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.h
+++ b/clang/lib/Driver/ToolChains/HIPUtility.h
@@ -16,6 +16,8 @@ namespace driver {
namespace tools {
namespace HIP {
+const char *getTempFile(Compilation &C, StringRef Prefix, StringRef Extension);
+
// Construct command for creating HIP fatbin.
void constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
StringRef OutputFileName,
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 4d3c7d6..4230ea7 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -9014,24 +9014,6 @@ bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType) {
return FromAttributes != ToAttributes;
}
-// Check if we have a conversion between incompatible cmse function pointer
-// types, that is, a conversion between a function pointer with the
-// cmse_nonsecure_call attribute and one without.
-static bool IsInvalidCmseNSCallConversion(Sema &S, QualType FromType,
- QualType ToType) {
- if (const auto *ToFn =
- dyn_cast<FunctionType>(S.Context.getCanonicalType(ToType))) {
- if (const auto *FromFn =
- dyn_cast<FunctionType>(S.Context.getCanonicalType(FromType))) {
- FunctionType::ExtInfo ToEInfo = ToFn->getExtInfo();
- FunctionType::ExtInfo FromEInfo = FromFn->getExtInfo();
-
- return ToEInfo.getCmseNSCall() != FromEInfo.getCmseNSCall();
- }
- }
- return false;
-}
-
// checkPointerTypesForAssignment - This is a very tricky routine (despite
// being closely modeled after the C99 spec:-). The odd characteristic of this
// routine is it effectively ignores the qualifiers on the top level pointee.
@@ -9187,18 +9169,43 @@ static AssignConvertType checkPointerTypesForAssignment(Sema &S,
return AssignConvertType::IncompatibleFunctionPointer;
return AssignConvertType::IncompatiblePointer;
}
- bool DiscardingCFIUncheckedCallee, AddingCFIUncheckedCallee;
- if (!S.getLangOpts().CPlusPlus &&
- S.IsFunctionConversion(ltrans, rtrans, &DiscardingCFIUncheckedCallee,
- &AddingCFIUncheckedCallee)) {
- // Allow conversions between CFIUncheckedCallee-ness.
- if (!DiscardingCFIUncheckedCallee && !AddingCFIUncheckedCallee)
+ // Note: in C++, typesAreCompatible(ltrans, rtrans) will have guaranteed
+ // hasSameType, so we can skip further checks.
+ const auto *LFT = ltrans->getAs<FunctionType>();
+ const auto *RFT = rtrans->getAs<FunctionType>();
+ if (!S.getLangOpts().CPlusPlus && LFT && RFT) {
+ // The invocation of IsFunctionConversion below will try to transform rtrans
+ // to obtain an exact match for ltrans. This should not fail because of
+ // mismatches in result type and parameter types, they were already checked
+ // by typesAreCompatible above. So we will recreate rtrans (or where
+ // appropriate ltrans) using the result type and parameter types from ltrans
+ // (respectively rtrans), but keeping its ExtInfo/ExtProtoInfo.
+ const auto *LFPT = dyn_cast<FunctionProtoType>(LFT);
+ const auto *RFPT = dyn_cast<FunctionProtoType>(RFT);
+ if (LFPT && RFPT) {
+ rtrans = S.Context.getFunctionType(LFPT->getReturnType(),
+ LFPT->getParamTypes(),
+ RFPT->getExtProtoInfo());
+ } else if (LFPT) {
+ FunctionProtoType::ExtProtoInfo EPI;
+ EPI.ExtInfo = RFT->getExtInfo();
+ rtrans = S.Context.getFunctionType(LFPT->getReturnType(),
+ LFPT->getParamTypes(), EPI);
+ } else if (RFPT) {
+ // In this case, we want to retain rtrans as a FunctionProtoType, to keep
+ // all of its ExtProtoInfo. Transform ltrans instead.
+ FunctionProtoType::ExtProtoInfo EPI;
+ EPI.ExtInfo = LFT->getExtInfo();
+ ltrans = S.Context.getFunctionType(RFPT->getReturnType(),
+ RFPT->getParamTypes(), EPI);
+ } else {
+ rtrans = S.Context.getFunctionNoProtoType(LFT->getReturnType(),
+ RFT->getExtInfo());
+ }
+ if (!S.Context.hasSameUnqualifiedType(rtrans, ltrans) &&
+ !S.IsFunctionConversion(rtrans, ltrans))
return AssignConvertType::IncompatibleFunctionPointer;
}
- if (IsInvalidCmseNSCallConversion(S, ltrans, rtrans))
- return AssignConvertType::IncompatibleFunctionPointer;
- if (S.IsInvalidSMECallConversion(rtrans, ltrans))
- return AssignConvertType::IncompatibleFunctionPointer;
return ConvTy;
}
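A short C sketch of the prototyped/unprototyped mixing the rebuilt code above has to handle (pre-C23 semantics assumed; the declarations are illustrative, not taken from the test suite):

/* Sketch only: rtrans (or ltrans) is recreated with the other side's return
   and parameter types but its own ExtInfo/ExtProtoInfo, so only genuine
   ExtInfo mismatches reach the IncompatibleFunctionPointer diagnostic. */
__attribute__((noreturn)) void fatal(int code);
void (*unprototyped)();   /* no prototype */
void (*prototyped)(int);

void demo(void) {
  unprototyped = prototyped; /* compatible: ExtInfo matches  */
  prototyped = fatal;        /* accepted: strips "noreturn"  */
}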
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 8d32ef6..8339bb1 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -1892,14 +1892,7 @@ bool Sema::TryFunctionConversion(QualType FromType, QualType ToType,
return Changed;
}
-bool Sema::IsFunctionConversion(QualType FromType, QualType ToType,
- bool *DiscardingCFIUncheckedCallee,
- bool *AddingCFIUncheckedCallee) const {
- if (DiscardingCFIUncheckedCallee)
- *DiscardingCFIUncheckedCallee = false;
- if (AddingCFIUncheckedCallee)
- *AddingCFIUncheckedCallee = false;
-
+bool Sema::IsFunctionConversion(QualType FromType, QualType ToType) const {
if (Context.hasSameUnqualifiedType(FromType, ToType))
return false;
@@ -1958,25 +1951,14 @@ bool Sema::IsFunctionConversion(QualType FromType, QualType ToType,
const auto *ToFPT = dyn_cast<FunctionProtoType>(ToFn);
if (FromFPT && ToFPT) {
- if (FromFPT->hasCFIUncheckedCallee() && !ToFPT->hasCFIUncheckedCallee()) {
- QualType NewTy = Context.getFunctionType(
- FromFPT->getReturnType(), FromFPT->getParamTypes(),
- FromFPT->getExtProtoInfo().withCFIUncheckedCallee(false));
- FromFPT = cast<FunctionProtoType>(NewTy.getTypePtr());
- FromFn = FromFPT;
- Changed = true;
- if (DiscardingCFIUncheckedCallee)
- *DiscardingCFIUncheckedCallee = true;
- } else if (!FromFPT->hasCFIUncheckedCallee() &&
- ToFPT->hasCFIUncheckedCallee()) {
+ if (FromFPT->hasCFIUncheckedCallee() != ToFPT->hasCFIUncheckedCallee()) {
QualType NewTy = Context.getFunctionType(
FromFPT->getReturnType(), FromFPT->getParamTypes(),
- FromFPT->getExtProtoInfo().withCFIUncheckedCallee(true));
+ FromFPT->getExtProtoInfo().withCFIUncheckedCallee(
+ ToFPT->hasCFIUncheckedCallee()));
FromFPT = cast<FunctionProtoType>(NewTy.getTypePtr());
FromFn = FromFPT;
Changed = true;
- if (AddingCFIUncheckedCallee)
- *AddingCFIUncheckedCallee = true;
}
}
@@ -2007,11 +1989,7 @@ bool Sema::IsFunctionConversion(QualType FromType, QualType ToType,
Changed = true;
}
- // For C, when called from checkPointerTypesForAssignment,
- // we need to not alter FromFn, or else even an innocuous cast
- // like dropping effects will fail. In C++ however we do want to
- // alter FromFn (because of the way PerformImplicitConversion works).
- if (Context.hasAnyFunctionEffects() && getLangOpts().CPlusPlus) {
+ if (Context.hasAnyFunctionEffects()) {
FromFPT = cast<FunctionProtoType>(FromFn); // in case FromFn changed above
// Transparently add/drop effects; here we are concerned with
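The collapsed branch above handles both directions of the attribute with one rewrite. A hedged C sketch (cfi_unchecked_callee is a Clang extension; the declarations are illustrative assumptions, not from the patch):

/* Sketch only: FromType's prototype is rewritten to carry ToType's
   cfi_unchecked_callee bit, so adding and dropping the attribute are both
   treated as valid function conversions. */
void unchecked_fn(void) __attribute__((cfi_unchecked_callee));
void checked_fn(void);

void (*plain_ptr)(void) = unchecked_fn;   /* drops the attribute */
void (*unchecked_ptr)(void)
    __attribute__((cfi_unchecked_callee)) = checked_fn;  /* adds it */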
diff --git a/clang/lib/StaticAnalyzer/Checkers/VAListChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VAListChecker.cpp
index 79fd0bd..503fa5d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/VAListChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/VAListChecker.cpp
@@ -149,7 +149,7 @@ void VAListChecker::checkPreCall(const CallEvent &Call,
else if (VaEnd.matches(Call))
checkVAListEndCall(Call, C);
else {
- for (auto FuncInfo : VAListAccepters) {
+ for (const auto &FuncInfo : VAListAccepters) {
if (!FuncInfo.Func.matches(Call))
continue;
const MemRegion *VAList =
diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c
index e4c93ade..8206e4f 100644
--- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c
+++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c
@@ -10,30 +10,30 @@
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKvl(
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -46,15 +46,15 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_hor_vnum_za16ju10__SVBool_tPKvl(
@@ -62,15 +62,15 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -83,15 +83,15 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_hor_vnum_za32ju10__SVBool_tPKvl(
@@ -99,15 +99,15 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -120,15 +120,15 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_hor_vnum_za64ju10__SVBool_tPKvl(
@@ -136,15 +136,15 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -157,13 +157,13 @@ void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z25test_svld1_hor_vnum_za128ju10__SVBool_tPKvl(
@@ -171,13 +171,13 @@ void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -189,30 +189,30 @@ void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z22test_svld1_ver_hor_za8ju10__SVBool_tPKvl(
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -225,15 +225,15 @@ void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, i
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_ver_vnum_za16ju10__SVBool_tPKvl(
@@ -241,15 +241,15 @@ void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, i
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -262,15 +262,15 @@ void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_ver_vnum_za32ju10__SVBool_tPKvl(
@@ -278,15 +278,15 @@ void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -299,15 +299,15 @@ void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_ver_vnum_za64ju10__SVBool_tPKvl(
@@ -315,15 +315,15 @@ void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -336,13 +336,13 @@ void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z25test_svld1_ver_vnum_za128ju10__SVBool_tPKvl(
@@ -350,13 +350,13 @@ void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
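
The updated checks in this file all reflect the same reassociation of the vnum byte-offset computation: the IR now shifts the vscale value, which is a known-positive and small quantity, so the shift can carry the nuw/nsw flags, and multiplies the result by vnum, instead of shifting the arbitrary signed vnum and multiplying by vscale. A minimal C sketch of the equivalent computation, with illustrative names not taken from the test:

#include <stdint.h>

/* Sketch of the address arithmetic checked above (names are
 * illustrative). svl_b models the streaming vector length in bytes,
 * i.e. vscale * 16; vscale is always a small positive integer, so the
 * left shift cannot wrap -- which is what the new nuw/nsw flags on the
 * shl assert. vnum is an arbitrary signed offset, so the old form
 * (vnum << 4) could not carry those flags. */
static inline void *vnum_address(void *ptr, uint64_t vscale, int64_t vnum) {
  int64_t svl_b = (int64_t)(vscale << 4); /* shl nuw nsw i64 vscale, 4 */
  int64_t mulvl = svl_b * vnum;           /* mul i64 svl_b, vnum       */
  return (char *)ptr + mulvl;             /* getelementptr i8          */
}
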
diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c
index 22a0b9e..507a544 100644
--- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c
+++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c
@@ -10,30 +10,30 @@
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPvl(
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -46,15 +46,15 @@ void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_hor_vnum_za16ju10__SVBool_tPvl(
@@ -62,15 +62,15 @@ void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -83,15 +83,15 @@ void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_hor_vnum_za32ju10__SVBool_tPvl(
@@ -99,15 +99,15 @@ void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -120,15 +120,15 @@ void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_hor_vnum_za64ju10__SVBool_tPvl(
@@ -136,15 +136,15 @@ void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -157,13 +157,13 @@ void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z25test_svst1_hor_vnum_za128ju10__SVBool_tPvl(
@@ -171,13 +171,13 @@ void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -189,30 +189,30 @@ void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int6
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z23test_svst1_ver_vnum_za8ju10__SVBool_tPvl(
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -225,15 +225,15 @@ void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_ver_vnum_za16ju10__SVBool_tPvl(
@@ -241,15 +241,15 @@ void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -262,15 +262,15 @@ void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_ver_vnum_za32ju10__SVBool_tPvl(
@@ -278,15 +278,15 @@ void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -299,15 +299,15 @@ void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_ver_vnum_za64ju10__SVBool_tPvl(
@@ -315,15 +315,15 @@ void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -336,13 +336,13 @@ void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
+// CHECK-C-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z25test_svst1_ver_vnum_za128ju10__SVBool_tPvl(
@@ -350,13 +350,13 @@ void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[SVL:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVL]], [[VNUM]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
// CHECK-CXX-NEXT: ret void
//
void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
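
For context, the checks in this file are generated from ACLE SME store intrinsics of roughly the following shape; this is a hedged sketch modeled on test_svst1_hor_vnum_za8 rather than the literal test body, which lives in acle_sme_st1_vnum.c:

#include <arm_sme.h>

/* Two stores from the same ZA tile: the second uses a slice index
 * offset by 15, which is why the checks above contain
 * "add i32 [[ADD]], 15" before the second st1b call. */
void sketch_st1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr,
                             int64_t vnum) __arm_streaming __arm_in("za") {
  svst1_hor_vnum_za8(0, slice_base, pg, ptr, vnum);      /* slice = base + vnum */
  svst1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum); /* slice = base + vnum + 15 */
}
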
diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ld1.c
index af39be3..6471ab4 100644
--- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ld1.c
+++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ld1.c
@@ -390,8 +390,8 @@ svmfloat8x4_t test_svld1_mf8_x4(svcount_t pn, const mfloat8_t *base) ATTR
// CHECK-LABEL: @test_svld1_vnum_u8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -399,8 +399,8 @@ svmfloat8x4_t test_svld1_mf8_x4(svcount_t pn, const mfloat8_t *base) ATTR
// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_u8_x2u11__SVCount_tPKhl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -413,8 +413,8 @@ svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnu
// CHECK-LABEL: @test_svld1_vnum_u16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -422,8 +422,8 @@ svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnu
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u16_x2u11__SVCount_tPKtl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -436,8 +436,8 @@ svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t
// CHECK-LABEL: @test_svld1_vnum_u32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -445,8 +445,8 @@ svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u32_x2u11__SVCount_tPKjl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -459,8 +459,8 @@ svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t
// CHECK-LABEL: @test_svld1_vnum_u64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -468,8 +468,8 @@ svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u64_x2u11__SVCount_tPKml(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -482,8 +482,8 @@ svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t
// CHECK-LABEL: @test_svld1_vnum_u8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -491,8 +491,8 @@ svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t
// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_u8_x4u11__SVCount_tPKhl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -505,8 +505,8 @@ svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnu
// CHECK-LABEL: @test_svld1_vnum_u16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -514,8 +514,8 @@ svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnu
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u16_x4u11__SVCount_tPKtl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -528,8 +528,8 @@ svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t
// CHECK-LABEL: @test_svld1_vnum_u32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -537,8 +537,8 @@ svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u32_x4u11__SVCount_tPKjl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -551,8 +551,8 @@ svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t
// CHECK-LABEL: @test_svld1_vnum_u64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -560,8 +560,8 @@ svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u64_x4u11__SVCount_tPKml(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -574,8 +574,8 @@ svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t
// CHECK-LABEL: @test_svld1_vnum_s8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -583,8 +583,8 @@ svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t
// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_s8_x2u11__SVCount_tPKal(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -597,8 +597,8 @@ svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum)
// CHECK-LABEL: @test_svld1_vnum_s16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -606,8 +606,8 @@ svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum)
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s16_x2u11__SVCount_tPKsl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -620,8 +620,8 @@ svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vn
// CHECK-LABEL: @test_svld1_vnum_s32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -629,8 +629,8 @@ svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vn
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s32_x2u11__SVCount_tPKil(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -643,8 +643,8 @@ svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vn
// CHECK-LABEL: @test_svld1_vnum_s64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -652,8 +652,8 @@ svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vn
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s64_x2u11__SVCount_tPKll(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -666,8 +666,8 @@ svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vn
// CHECK-LABEL: @test_svld1_vnum_s8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -675,8 +675,8 @@ svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vn
// CPP-CHECK-LABEL: @_Z21test_svld1_vnum_s8_x4u11__SVCount_tPKal(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -689,8 +689,8 @@ svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum)
// CHECK-LABEL: @test_svld1_vnum_s16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -698,8 +698,8 @@ svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum)
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s16_x4u11__SVCount_tPKsl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -712,8 +712,8 @@ svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vn
// CHECK-LABEL: @test_svld1_vnum_s32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -721,8 +721,8 @@ svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vn
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s32_x4u11__SVCount_tPKil(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -735,8 +735,8 @@ svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vn
// CHECK-LABEL: @test_svld1_vnum_s64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -744,8 +744,8 @@ svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vn
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s64_x4u11__SVCount_tPKll(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -758,8 +758,8 @@ svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vn
// CHECK-LABEL: @test_svld1_vnum_f16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]]
@@ -767,8 +767,8 @@ svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vn
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f16_x2u11__SVCount_tPKDhl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]]
@@ -781,8 +781,8 @@ svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_
// CHECK-LABEL: @test_svld1_vnum_f32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]]
@@ -790,8 +790,8 @@ svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f32_x2u11__SVCount_tPKfl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]]
@@ -804,8 +804,8 @@ svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_
// CHECK-LABEL: @test_svld1_vnum_f64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]]
@@ -813,8 +813,8 @@ svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f64_x2u11__SVCount_tPKdl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]]
@@ -827,8 +827,8 @@ svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_
// CHECK-LABEL: @test_svld1_vnum_mf8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -836,8 +836,8 @@ svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_mf8_x2u11__SVCount_tPKu6__mfp8l(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -850,8 +850,8 @@ svmfloat8x2_t test_svld1_vnum_mf8_x2(svcount_t pn, const mfloat8_t *base, int64_
// CHECK-LABEL: @test_svld1_vnum_f16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]]
@@ -859,8 +859,8 @@ svmfloat8x2_t test_svld1_vnum_mf8_x2(svcount_t pn, const mfloat8_t *base, int64_
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f16_x4u11__SVCount_tPKDhl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]]
@@ -873,8 +873,8 @@ svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_
// CHECK-LABEL: @test_svld1_vnum_f32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]]
@@ -882,8 +882,8 @@ svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f32_x4u11__SVCount_tPKfl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]]
@@ -896,8 +896,8 @@ svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_
// CHECK-LABEL: @test_svld1_vnum_f64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]]
@@ -905,8 +905,8 @@ svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f64_x4u11__SVCount_tPKdl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]]
@@ -919,8 +919,8 @@ svfloat64x4_t test_svld1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_
// CHECK-LABEL: @test_svld1_vnum_mf8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -928,8 +928,8 @@ svfloat64x4_t test_svld1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_
// CPP-CHECK-LABEL: @_Z22test_svld1_vnum_mf8_x4u11__SVCount_tPKu6__mfp8l(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ldnt1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ldnt1.c
index 02c7586..cd92b61 100644
--- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ldnt1.c
+++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ldnt1.c
@@ -388,8 +388,8 @@ svmfloat8x4_t test_svldnt1_mf8_x4(svcount_t pn, const mfloat8_t *base) ATTR
// CHECK-LABEL: @test_svldnt1_vnum_u8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -397,8 +397,8 @@ svmfloat8x4_t test_svldnt1_mf8_x4(svcount_t pn, const mfloat8_t *base) ATTR
// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_u8_x2u11__SVCount_tPKhl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -411,8 +411,8 @@ svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t v
// CHECK-LABEL: @test_svldnt1_vnum_u16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -420,8 +420,8 @@ svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t v
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u16_x2u11__SVCount_tPKtl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -434,8 +434,8 @@ svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_
// CHECK-LABEL: @test_svldnt1_vnum_u32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -443,8 +443,8 @@ svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u32_x2u11__SVCount_tPKjl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -457,8 +457,8 @@ svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_
// CHECK-LABEL: @test_svldnt1_vnum_u64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -466,8 +466,8 @@ svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u64_x2u11__SVCount_tPKml(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -480,8 +480,8 @@ svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_
// CHECK-LABEL: @test_svldnt1_vnum_u8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -489,8 +489,8 @@ svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_
// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_u8_x4u11__SVCount_tPKhl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -503,8 +503,8 @@ svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t v
// CHECK-LABEL: @test_svldnt1_vnum_u16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -512,8 +512,8 @@ svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t v
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u16_x4u11__SVCount_tPKtl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -526,8 +526,8 @@ svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_
// CHECK-LABEL: @test_svldnt1_vnum_u32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -535,8 +535,8 @@ svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u32_x4u11__SVCount_tPKjl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -549,8 +549,8 @@ svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_
// CHECK-LABEL: @test_svldnt1_vnum_u64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -558,8 +558,8 @@ svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u64_x4u11__SVCount_tPKml(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -572,8 +572,8 @@ svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_
// CHECK-LABEL: @test_svldnt1_vnum_s8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -581,8 +581,8 @@ svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_
// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_s8_x2u11__SVCount_tPKal(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -595,8 +595,8 @@ svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnu
// CHECK-LABEL: @test_svldnt1_vnum_s16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -604,8 +604,8 @@ svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnu
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s16_x2u11__SVCount_tPKsl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -618,8 +618,8 @@ svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t
// CHECK-LABEL: @test_svldnt1_vnum_s32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -627,8 +627,8 @@ svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s32_x2u11__SVCount_tPKil(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -641,8 +641,8 @@ svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t
// CHECK-LABEL: @test_svldnt1_vnum_s64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -650,8 +650,8 @@ svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s64_x2u11__SVCount_tPKll(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -664,8 +664,8 @@ svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t
// CHECK-LABEL: @test_svldnt1_vnum_s8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -673,8 +673,8 @@ svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t
// CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_s8_x4u11__SVCount_tPKal(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -687,8 +687,8 @@ svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnu
// CHECK-LABEL: @test_svldnt1_vnum_s16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -696,8 +696,8 @@ svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnu
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s16_x4u11__SVCount_tPKsl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]]
@@ -710,8 +710,8 @@ svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t
// CHECK-LABEL: @test_svldnt1_vnum_s32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -719,8 +719,8 @@ svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s32_x4u11__SVCount_tPKil(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]]
@@ -733,8 +733,8 @@ svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t
// CHECK-LABEL: @test_svldnt1_vnum_s64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -742,8 +742,8 @@ svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s64_x4u11__SVCount_tPKll(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]]
@@ -756,8 +756,8 @@ svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t
// CHECK-LABEL: @test_svldnt1_vnum_f16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]]
@@ -765,8 +765,8 @@ svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f16_x2u11__SVCount_tPKDhl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]]
@@ -779,8 +779,8 @@ svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int6
// CHECK-LABEL: @test_svldnt1_vnum_f32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]]
@@ -788,8 +788,8 @@ svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int6
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f32_x2u11__SVCount_tPKfl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]]
@@ -802,8 +802,8 @@ svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int6
// CHECK-LABEL: @test_svldnt1_vnum_f64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]]
@@ -811,8 +811,8 @@ svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int6
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f64_x2u11__SVCount_tPKdl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]]
@@ -825,8 +825,8 @@ svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int6
// CHECK-LABEL: @test_svldnt1_vnum_mf8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -834,8 +834,8 @@ svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int6
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_mf8_x2u11__SVCount_tPKu6__mfp8l(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -848,8 +848,8 @@ svmfloat8x2_t test_svldnt1_vnum_mf8_x2(svcount_t pn, const mfloat8_t *base, int6
// CHECK-LABEL: @test_svldnt1_vnum_f16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]]
@@ -857,8 +857,8 @@ svmfloat8x2_t test_svldnt1_vnum_mf8_x2(svcount_t pn, const mfloat8_t *base, int6
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f16_x4u11__SVCount_tPKDhl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]]
@@ -871,8 +871,8 @@ svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int6
// CHECK-LABEL: @test_svldnt1_vnum_f32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]]
@@ -880,8 +880,8 @@ svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int6
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f32_x4u11__SVCount_tPKfl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]]
@@ -894,8 +894,8 @@ svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int6
// CHECK-LABEL: @test_svldnt1_vnum_f64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]]
@@ -903,8 +903,8 @@ svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int6
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f64_x4u11__SVCount_tPKdl(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]]
@@ -917,8 +917,8 @@ svfloat64x4_t test_svldnt1_vnum_f64_x4(svcount_t pn, const float64_t *base, int6
// CHECK-LABEL: @test_svldnt1_vnum_mf8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
@@ -926,8 +926,8 @@ svfloat64x4_t test_svldnt1_vnum_f64_x4(svcount_t pn, const float64_t *base, int6
// CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_mf8_x4u11__SVCount_tPKu6__mfp8l(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]]
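
Every ldnt1 hunk above applies the same reassociation: the VNUM byte offset is now computed as (vscale << 4) * vnum rather than (vnum << 4) * vscale, which lets the shift carry nuw/nsw because vscale is known positive and small. A minimal C sketch of the offset these CHECK lines verify, assuming vnum counts whole SVE register blocks of vscale * 16 bytes each (vnum_addr and its parameter names are illustrative, not part of the patch):

    #include <stdint.h>

    /* Sketch of the reassociated offset: vscale is at most 16 for SVE
     * (vector length <= 2048 bits), so vscale << 4 fits easily in i64
     * and cannot wrap -- the reason the shl now carries nuw/nsw. */
    static inline int8_t *vnum_addr(int8_t *base, int64_t vnum, uint64_t vscale) {
      uint64_t bytes_per_block = vscale << 4;        /* vscale * 16 */
      return base + (int64_t)bytes_per_block * vnum; /* multiply by VNUM last */
    }
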
diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_st1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_st1.c
index 092f31b..9920aba 100644
--- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_st1.c
+++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_st1.c
@@ -388,8 +388,8 @@ void test_svst1_mf8_x4(svcount_t pn, mfloat8_t *base, svmfloat8x4_t v) ATTR
// CHECK-LABEL: @test_svst1_vnum_u8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -397,8 +397,8 @@ void test_svst1_mf8_x4(svcount_t pn, mfloat8_t *base, svmfloat8x4_t v) ATTR
// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_u8_x2u11__SVCount_tPhl11svuint8x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -411,8 +411,8 @@ void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_
// CHECK-LABEL: @test_svst1_vnum_u16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -420,8 +420,8 @@ void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u16_x2u11__SVCount_tPtl12svuint16x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -434,8 +434,8 @@ void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16
// CHECK-LABEL: @test_svst1_vnum_u32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -443,8 +443,8 @@ void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u32_x2u11__SVCount_tPjl12svuint32x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -457,8 +457,8 @@ void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32
// CHECK-LABEL: @test_svst1_vnum_u64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -466,8 +466,8 @@ void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u64_x2u11__SVCount_tPml12svuint64x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -480,8 +480,8 @@ void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64
// CHECK-LABEL: @test_svst1_vnum_u8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -489,8 +489,8 @@ void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64
// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_u8_x4u11__SVCount_tPhl11svuint8x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -503,8 +503,8 @@ void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_
// CHECK-LABEL: @test_svst1_vnum_u16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], <vscale x 8 x i16> [[V_COERCE2:%.*]], <vscale x 8 x i16> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -512,8 +512,8 @@ void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u16_x4u11__SVCount_tPtl12svuint16x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], <vscale x 8 x i16> [[V_COERCE2:%.*]], <vscale x 8 x i16> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -526,8 +526,8 @@ void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16
// CHECK-LABEL: @test_svst1_vnum_u32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], <vscale x 4 x i32> [[V_COERCE2:%.*]], <vscale x 4 x i32> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -535,8 +535,8 @@ void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u32_x4u11__SVCount_tPjl12svuint32x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], <vscale x 4 x i32> [[V_COERCE2:%.*]], <vscale x 4 x i32> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -549,8 +549,8 @@ void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32
// CHECK-LABEL: @test_svst1_vnum_u64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], <vscale x 2 x i64> [[V_COERCE2:%.*]], <vscale x 2 x i64> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -558,8 +558,8 @@ void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u64_x4u11__SVCount_tPml12svuint64x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], <vscale x 2 x i64> [[V_COERCE2:%.*]], <vscale x 2 x i64> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -572,8 +572,8 @@ void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64
// CHECK-LABEL: @test_svst1_vnum_s8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -581,8 +581,8 @@ void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64
// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_s8_x2u11__SVCount_tPal10svint8x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -595,8 +595,8 @@ void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t
// CHECK-LABEL: @test_svst1_vnum_s16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -604,8 +604,8 @@ void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s16_x2u11__SVCount_tPsl11svint16x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -618,8 +618,8 @@ void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2
// CHECK-LABEL: @test_svst1_vnum_s32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -627,8 +627,8 @@ void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s32_x2u11__SVCount_tPil11svint32x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -641,8 +641,8 @@ void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2
// CHECK-LABEL: @test_svst1_vnum_s64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -650,8 +650,8 @@ void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s64_x2u11__SVCount_tPll11svint64x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -664,8 +664,8 @@ void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2
// CHECK-LABEL: @test_svst1_vnum_s8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -673,8 +673,8 @@ void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2
// CPP-CHECK-LABEL: @_Z21test_svst1_vnum_s8_x4u11__SVCount_tPal10svint8x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -687,8 +687,8 @@ void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t
// CHECK-LABEL: @test_svst1_vnum_s16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], <vscale x 8 x i16> [[V_COERCE2:%.*]], <vscale x 8 x i16> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -696,8 +696,8 @@ void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s16_x4u11__SVCount_tPsl11svint16x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], <vscale x 8 x i16> [[V_COERCE2:%.*]], <vscale x 8 x i16> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -710,8 +710,8 @@ void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4
// CHECK-LABEL: @test_svst1_vnum_s32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], <vscale x 4 x i32> [[V_COERCE2:%.*]], <vscale x 4 x i32> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -719,8 +719,8 @@ void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s32_x4u11__SVCount_tPil11svint32x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], <vscale x 4 x i32> [[V_COERCE2:%.*]], <vscale x 4 x i32> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -733,8 +733,8 @@ void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4
// CHECK-LABEL: @test_svst1_vnum_s64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], <vscale x 2 x i64> [[V_COERCE2:%.*]], <vscale x 2 x i64> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -742,8 +742,8 @@ void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s64_x4u11__SVCount_tPll11svint64x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], <vscale x 2 x i64> [[V_COERCE2:%.*]], <vscale x 2 x i64> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -757,8 +757,8 @@ void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[V_COERCE0:%.*]], <vscale x 8 x half> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -767,8 +767,8 @@ void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[V_COERCE0:%.*]], <vscale x 8 x half> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -782,8 +782,8 @@ void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svflo
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[V_COERCE0:%.*]], <vscale x 4 x float> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -792,8 +792,8 @@ void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svflo
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[V_COERCE0:%.*]], <vscale x 4 x float> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -807,8 +807,8 @@ void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svflo
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[V_COERCE0:%.*]], <vscale x 2 x double> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -817,8 +817,8 @@ void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svflo
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[V_COERCE0:%.*]], <vscale x 2 x double> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -831,8 +831,8 @@ void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svflo
// CHECK-LABEL: @test_svst1_vnum_mf8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -840,8 +840,8 @@ void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svflo
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_mf8_x2u11__SVCount_tPu6__mfp8l13svmfloat8x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -855,8 +855,8 @@ void test_svst1_vnum_mf8_x2(svcount_t pn, mfloat8_t *base, int64_t vnum, svmfloa
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[V_COERCE0:%.*]], <vscale x 8 x half> [[V_COERCE1:%.*]], <vscale x 8 x half> [[V_COERCE2:%.*]], <vscale x 8 x half> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -865,8 +865,8 @@ void test_svst1_vnum_mf8_x2(svcount_t pn, mfloat8_t *base, int64_t vnum, svmfloa
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[V_COERCE0:%.*]], <vscale x 8 x half> [[V_COERCE1:%.*]], <vscale x 8 x half> [[V_COERCE2:%.*]], <vscale x 8 x half> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -880,8 +880,8 @@ void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svflo
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[V_COERCE0:%.*]], <vscale x 4 x float> [[V_COERCE1:%.*]], <vscale x 4 x float> [[V_COERCE2:%.*]], <vscale x 4 x float> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -890,8 +890,8 @@ void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svflo
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[V_COERCE0:%.*]], <vscale x 4 x float> [[V_COERCE1:%.*]], <vscale x 4 x float> [[V_COERCE2:%.*]], <vscale x 4 x float> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -905,8 +905,8 @@ void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svflo
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[V_COERCE0:%.*]], <vscale x 2 x double> [[V_COERCE1:%.*]], <vscale x 2 x double> [[V_COERCE2:%.*]], <vscale x 2 x double> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -915,8 +915,8 @@ void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svflo
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[V_COERCE0:%.*]], <vscale x 2 x double> [[V_COERCE1:%.*]], <vscale x 2 x double> [[V_COERCE2:%.*]], <vscale x 2 x double> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -929,8 +929,8 @@ void test_svst1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svflo
// CHECK-LABEL: @test_svst1_vnum_mf8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -938,8 +938,8 @@ void test_svst1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svflo
// CPP-CHECK-LABEL: @_Z22test_svst1_vnum_mf8_x4u11__SVCount_tPu6__mfp8l13svmfloat8x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
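
The floating-point st1 variants above take vnum as a double, so an fptosi feeds the multiply and the mul operand in the CHECK lines is CONV instead of VNUM; the reassociated shape is otherwise identical. A hypothetical C equivalent of that variant (names are illustrative):

    #include <stdint.h>

    /* Double-vnum variant: convert first (the IR's fptosi), then apply
     * the same (vscale << 4) * vnum offset as the integer-vnum tests. */
    static inline int8_t *vnum_addr_f64(int8_t *base, double vnum,
                                        uint64_t vscale) {
      int64_t conv = (int64_t)vnum;   /* fptosi double -> i64 */
      return base + (int64_t)(vscale << 4) * conv;
    }
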
diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_stnt1.c
index 99dff2c0..90045b0 100644
--- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_stnt1.c
+++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_stnt1.c
@@ -409,8 +409,8 @@ void test_svstnt1_mf8_x4(svcount_t pn, mfloat8_t *base, svmfloat8x4_t v) ATTR
// CHECK-LABEL: @test_svstnt1_vnum_u8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -418,8 +418,8 @@ void test_svstnt1_mf8_x4(svcount_t pn, mfloat8_t *base, svmfloat8x4_t v) ATTR
// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_u8_x2u11__SVCount_tPhl11svuint8x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -433,8 +433,8 @@ void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x
// CHECK-LABEL: @test_svstnt1_vnum_u16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -442,8 +442,8 @@ void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u16_x2u11__SVCount_tPtl12svuint16x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -457,8 +457,8 @@ void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint
// CHECK-LABEL: @test_svstnt1_vnum_u32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -466,8 +466,8 @@ void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u32_x2u11__SVCount_tPjl12svuint32x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -481,8 +481,8 @@ void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint
// CHECK-LABEL: @test_svstnt1_vnum_u64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -490,8 +490,8 @@ void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u64_x2u11__SVCount_tPml12svuint64x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -505,8 +505,8 @@ void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint
// CHECK-LABEL: @test_svstnt1_vnum_u8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -514,8 +514,8 @@ void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint
// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_u8_x4u11__SVCount_tPhl11svuint8x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -529,8 +529,8 @@ void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x
// CHECK-LABEL: @test_svstnt1_vnum_u16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], <vscale x 8 x i16> [[V_COERCE2:%.*]], <vscale x 8 x i16> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -538,8 +538,8 @@ void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u16_x4u11__SVCount_tPtl12svuint16x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], <vscale x 8 x i16> [[V_COERCE2:%.*]], <vscale x 8 x i16> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -553,8 +553,8 @@ void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint
// CHECK-LABEL: @test_svstnt1_vnum_u32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], <vscale x 4 x i32> [[V_COERCE2:%.*]], <vscale x 4 x i32> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -562,8 +562,8 @@ void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u32_x4u11__SVCount_tPjl12svuint32x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], <vscale x 4 x i32> [[V_COERCE2:%.*]], <vscale x 4 x i32> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -577,8 +577,8 @@ void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint
// CHECK-LABEL: @test_svstnt1_vnum_u64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], <vscale x 2 x i64> [[V_COERCE2:%.*]], <vscale x 2 x i64> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -586,8 +586,8 @@ void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u64_x4u11__SVCount_tPml12svuint64x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], <vscale x 2 x i64> [[V_COERCE2:%.*]], <vscale x 2 x i64> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -601,8 +601,8 @@ void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint
// CHECK-LABEL: @test_svstnt1_vnum_s8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -610,8 +610,8 @@ void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint
// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_s8_x2u11__SVCount_tPal10svint8x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -625,8 +625,8 @@ void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_
// CHECK-LABEL: @test_svstnt1_vnum_s16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -634,8 +634,8 @@ void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s16_x2u11__SVCount_tPsl11svint16x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -649,8 +649,8 @@ void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16
// CHECK-LABEL: @test_svstnt1_vnum_s32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -658,8 +658,8 @@ void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s32_x2u11__SVCount_tPil11svint32x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -673,8 +673,8 @@ void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32
// CHECK-LABEL: @test_svstnt1_vnum_s64_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -682,8 +682,8 @@ void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s64_x2u11__SVCount_tPll11svint64x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -697,8 +697,8 @@ void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64
// CHECK-LABEL: @test_svstnt1_vnum_s8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -706,8 +706,8 @@ void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64
// CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_s8_x4u11__SVCount_tPal10svint8x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -721,8 +721,8 @@ void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_
// CHECK-LABEL: @test_svstnt1_vnum_s16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], <vscale x 8 x i16> [[V_COERCE2:%.*]], <vscale x 8 x i16> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -730,8 +730,8 @@ void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s16_x4u11__SVCount_tPsl11svint16x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[V_COERCE0:%.*]], <vscale x 8 x i16> [[V_COERCE1:%.*]], <vscale x 8 x i16> [[V_COERCE2:%.*]], <vscale x 8 x i16> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -745,8 +745,8 @@ void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16
// CHECK-LABEL: @test_svstnt1_vnum_s32_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], <vscale x 4 x i32> [[V_COERCE2:%.*]], <vscale x 4 x i32> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -754,8 +754,8 @@ void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s32_x4u11__SVCount_tPil11svint32x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[V_COERCE0:%.*]], <vscale x 4 x i32> [[V_COERCE1:%.*]], <vscale x 4 x i32> [[V_COERCE2:%.*]], <vscale x 4 x i32> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -769,8 +769,8 @@ void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32
// CHECK-LABEL: @test_svstnt1_vnum_s64_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], <vscale x 2 x i64> [[V_COERCE2:%.*]], <vscale x 2 x i64> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -778,8 +778,8 @@ void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s64_x4u11__SVCount_tPll11svint64x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[V_COERCE0:%.*]], <vscale x 2 x i64> [[V_COERCE1:%.*]], <vscale x 2 x i64> [[V_COERCE2:%.*]], <vscale x 2 x i64> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -794,8 +794,8 @@ void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[V_COERCE0:%.*]], <vscale x 8 x half> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -804,8 +804,8 @@ void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[V_COERCE0:%.*]], <vscale x 8 x half> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -820,8 +820,8 @@ void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svf
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[V_COERCE0:%.*]], <vscale x 4 x float> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -830,8 +830,8 @@ void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svf
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[V_COERCE0:%.*]], <vscale x 4 x float> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -846,8 +846,8 @@ void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svf
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[V_COERCE0:%.*]], <vscale x 2 x double> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -856,8 +856,8 @@ void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svf
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[V_COERCE0:%.*]], <vscale x 2 x double> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -870,8 +870,8 @@ void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svf
// CHECK-LABEL: @test_svstnt1_vnum_mf8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -879,8 +879,8 @@ void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svf
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_mf8_x2u11__SVCount_tPu6__mfp8l13svmfloat8x2_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -894,8 +894,8 @@ void test_svstnt1_vnum_mf8_x2(svcount_t pn, mfloat8_t *base, int64_t vnum, svmfl
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[V_COERCE0:%.*]], <vscale x 8 x half> [[V_COERCE1:%.*]], <vscale x 8 x half> [[V_COERCE2:%.*]], <vscale x 8 x half> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -904,8 +904,8 @@ void test_svstnt1_vnum_mf8_x2(svcount_t pn, mfloat8_t *base, int64_t vnum, svmfl
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[V_COERCE0:%.*]], <vscale x 8 x half> [[V_COERCE1:%.*]], <vscale x 8 x half> [[V_COERCE2:%.*]], <vscale x 8 x half> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -920,8 +920,8 @@ void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svf
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[V_COERCE0:%.*]], <vscale x 4 x float> [[V_COERCE1:%.*]], <vscale x 4 x float> [[V_COERCE2:%.*]], <vscale x 4 x float> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -930,8 +930,8 @@ void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svf
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[V_COERCE0:%.*]], <vscale x 4 x float> [[V_COERCE1:%.*]], <vscale x 4 x float> [[V_COERCE2:%.*]], <vscale x 4 x float> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -946,8 +946,8 @@ void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svf
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[V_COERCE0:%.*]], <vscale x 2 x double> [[V_COERCE1:%.*]], <vscale x 2 x double> [[V_COERCE2:%.*]], <vscale x 2 x double> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -956,8 +956,8 @@ void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svf
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[CONV]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[CONV]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[V_COERCE0:%.*]], <vscale x 2 x double> [[V_COERCE1:%.*]], <vscale x 2 x double> [[V_COERCE2:%.*]], <vscale x 2 x double> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
@@ -970,8 +970,8 @@ void test_svstnt1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svf
// CHECK-LABEL: @test_svstnt1_vnum_mf8_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CHECK-NEXT: ret void
@@ -979,8 +979,8 @@ void test_svstnt1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svf
// CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_mf8_x4u11__SVCount_tPu6__mfp8l13svmfloat8x4_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
-// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[V_COERCE0:%.*]], <vscale x 16 x i8> [[V_COERCE1:%.*]], <vscale x 16 x i8> [[V_COERCE2:%.*]], <vscale x 16 x i8> [[V_COERCE3:%.*]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
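A note on the recurring IR change above: the byte offset being computed is vnum * 16 * vscale. The old form shifted vnum first (shl i64 %vnum, 4) and then multiplied by vscale; the new form shifts the vscale result first (shl nuw nsw i64 %tmp, 4), which lets the shift carry the nuw/nsw flags since 16 * vscale cannot overflow, and then multiplies by the possibly negative vnum. The address computed is unchanged; only the operand order and the overflow flags differ.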
diff --git a/clang/test/DebugInfo/AArch64/sve-vector-types.c b/clang/test/DebugInfo/AArch64/sve-vector-types.c
index ca592b1..ed1dbd1 100644
--- a/clang/test/DebugInfo/AArch64/sve-vector-types.c
+++ b/clang/test/DebugInfo/AArch64/sve-vector-types.c
@@ -3,10 +3,10 @@
void test_locals(void) {
// CHECK-DAG: name: "__SVBool_t",{{.*}}, baseType: ![[CT1:[0-9]+]]
- // CHECK-DAG: ![[CT1]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYU8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64:[0-9]+]])
+ // CHECK-DAG: ![[CT1]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYU8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8:[0-9]+]], bitStride: i64 1)
// CHECK-DAG: ![[ELTTYU8]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
- // CHECK-DAG: ![[ELTS1_64]] = !{![[REALELTS1_64:[0-9]+]]}
- // CHECK-DAG: ![[REALELTS1_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 1, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+ // CHECK-DAG: ![[ELTS8]] = !{![[REALELTS8:[0-9]+]]}
+ // CHECK-DAG: ![[REALELTS8]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
__SVBool_t b8;
// CHECK-DAG: name: "__SVCount_t",{{.*}}, baseType: ![[CT1_2:[0-9]+]]
@@ -18,8 +18,6 @@ void test_locals(void) {
// CHECK-DAG: name: "__SVInt8_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
// CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYS8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8:[0-9]+]])
// CHECK-DAG: ![[ELTTYS8]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char)
- // CHECK-DAG: ![[ELTS8]] = !{![[REALELTS8:[0-9]+]]}
- // CHECK-DAG: ![[REALELTS8]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
__SVInt8_t s8;
// CHECK-DAG: name: "__SVUint8_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
@@ -51,12 +49,14 @@ void test_locals(void) {
__SVUint32_t u32;
// CHECK-DAG: name: "__SVInt64_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
- // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64]])
+ // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS64:[0-9]+]])
// CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long", size: 64, encoding: DW_ATE_signed)
+ // CHECK-DAG: ![[ELTS64]] = !{![[REALELTS64:[0-9]+]]}
+ // CHECK-DAG: ![[REALELTS64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 1, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
__SVInt64_t s64;
// CHECK-DAG: name: "__SVUint64_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
- // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64]])
+ // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS64]])
// CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "unsigned long", size: 64, encoding: DW_ATE_unsigned)
__SVUint64_t u64;
@@ -71,7 +71,7 @@ void test_locals(void) {
__SVFloat32_t f32;
// CHECK: name: "__SVFloat64_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
- // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64]])
+ // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS64]])
// CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
__SVFloat64_t f64;
}
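For reference on the bounds above: DW_OP_bregx 46 reads the AArch64 VG register, the vector length in 64-bit granules (VG = 2 * vscale). An <vscale x 16 x i8> vector therefore has 8 * VG elements, so its upper bound is 8 * VG - 1 (the constu 8 form), while vectors of 64-bit elements have VG elements (the constu 1 form). On a 512-bit SVE implementation, VG = 8, and the two bounds evaluate to 63 and 7.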
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index a94299e..2f40fd4 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -207,7 +207,7 @@
//
// AMDGCNSPIRV: "-cc1" "-triple" "spirv64-amd-amdhsa" {{.*}}"-emit-llvm-bc" {{.*}}"-fembed-bitcode=marker" "-disable-llvm-passes" {{.*}} "-o" "[[AMDGCNSPV_BC:.*bc]]"
-// AMDGCNSPIRV: {{".*llvm-link.*"}} "-o" "[[AMDGCNSPV_TMP:.*out]]" "[[AMDGCNSPV_BC]]"
+// AMDGCNSPIRV: {{".*llvm-link.*"}} "-o" "[[AMDGCNSPV_TMP:.*bc]]" "[[AMDGCNSPV_BC]]"
// AMDGCNSPIRV: {{".*llvm-spirv.*"}} "--spirv-max-version=1.6" "--spirv-ext=+all" {{.*}} "[[AMDGCNSPV_TMP]]" {{.*}}"-o" "[[AMDGCNSPV_CO:.*out]]"
// AMDGCNSPIRV: "-cc1" "-triple" "amdgcn-amd-amdhsa" {{.*}}"-emit-obj" {{.*}}"-target-cpu" "gfx900"{{.*}} "-o" "[[GFX900_OBJ:.*o]]"
// AMDGCNSPIRV: {{".*lld.*"}} {{.*}}"-plugin-opt=mcpu=gfx900" {{.*}} "-o" "[[GFX900_CO:.*out]]" {{.*}}"[[GFX900_OBJ]]"
diff --git a/clang/test/Driver/spirv-amd-toolchain.c b/clang/test/Driver/spirv-amd-toolchain.c
index 14ba8f4..8f1f0f3 100644
--- a/clang/test/Driver/spirv-amd-toolchain.c
+++ b/clang/test/Driver/spirv-amd-toolchain.c
@@ -15,5 +15,5 @@
// RUN: %clang -### --target=spirv64-amd-amdhsa %s -nogpulib -nogpuinc 2>&1 \
// RUN: | FileCheck %s --check-prefix=INVOCATION
// INVOCATION: "-cc1" "-triple" "spirv64-amd-amdhsa" {{.*}}"-disable-llvm-optzns" {{.*}} "-o" "[[OUTPUT:.+]]" "-x" "c"
-// INVOCATION: "{{.*}}llvm-link" "-o" "a.out" "[[OUTPUT]]"
-// INVOCATION: "{{.*}}llvm-spirv" "--spirv-max-version=1.6" "--spirv-ext=+all" "--spirv-allow-unknown-intrinsics" "--spirv-lower-const-expr" "--spirv-preserve-auxdata" "--spirv-debug-info-version=nonsemantic-shader-200" "a.out" "-o" "a.out"
+// INVOCATION: "{{.*}}llvm-link" "-o" "[[LINKED_OUTPUT:.+]]" "[[OUTPUT]]"
+// INVOCATION: "{{.*}}llvm-spirv" "--spirv-max-version=1.6" "--spirv-ext=+all" "--spirv-allow-unknown-intrinsics" "--spirv-lower-const-expr" "--spirv-preserve-auxdata" "--spirv-debug-info-version=nonsemantic-shader-200" "[[LINKED_OUTPUT]]" "-o" "a.out"
diff --git a/clang/test/OpenMP/reduction_complex.c b/clang/test/OpenMP/reduction_complex.c
index e00caa8..b79903ff 100644
--- a/clang/test/OpenMP/reduction_complex.c
+++ b/clang/test/OpenMP/reduction_complex.c
@@ -10,6 +10,17 @@
// RUN: -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc \
// RUN: -o - | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
+// RUN: -triple powerpc64le-unknown-unknown \
+// RUN: -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o \
+// RUN: %t-ppc-host-spv.bc
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
+// RUN: -triple spirv64-intel -DCUA \
+// RUN: -fopenmp-targets=spirv64-intel -emit-llvm %s \
+// RUN: -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host-spv.bc \
+// RUN: -o - | FileCheck %s --check-prefix CHECK
+
// expected-no-diagnostics
int foo() {
int i;
@@ -46,9 +57,9 @@ int foo() {
// CHECK-NEXT: %[[VAL_244:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_232]], i64 0, i64 0
// CHECK-NEXT: %[[VAL_245:.*]] = getelementptr { float, float }, ptr %[[VAL_243]], i64 1
// CHECK-NEXT: %[[VAL_246:.*]] = load i64, ptr %[[VAL_243]], align 8
-// CHECK-NEXT: %[[VAL_247:.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-NEXT: %[[VAL_247:.*]] = call{{.*}}i32 @__kmpc_get_warp_size()
// CHECK-NEXT: %[[VAL_248:.*]] = trunc i32 %[[VAL_247]] to i16
-// CHECK-NEXT: %[[VAL_249:.*]] = call i64 @__kmpc_shuffle_int64(i64 %[[VAL_246]], i16 %[[VAL_240]], i16 %[[VAL_248]])
+// CHECK-NEXT: %[[VAL_249:.*]] = call{{.*}}i64 @__kmpc_shuffle_int64(i64 %[[VAL_246]], i16 %[[VAL_240]], i16 %[[VAL_248]])
// CHECK-NEXT: store i64 %[[VAL_249]], ptr %[[VAL_233]], align 8
// CHECK-NEXT: %[[VAL_250:.*]] = getelementptr i64, ptr %[[VAL_243]], i64 1
// CHECK-NEXT: %[[VAL_251:.*]] = getelementptr i64, ptr %[[VAL_233]], i64 1
@@ -67,7 +78,7 @@ int foo() {
// CHECK-NEXT: %[[VAL_263:.*]] = or i1 %[[VAL_262]], %[[VAL_261]]
// CHECK-NEXT: br i1 %[[VAL_263]], label %[[VAL_264:.*]], label %[[VAL_265:.*]]
// CHECK: then: ; preds = %[[VAL_266:.*]]
-// CHECK-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l{{[0-9]+}}_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr %[[VAL_238]], ptr %[[VAL_232]]) #2
+// CHECK-NEXT: call{{.*}}void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l{{[0-9]+}}_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr %[[VAL_238]], ptr %[[VAL_232]]) #2
// CHECK-NEXT: br label %[[VAL_267:.*]]
// CHECK: else: ; preds = %[[VAL_266]]
// CHECK-NEXT: br label %[[VAL_267]]
diff --git a/clang/test/Sema/incompatible-function-pointer-types-extinfo.c b/clang/test/Sema/incompatible-function-pointer-types-extinfo.c
new file mode 100644
index 0000000..b4de356
--- /dev/null
+++ b/clang/test/Sema/incompatible-function-pointer-types-extinfo.c
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -fsyntax-only %s -std=c99 -verify=expected
+// RUN: %clang_cc1 -fsyntax-only %s -std=c11 -verify=expected
+// RUN: %clang_cc1 -fsyntax-only %s -std=c23 -verify=expected,proto
+
+// This verifies that -Wincompatible-function-pointer-types diagnostics for
+// extended function type information are consistent, also in the case of
+// other function type differences that C allows.
+//
+// Test case adapted from issue #41465, with suggestions from PR #160477.
+
+enum E { A = -1, B };
+
+// Case 1: assignment adding noreturn
+
+int f1 (int);
+int (*fp1a)(int) __attribute__((noreturn)) = &f1; // expected-error {{incompatible function pointer types}}
+enum E (*fp1b)(int) __attribute__((noreturn)) = &f1; // expected-error {{incompatible function pointer types}}
+int (*fp1c)() __attribute__((noreturn)) = &f1; // expected-error {{incompatible function pointer types}}
+
+// Case 2: assignment adding noescape on arg
+
+int f2 (int* ) __attribute__((noreturn));
+int (*fp2a)(int* __attribute__((noescape))) __attribute__((noreturn)) = &f2; // expected-error {{incompatible function pointer types}}
+int (*fp2b)(int* __attribute__((noescape))) = &f2; // expected-error {{incompatible function pointer types}}
+
+// Case 3: assignment adding cfi_unchecked_callee
+
+int f3 (int* );
+int (*fp3a)(int* ) __attribute__((noreturn )) = &f3; // expected-error {{incompatible function pointer types}}
+int (*fp3b)(int* __attribute__((noescape))) = &f3; // expected-error {{incompatible function pointer types}}
+int (*fp3c)(int* ) __attribute__((noreturn,cfi_unchecked_callee)) = &f3; // expected-error {{incompatible function pointer types}}
+int (*fp3d)(int* __attribute__((noescape))) __attribute__(( cfi_unchecked_callee)) = &f3; // expected-error {{incompatible function pointer types}}
+int (*fp3e)(int* __attribute__((noescape))) __attribute__((noreturn,cfi_unchecked_callee)) = &f3; // expected-error {{incompatible function pointer types}}
+
+// Case 4: assignment converting between prototype/no-prototype
+
+void p1(void);
+void i1(int );
+void n1( );
+void p2(void) __attribute__((noreturn));
+void i2(int ) __attribute__((noreturn));
+void n2( ) __attribute__((noreturn));
+
+void (*t1)() = p1;
+void (*t2)() = i1; // proto-error {{incompatible function pointer types}}
+void (*t3)() = n1;
+void (*t4)() __attribute__((noreturn)) = p1; // expected-error {{incompatible function pointer types}}
+void (*t5)() __attribute__((noreturn)) = i1; // expected-error {{incompatible function pointer types}}
+void (*t6)() __attribute__((noreturn)) = n1; // expected-error {{incompatible function pointer types}}
+
+void (*t7)() = p2;
+void (*t8)() = i2; // proto-error {{incompatible function pointer types}}
+void (*t9)() = n2;
+void (*tA)() __attribute__((noreturn)) = p2;
+void (*tB)() __attribute__((noreturn)) = i2; // proto-error {{incompatible function pointer types}}
+void (*tC)() __attribute__((noreturn)) = n2;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index 721c39d..0040f79 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -105,6 +105,10 @@ extern "C" {
mach_msg_type_number_t *infoCnt);
}
+// Weak no-op definition; TSan provides a strong override when it is linked.
+SANITIZER_WEAK_ATTRIBUTE extern void __tsan_set_in_internal_write_call(
+ bool value) {}
+
namespace __sanitizer {
#include "sanitizer_syscall_generic.inc"
@@ -175,7 +179,11 @@ uptr internal_read(fd_t fd, void *buf, uptr count) {
}
uptr internal_write(fd_t fd, const void *buf, uptr count) {
- return write(fd, buf, count);
+  // Disable TSan interceptors around this write to avoid self-deadlock.
+ __tsan_set_in_internal_write_call(true);
+ uptr res = write(fd, buf, count);
+ __tsan_set_in_internal_write_call(false);
+ return res;
}
uptr internal_stat(const char *path, void *buf) {
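A minimal sketch of the weak-hook pattern used by internal_write above, assuming separate translation units; the file and symbol names are illustrative, not the sanitizer's real layout. A weak no-op default is linked when no runtime provides the hook, and a strong definition overrides it at link time:

// common.cpp: weak default, used when no stronger definition is linked in.
extern "C" __attribute__((weak)) void set_in_write_call(bool) {}

// writer.cpp: callers may invoke the hook unconditionally.
#include <unistd.h>
extern "C" void set_in_write_call(bool);

ssize_t guarded_write(int fd, const void *buf, size_t n) {
  set_in_write_call(true);  // no-op unless a runtime overrides the hook
  ssize_t res = write(fd, buf, n);
  set_in_write_call(false);
  return res;
}

// runtime.cpp: the strong definition wins over the weak default at link time.
extern "C" void set_in_write_call(bool value) {
  (void)value;  // a real runtime would record per-thread state here
}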
diff --git a/compiler-rt/lib/tsan/rtl/tsan_flags.cpp b/compiler-rt/lib/tsan/rtl/tsan_flags.cpp
index 3fd58f4..50632d2 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_flags.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_flags.cpp
@@ -20,6 +20,43 @@
#include "tsan_rtl.h"
#include "ubsan/ubsan_flags.h"
+#if SANITIZER_APPLE
+namespace __sanitizer {
+
+template <>
+inline bool FlagHandler<LockDuringWriteSetting>::Parse(const char *value) {
+ if (internal_strcmp(value, "on") == 0) {
+ *t_ = kLockDuringAllWrites;
+ return true;
+ }
+ if (internal_strcmp(value, "disable_for_current_process") == 0) {
+ *t_ = kNoLockDuringWritesCurrentProcess;
+ return true;
+ }
+ if (internal_strcmp(value, "disable_for_all_processes") == 0) {
+ *t_ = kNoLockDuringWritesAllProcesses;
+ return true;
+ }
+  Printf("ERROR: Invalid value for lock_during_write option: '%s'\n", value);
+ return false;
+}
+
+template <>
+inline bool FlagHandler<LockDuringWriteSetting>::Format(char *buffer,
+ uptr size) {
+ switch (*t_) {
+ case kLockDuringAllWrites:
+ return FormatString(buffer, size, "on");
+ case kNoLockDuringWritesCurrentProcess:
+ return FormatString(buffer, size, "disable_for_current_process");
+ case kNoLockDuringWritesAllProcesses:
+ return FormatString(buffer, size, "disable_for_all_processes");
+ }
+}
+
+} // namespace __sanitizer
+#endif
+
namespace __tsan {
// Can be overriden in frontend.
diff --git a/compiler-rt/lib/tsan/rtl/tsan_flags.h b/compiler-rt/lib/tsan/rtl/tsan_flags.h
index da27d5b..477d08d 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_flags.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_flags.h
@@ -16,6 +16,14 @@
#include "sanitizer_common/sanitizer_flags.h"
#include "sanitizer_common/sanitizer_deadlock_detector_interface.h"
+#if SANITIZER_APPLE
+enum LockDuringWriteSetting {
+ kLockDuringAllWrites,
+ kNoLockDuringWritesCurrentProcess,
+ kNoLockDuringWritesAllProcesses,
+};
+#endif
+
namespace __tsan {
struct Flags : DDFlags {
diff --git a/compiler-rt/lib/tsan/rtl/tsan_flags.inc b/compiler-rt/lib/tsan/rtl/tsan_flags.inc
index 731d776..64cc091 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_flags.inc
+++ b/compiler-rt/lib/tsan/rtl/tsan_flags.inc
@@ -80,3 +80,15 @@ TSAN_FLAG(bool, shared_ptr_interceptor, true,
TSAN_FLAG(bool, print_full_thread_history, false,
"If set, prints thread creation stacks for the threads involved in "
"the report and their ancestors up to the main thread.")
+
+#if SANITIZER_APPLE
+TSAN_FLAG(LockDuringWriteSetting, lock_during_write, kLockDuringAllWrites,
+ "Determines whether to obtain a lock while writing logs or error "
+ "reports. "
+ "\"on\" - [default] lock during all writes. "
+          "\"disable_for_current_process\" - don't lock during writes in "
+          "the current process, but do lock during writes in child "
+          "processes. "
+          "\"disable_for_all_processes\" - don't lock during writes in "
+          "the current process or its child processes.")
+#endif
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors.h b/compiler-rt/lib/tsan/rtl/tsan_interceptors.h
index a357a87..d4b65ab 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors.h
@@ -1,6 +1,9 @@
#ifndef TSAN_INTERCEPTORS_H
#define TSAN_INTERCEPTORS_H
+#if SANITIZER_APPLE
+# include "sanitizer_common/sanitizer_mac.h"
+#endif
#include "sanitizer_common/sanitizer_stacktrace.h"
#include "tsan_rtl.h"
@@ -43,7 +46,12 @@ inline bool in_symbolizer() {
#endif
inline bool MustIgnoreInterceptor(ThreadState *thr) {
- return !thr->is_inited || thr->ignore_interceptors || thr->in_ignored_lib;
+ return !thr->is_inited || thr->ignore_interceptors || thr->in_ignored_lib
+#if SANITIZER_APPLE
+ || (flags()->lock_during_write != kLockDuringAllWrites &&
+ thr->in_internal_write_call)
+#endif
+ ;
}
} // namespace __tsan
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
index 37c69b1..0c35804 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
@@ -31,6 +31,9 @@
#include "sanitizer_common/sanitizer_tls_get_addr.h"
#include "sanitizer_common/sanitizer_vector.h"
#include "tsan_fd.h"
+#if SANITIZER_APPLE
+# include "tsan_flags.h"
+#endif
#include "tsan_interceptors.h"
#include "tsan_interface.h"
#include "tsan_mman.h"
@@ -1665,6 +1668,14 @@ TSAN_INTERCEPTOR(int, pthread_barrier_wait, void *b) {
TSAN_INTERCEPTOR(int, pthread_once, void *o, void (*f)()) {
SCOPED_INTERCEPTOR_RAW(pthread_once, o, f);
+#if SANITIZER_APPLE
+ if (flags()->lock_during_write != kLockDuringAllWrites &&
+ cur_thread_init()->in_internal_write_call) {
+ // This is needed to make it through process launch without hanging
+ f();
+ return 0;
+ }
+#endif
if (o == 0 || f == 0)
return errno_EINVAL;
atomic_uint32_t *a;
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
index 0d7247a..b8041d7 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
@@ -40,6 +40,13 @@ SANITIZER_WEAK_DEFAULT_IMPL
void __tsan_test_only_on_fork() {}
#endif
+#if SANITIZER_APPLE
+// Override weak symbol from sanitizer_common
+extern void __tsan_set_in_internal_write_call(bool value) {
+ __tsan::cur_thread_init()->in_internal_write_call = value;
+}
+#endif
+
namespace __tsan {
#if !SANITIZER_GO
@@ -893,6 +900,13 @@ void ForkChildAfter(ThreadState* thr, uptr pc, bool start_thread) {
ThreadIgnoreBegin(thr, pc);
ThreadIgnoreSyncBegin(thr, pc);
}
+
+# if SANITIZER_APPLE
+  // Inheritance of this flag can be disabled; we are the child, so act
+  // accordingly.
+ if (flags()->lock_during_write == kNoLockDuringWritesCurrentProcess)
+ flags()->lock_during_write = kLockDuringAllWrites;
+# endif
}
#endif
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
index 0b6d5f0..77390f0 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
@@ -236,6 +236,10 @@ struct alignas(SANITIZER_CACHE_LINE_SIZE) ThreadState {
const ReportDesc *current_report;
+#if SANITIZER_APPLE
+ bool in_internal_write_call;
+#endif
+
explicit ThreadState(Tid tid);
};
diff --git a/compiler-rt/test/tsan/Darwin/write-interpose.c b/compiler-rt/test/tsan/Darwin/write-interpose.c
new file mode 100644
index 0000000..cbd9a08
--- /dev/null
+++ b/compiler-rt/test/tsan/Darwin/write-interpose.c
@@ -0,0 +1,50 @@
+// Test that dylibs that interpose write() and then call functions
+// intercepted by TSan don't deadlock (self-lock).
+
+// RUN: %clang_tsan %s -o %t
+// RUN: %clang_tsan %s -o %t.dylib -fno-sanitize=thread -dynamiclib -DSHARED_LIB
+
+// Note that running the command below without the `lock_during_write`
+// option would deadlock (self-lock).
+// RUN: env DYLD_INSERT_LIBRARIES=%t.dylib TSAN_OPTIONS=verbosity=2:lock_during_write=disable_for_current_process %run %t 2>&1 | FileCheck %s
+
+#include <stdio.h>
+
+#if defined(SHARED_LIB)
+
+// dylib implementation - interposes write() calls
+# include <os/lock.h>
+# include <unistd.h>
+
+struct interpose_substitution {
+ const void *replacement;
+ const void *original;
+};
+
+# define INTERPOSE(replacement, original) \
+ __attribute__((used)) static const struct interpose_substitution \
+ substitution_##original[] \
+ __attribute__((section("__DATA, __interpose"))) = { \
+ {(const void *)(replacement), (const void *)(original)}}
+
+static ssize_t my_write(int fd, const void *buf, size_t count) {
+ struct os_unfair_lock_s lock = OS_UNFAIR_LOCK_INIT;
+ os_unfair_lock_lock(&lock);
+ printf("Interposed write called: fd=%d, count=%zu\n", fd, count);
+ ssize_t res = write(fd, buf, count);
+ os_unfair_lock_unlock(&lock);
+ return res;
+}
+INTERPOSE(my_write, write);
+
+#else // defined(SHARED_LIB)
+
+int main() {
+ printf("Write test completed\n");
+ return 0;
+}
+
+#endif // defined(SHARED_LIB)
+
+// CHECK: Interposed write called: fd={{[0-9]+}}, count={{[0-9]+}}
+// CHECK: Write test completed
diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index 0ffe27e..f8322a5 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -123,6 +123,10 @@ public:
virtual Fortran::lower::SymMap::StorageDesc
getSymbolStorage(SymbolRef sym) = 0;
+ /// Return the symbol map used to map each semantics::Symbol to its SSA
+ /// value in the generated MLIR.
+ virtual Fortran::lower::SymMap &getSymbolMap() = 0;
+
/// Override lowering of expression with pre-lowered values.
/// Associate mlir::Value to evaluate::Expr. All subsequent call to
/// genExprXXX() will replace any occurrence of an overridden
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 780d56f..3a022e1 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -643,6 +643,8 @@ public:
return localSymbols.lookupStorage(sym);
}
+ Fortran::lower::SymMap &getSymbolMap() override final { return localSymbols; }
+
void
overrideExprValues(const Fortran::lower::ExprToValueMap *map) override final {
exprValueOverrides = map;
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 4a9e494..742f58f 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -20,6 +20,7 @@
#include "flang/Lower/PFTBuilder.h"
#include "flang/Lower/StatementContext.h"
#include "flang/Lower/Support/Utils.h"
+#include "flang/Lower/SymbolMap.h"
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/Complex.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
@@ -33,6 +34,7 @@
#include "flang/Semantics/scope.h"
#include "flang/Semantics/tools.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/IR/IRMapping.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
@@ -60,6 +62,16 @@ static llvm::cl::opt<bool> lowerDoLoopToAccLoop(
llvm::cl::desc("Whether to lower do loops as `acc.loop` operations."),
llvm::cl::init(true));
+static llvm::cl::opt<bool> enableSymbolRemapping(
+ "openacc-remap-symbols",
+ llvm::cl::desc("Whether to remap symbols that appears in data clauses."),
+ llvm::cl::init(true));
+
+static llvm::cl::opt<bool> enableDevicePtrRemap(
+ "openacc-remap-device-ptr-symbols",
+ llvm::cl::desc("sub-option of openacc-remap-symbols for deviceptr clause"),
+ llvm::cl::init(false));
+
// Special value for * passed in device_type or gang clauses.
static constexpr std::int64_t starCst = -1;
@@ -624,17 +636,19 @@ void genAtomicCapture(Fortran::lower::AbstractConverter &converter,
}
template <typename Op>
-static void
-genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
- Fortran::lower::AbstractConverter &converter,
- Fortran::semantics::SemanticsContext &semanticsContext,
- Fortran::lower::StatementContext &stmtCtx,
- llvm::SmallVectorImpl<mlir::Value> &dataOperands,
- mlir::acc::DataClause dataClause, bool structured,
- bool implicit, llvm::ArrayRef<mlir::Value> async,
- llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
- llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes,
- bool setDeclareAttr = false) {
+static void genDataOperandOperations(
+ const Fortran::parser::AccObjectList &objectList,
+ Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::lower::StatementContext &stmtCtx,
+ llvm::SmallVectorImpl<mlir::Value> &dataOperands,
+ mlir::acc::DataClause dataClause, bool structured, bool implicit,
+ llvm::ArrayRef<mlir::Value> async,
+ llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
+ llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes,
+ bool setDeclareAttr = false,
+ llvm::SmallVectorImpl<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ *symbolPairs = nullptr) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext};
const bool unwrapBoxAddr = true;
@@ -655,6 +669,9 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
/*strideIncludeLowerExtent=*/strideIncludeLowerExtent);
LLVM_DEBUG(llvm::dbgs() << __func__ << "\n"; info.dump(llvm::dbgs()));
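+ // Only whole-symbol references are candidates for remapping: a section or
+ // component designator covers just part of the symbol's storage, so its
+ // data operation result cannot stand in for the symbol itself.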
+ bool isWholeSymbol =
+ !designator || Fortran::evaluate::UnwrapWholeSymbolDataRef(*designator);
+
// If the input value is optional and is not a descriptor, we use the
// rawInput directly.
mlir::Value baseAddr = ((fir::unwrapRefType(info.addr.getType()) !=
@@ -668,6 +685,11 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
asyncOnlyDeviceTypes, unwrapBoxAddr, info.isPresent);
dataOperands.push_back(op.getAccVar());
+ // Track the symbol and its corresponding mlir::Value if requested.
+ if (symbolPairs && isWholeSymbol)
+ symbolPairs->emplace_back(op.getAccVar(),
+ Fortran::semantics::SymbolRef(symbol));
+
// For UseDeviceOp, if operand is one of a pair resulting from a
// declare operation, create a UseDeviceOp for the other operand as well.
if constexpr (std::is_same_v<Op, mlir::acc::UseDeviceOp>) {
@@ -681,6 +703,8 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
asyncDeviceTypes, asyncOnlyDeviceTypes,
unwrapBoxAddr, info.isPresent);
dataOperands.push_back(op.getAccVar());
+ // Not adding this to symbolPairs because it only makes sense to
+ // map the symbol to a single value.
}
}
}
@@ -1264,7 +1288,9 @@ static void genPrivatizationRecipes(
llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
llvm::ArrayRef<mlir::Value> async,
llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
- llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes) {
+ llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes,
+ llvm::SmallVectorImpl<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ *symbolPairs = nullptr) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext};
for (const auto &accObject : objectList.v) {
@@ -1284,6 +1310,9 @@ static void genPrivatizationRecipes(
/*strideIncludeLowerExtent=*/strideIncludeLowerExtent);
LLVM_DEBUG(llvm::dbgs() << __func__ << "\n"; info.dump(llvm::dbgs()));
+ bool isWholeSymbol =
+ !designator || Fortran::evaluate::UnwrapWholeSymbolDataRef(*designator);
+
RecipeOp recipe;
mlir::Type retTy = getTypeFromBounds(bounds, info.addr.getType());
if constexpr (std::is_same_v<RecipeOp, mlir::acc::PrivateRecipeOp>) {
@@ -1297,6 +1326,11 @@ static void genPrivatizationRecipes(
/*implicit=*/false, mlir::acc::DataClause::acc_private, retTy, async,
asyncDeviceTypes, asyncOnlyDeviceTypes, /*unwrapBoxAddr=*/true);
dataOperands.push_back(op.getAccVar());
+
+ // Track the symbol and its corresponding mlir::Value if requested.
+ if (symbolPairs && isWholeSymbol)
+ symbolPairs->emplace_back(op.getAccVar(),
+ Fortran::semantics::SymbolRef(symbol));
} else {
std::string suffix =
areAllBoundConstant(bounds) ? getBoundsString(bounds) : "";
@@ -1310,6 +1344,11 @@ static void genPrivatizationRecipes(
async, asyncDeviceTypes, asyncOnlyDeviceTypes,
/*unwrapBoxAddr=*/true);
dataOperands.push_back(op.getAccVar());
+
+ // Track the symbol and its corresponding mlir::Value if requested.
+ if (symbolPairs && isWholeSymbol)
+ symbolPairs->emplace_back(op.getAccVar(),
+ Fortran::semantics::SymbolRef(symbol));
}
privatizationRecipes.push_back(mlir::SymbolRefAttr::get(
builder.getContext(), recipe.getSymName().str()));
@@ -1949,15 +1988,16 @@ mlir::Type getTypeFromIvTypeSize(fir::FirOpBuilder &builder,
return builder.getIntegerType(ivTypeSize * 8);
}
-static void
-privatizeIv(Fortran::lower::AbstractConverter &converter,
- const Fortran::semantics::Symbol &sym, mlir::Location loc,
- llvm::SmallVector<mlir::Type> &ivTypes,
- llvm::SmallVector<mlir::Location> &ivLocs,
- llvm::SmallVector<mlir::Value> &privateOperands,
- llvm::SmallVector<mlir::Value> &ivPrivate,
- llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
- bool isDoConcurrent = false) {
+static void privatizeIv(
+ Fortran::lower::AbstractConverter &converter,
+ const Fortran::semantics::Symbol &sym, mlir::Location loc,
+ llvm::SmallVector<mlir::Type> &ivTypes,
+ llvm::SmallVector<mlir::Location> &ivLocs,
+ llvm::SmallVector<mlir::Value> &privateOperands,
+ llvm::SmallVector<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ &ivPrivate,
+ llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+ bool isDoConcurrent = false) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
mlir::Type ivTy = getTypeFromIvTypeSize(builder, sym);
@@ -2001,15 +2041,8 @@ privatizeIv(Fortran::lower::AbstractConverter &converter,
builder.getContext(), recipe.getSymName().str()));
}
- // Map the new private iv to its symbol for the scope of the loop. bindSymbol
- // might create a hlfir.declare op, if so, we map its result in order to
- // use the sym value in the scope.
- converter.bindSymbol(sym, mlir::acc::getAccVar(privateOp));
- auto privateValue = converter.getSymbolAddress(sym);
- if (auto declareOp =
- mlir::dyn_cast<hlfir::DeclareOp>(privateValue.getDefiningOp()))
- privateValue = declareOp.getResults()[0];
- ivPrivate.push_back(privateValue);
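+ // The symbol-to-private-value binding is deferred: buildACCLoopOp appends
+ // these pairs to the data clause pairs, and remapDataOperandSymbols
+ // recreates the hlfir.declare inside the loop region.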
+ ivPrivate.emplace_back(mlir::acc::getAccVar(privateOp),
+ Fortran::semantics::SymbolRef(sym));
}
static void determineDefaultLoopParMode(
@@ -2088,7 +2121,8 @@ static void processDoLoopBounds(
llvm::SmallVector<mlir::Value> &upperbounds,
llvm::SmallVector<mlir::Value> &steps,
llvm::SmallVector<mlir::Value> &privateOperands,
- llvm::SmallVector<mlir::Value> &ivPrivate,
+ llvm::SmallVector<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ &ivPrivate,
llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
llvm::SmallVector<mlir::Type> &ivTypes,
llvm::SmallVector<mlir::Location> &ivLocs,
@@ -2178,26 +2212,122 @@ static void processDoLoopBounds(
}
}
-static mlir::acc::LoopOp
-buildACCLoopOp(Fortran::lower::AbstractConverter &converter,
- mlir::Location currentLocation,
- Fortran::semantics::SemanticsContext &semanticsContext,
- Fortran::lower::StatementContext &stmtCtx,
- const Fortran::parser::DoConstruct &outerDoConstruct,
- Fortran::lower::pft::Evaluation &eval,
- llvm::SmallVector<mlir::Value> &privateOperands,
- llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
- llvm::SmallVector<mlir::Value> &gangOperands,
- llvm::SmallVector<mlir::Value> &workerNumOperands,
- llvm::SmallVector<mlir::Value> &vectorOperands,
- llvm::SmallVector<mlir::Value> &tileOperands,
- llvm::SmallVector<mlir::Value> &cacheOperands,
- llvm::SmallVector<mlir::Value> &reductionOperands,
- llvm::SmallVector<mlir::Type> &retTy, mlir::Value yieldValue,
- uint64_t loopsToProcess) {
+/// Remap symbols that appeared in OpenACC data clauses to use the results of
+/// the corresponding data operations. This allows isolating symbol accesses
+/// inside the OpenACC region from accesses in the host and other regions while
+/// preserving Fortran information about the symbols for optimizations.
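+///
+/// For example, for `!$acc parallel copy(x)`, the host hlfir.declare of `x`
+/// is recreated at the start of the acc.parallel region with the acc.copyin
+/// result as its base input, so every use of `x` inside the region goes
+/// through the data operation result (see acc-data-operands-remapping.f90).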
+template <typename RegionOp>
+static void remapDataOperandSymbols(
+ Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &builder,
+ RegionOp &regionOp,
+ const llvm::SmallVector<
+ std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ &dataOperandSymbolPairs) {
+ if (!enableSymbolRemapping || dataOperandSymbolPairs.empty())
+ return;
+
+ // Map symbols that appeared inside data clauses to a new hlfir.declare whose
+ // input is the acc data operation result. This isolates all the symbol
+ // accesses inside the compute region from accesses in the host and other
+ // regions while preserving the Fortran information about the symbols for
+ // Fortran-specific optimizations inside the region.
+ Fortran::lower::SymMap &symbolMap = converter.getSymbolMap();
+ mlir::OpBuilder::InsertionGuard insertGuard(builder);
+ builder.setInsertionPointToStart(&regionOp.getRegion().front());
+ llvm::SmallPtrSet<const Fortran::semantics::Symbol *, 8> seenSymbols;
+ mlir::IRMapping mapper;
+ for (auto [value, symbol] : dataOperandSymbolPairs) {
+
+ // If a symbol appears in several data clauses, just map it to the first
+ // result (all data operation results for a symbol point to the same
+ // memory, so it does not matter which one is used).
+ if (seenSymbols.contains(&symbol.get()))
+ continue;
+ seenSymbols.insert(&symbol.get());
+ std::optional<fir::FortranVariableOpInterface> hostDef =
+ symbolMap.lookupVariableDefinition(symbol);
+ assert(hostDef.has_value() && llvm::isa<hlfir::DeclareOp>(*hostDef) &&
+ "expected symbol to be mapped to hlfir.declare");
+ auto hostDeclare = llvm::cast<hlfir::DeclareOp>(*hostDef);
+ // Replace base input and DummyScope inputs.
+ mlir::Value hostInput = hostDeclare.getMemref();
+ mlir::Type hostType = hostInput.getType();
+ mlir::Type computeType = value.getType();
+ if (hostType == computeType) {
+ mapper.map(hostInput, value);
+ } else if (llvm::isa<fir::BaseBoxType>(computeType)) {
+ assert(!llvm::isa<fir::BaseBoxType>(hostType) &&
+ "box type mismatch between compute region variable and "
+ "hlfir.declare input unexpected");
+ if (Fortran::semantics::IsOptional(symbol))
+ TODO(regionOp.getLoc(),
+ "remapping OPTIONAL symbol in OpenACC compute region");
+ auto rawValue =
+ fir::BoxAddrOp::create(builder, regionOp.getLoc(), hostType, value);
+ mapper.map(hostInput, rawValue);
+ } else {
+ assert(!llvm::isa<fir::BaseBoxType>(hostType) &&
+ "compute region variable should not be raw address when host "
+ "hlfir.declare input was a box");
+ assert(fir::isBoxAddress(hostType) == fir::isBoxAddress(computeType) &&
+ "compute region variable should be a pointer/allocatable if and "
+ "only if host is");
+ assert(fir::isa_ref_type(hostType) && fir::isa_ref_type(computeType) &&
+ "compute region variable and host variable should both be raw "
+ "addresses");
+ mlir::Value cast =
+ builder.createConvert(regionOp.getLoc(), hostType, value);
+ mapper.map(hostInput, cast);
+ }
+ if (mlir::Value dummyScope = hostDeclare.getDummyScope()) {
+ // Copy the dummy scope into the region so that aliasing rules about
+ // Fortran dummies are understood inside the region and the abstract dummy
+ // scope type does not have to cross the OpenACC compute region boundary.
+ if (!mapper.contains(dummyScope)) {
+ mlir::Operation *hostDummyScopeOp = dummyScope.getDefiningOp();
+ assert(hostDummyScopeOp &&
+ "dummyScope defining operation must be visible in lowering");
+ (void)builder.clone(*hostDummyScopeOp, mapper);
+ }
+ }
+
+ mlir::Operation *computeDef =
+ builder.clone(*hostDeclare.getOperation(), mapper);
+
+ // The input box already went through an hlfir.declare. It has the correct
+ // local lower bounds and attribute. Do not generate a new fir.rebox.
+ if (llvm::isa<fir::BaseBoxType>(hostDeclare.getMemref().getType()))
+ llvm::cast<hlfir::DeclareOp>(*computeDef).setSkipRebox(true);
+
+ symbolMap.addVariableDefinition(
+ symbol, llvm::cast<fir::FortranVariableOpInterface>(computeDef));
+ }
+}
+
+static mlir::acc::LoopOp buildACCLoopOp(
+ Fortran::lower::AbstractConverter &converter,
+ mlir::Location currentLocation,
+ Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::lower::StatementContext &stmtCtx,
+ const Fortran::parser::DoConstruct &outerDoConstruct,
+ Fortran::lower::pft::Evaluation &eval,
+ llvm::SmallVector<mlir::Value> &privateOperands,
+ llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+ llvm::SmallVector<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ &dataOperandSymbolPairs,
+ llvm::SmallVector<mlir::Value> &gangOperands,
+ llvm::SmallVector<mlir::Value> &workerNumOperands,
+ llvm::SmallVector<mlir::Value> &vectorOperands,
+ llvm::SmallVector<mlir::Value> &tileOperands,
+ llvm::SmallVector<mlir::Value> &cacheOperands,
+ llvm::SmallVector<mlir::Value> &reductionOperands,
+ llvm::SmallVector<mlir::Type> &retTy, mlir::Value yieldValue,
+ uint64_t loopsToProcess) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
- llvm::SmallVector<mlir::Value> ivPrivate;
+ llvm::SmallVector<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ ivPrivate;
llvm::SmallVector<mlir::Type> ivTypes;
llvm::SmallVector<mlir::Location> ivLocs;
llvm::SmallVector<bool> inclusiveBounds;
@@ -2231,10 +2361,22 @@ buildACCLoopOp(Fortran::lower::AbstractConverter &converter,
builder, builder.getFusedLoc(locs), currentLocation, eval, operands,
operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes,
ivLocs);
-
- for (auto [arg, value] : llvm::zip(
- loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate))
- fir::StoreOp::create(builder, currentLocation, arg, value);
+ // Ensure the iv symbol is mapped to the private iv SSA value for the scope
+ // of the loop even if it did not appear explicitly in a PRIVATE clause (if
+ // it appeared explicitly in such a clause, that is also fine because
+ // duplicates in the list are ignored).
+ dataOperandSymbolPairs.append(ivPrivate.begin(), ivPrivate.end());
+ // Remap symbols from data clauses to use data operation results.
+ remapDataOperandSymbols(converter, builder, loopOp, dataOperandSymbolPairs);
+
+ for (auto [arg, iv] :
+ llvm::zip(loopOp.getLoopRegions().front()->front().getArguments(),
+ ivPrivate)) {
+ // Store the block argument into the related private iv variable.
+ mlir::Value privateValue =
+ converter.getSymbolAddress(std::get<Fortran::semantics::SymbolRef>(iv));
+ fir::StoreOp::create(builder, currentLocation, arg, privateValue);
+ }
loopOp.setInclusiveUpperbound(inclusiveBounds);
@@ -2260,6 +2402,10 @@ static mlir::acc::LoopOp createLoopOp(
llvm::SmallVector<int32_t> tileOperandsSegments, gangOperandsSegments;
llvm::SmallVector<int64_t> collapseValues;
+ // Vector to track mlir::Value results and their corresponding Fortran symbols.
+ llvm::SmallVector<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ dataOperandSymbolPairs;
+
llvm::SmallVector<mlir::Attribute> gangArgTypes;
llvm::SmallVector<mlir::Attribute> seqDeviceTypes, independentDeviceTypes,
autoDeviceTypes, vectorOperandsDeviceTypes, workerNumOperandsDeviceTypes,
@@ -2380,7 +2526,8 @@ static mlir::acc::LoopOp createLoopOp(
genPrivatizationRecipes<mlir::acc::PrivateRecipeOp>(
privateClause->v, converter, semanticsContext, stmtCtx,
privateOperands, privatizationRecipes, /*async=*/{},
- /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
+ /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{},
+ &dataOperandSymbolPairs);
} else if (const auto *reductionClause =
std::get_if<Fortran::parser::AccClause::Reduction>(
&clause.u)) {
@@ -2436,9 +2583,9 @@ static mlir::acc::LoopOp createLoopOp(
Fortran::lower::getLoopCountForCollapseAndTile(accClauseList);
auto loopOp = buildACCLoopOp(
converter, currentLocation, semanticsContext, stmtCtx, outerDoConstruct,
- eval, privateOperands, privatizationRecipes, gangOperands,
- workerNumOperands, vectorOperands, tileOperands, cacheOperands,
- reductionOperands, retTy, yieldValue, loopsToProcess);
+ eval, privateOperands, privatizationRecipes, dataOperandSymbolPairs,
+ gangOperands, workerNumOperands, vectorOperands, tileOperands,
+ cacheOperands, reductionOperands, retTy, yieldValue, loopsToProcess);
if (!gangDeviceTypes.empty())
loopOp.setGangAttr(builder.getArrayAttr(gangDeviceTypes));
@@ -2568,7 +2715,9 @@ static void genDataOperandOperationsWithModifier(
llvm::ArrayRef<mlir::Value> async,
llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes,
- bool setDeclareAttr = false) {
+ bool setDeclareAttr = false,
+ llvm::SmallVectorImpl<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ *symbolPairs = nullptr) {
const Fortran::parser::AccObjectListWithModifier &listWithModifier = x->v;
const auto &accObjectList =
std::get<Fortran::parser::AccObjectList>(listWithModifier.t);
@@ -2581,7 +2730,7 @@ static void genDataOperandOperationsWithModifier(
stmtCtx, dataClauseOperands, dataClause,
/*structured=*/true, /*implicit=*/false, async,
asyncDeviceTypes, asyncOnlyDeviceTypes,
- setDeclareAttr);
+ setDeclareAttr, symbolPairs);
}
template <typename Op>
@@ -2612,6 +2761,10 @@ static Op createComputeOp(
llvm::SmallVector<mlir::Attribute> privatizationRecipes,
firstPrivatizationRecipes, reductionRecipes;
+ // Vector to track mlir::Value results and their corresponding Fortran symbols.
+ llvm::SmallVector<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ dataOperandSymbolPairs;
+
// Self clause has optional values but can be present with
// no value as well. When there is no value, the op has an attribute to
// represent the clause.
@@ -2732,7 +2885,8 @@ static Op createComputeOp(
copyClause->v, converter, semanticsContext, stmtCtx,
dataClauseOperands, mlir::acc::DataClause::acc_copy,
/*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, /*setDeclareAttr=*/false,
+ &dataOperandSymbolPairs);
copyEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
dataClauseOperands.end());
} else if (const auto *copyinClause =
@@ -2744,7 +2898,8 @@ static Op createComputeOp(
Fortran::parser::AccDataModifier::Modifier::ReadOnly,
dataClauseOperands, mlir::acc::DataClause::acc_copyin,
mlir::acc::DataClause::acc_copyin_readonly, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, /*setDeclareAttr=*/false,
+ &dataOperandSymbolPairs);
copyinEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
dataClauseOperands.end());
} else if (const auto *copyoutClause =
@@ -2757,7 +2912,8 @@ static Op createComputeOp(
Fortran::parser::AccDataModifier::Modifier::ReadOnly,
dataClauseOperands, mlir::acc::DataClause::acc_copyout,
mlir::acc::DataClause::acc_copyout_zero, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, /*setDeclareAttr=*/false,
+ &dataOperandSymbolPairs);
copyoutEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
dataClauseOperands.end());
} else if (const auto *createClause =
@@ -2769,7 +2925,8 @@ static Op createComputeOp(
Fortran::parser::AccDataModifier::Modifier::Zero, dataClauseOperands,
mlir::acc::DataClause::acc_create,
mlir::acc::DataClause::acc_create_zero, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, /*setDeclareAttr=*/false,
+ &dataOperandSymbolPairs);
createEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
dataClauseOperands.end());
} else if (const auto *noCreateClause =
@@ -2780,7 +2937,8 @@ static Op createComputeOp(
noCreateClause->v, converter, semanticsContext, stmtCtx,
dataClauseOperands, mlir::acc::DataClause::acc_no_create,
/*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, /*setDeclareAttr=*/false,
+ &dataOperandSymbolPairs);
nocreateEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
dataClauseOperands.end());
} else if (const auto *presentClause =
@@ -2791,17 +2949,21 @@ static Op createComputeOp(
presentClause->v, converter, semanticsContext, stmtCtx,
dataClauseOperands, mlir::acc::DataClause::acc_present,
/*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, /*setDeclareAttr=*/false,
+ &dataOperandSymbolPairs);
presentEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
dataClauseOperands.end());
} else if (const auto *devicePtrClause =
std::get_if<Fortran::parser::AccClause::Deviceptr>(
&clause.u)) {
+ llvm::SmallVectorImpl<
+ std::pair<mlir::Value, Fortran::semantics::SymbolRef>> *symPairs =
+ enableDevicePtrRemap ? &dataOperandSymbolPairs : nullptr;
genDataOperandOperations<mlir::acc::DevicePtrOp>(
devicePtrClause->v, converter, semanticsContext, stmtCtx,
dataClauseOperands, mlir::acc::DataClause::acc_deviceptr,
/*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, /*setDeclareAttr=*/false, symPairs);
} else if (const auto *attachClause =
std::get_if<Fortran::parser::AccClause::Attach>(&clause.u)) {
auto crtDataStart = dataClauseOperands.size();
@@ -2809,7 +2971,8 @@ static Op createComputeOp(
attachClause->v, converter, semanticsContext, stmtCtx,
dataClauseOperands, mlir::acc::DataClause::acc_attach,
/*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, /*setDeclareAttr=*/false,
+ &dataOperandSymbolPairs);
attachEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
dataClauseOperands.end());
} else if (const auto *privateClause =
@@ -2819,14 +2982,14 @@ static Op createComputeOp(
genPrivatizationRecipes<mlir::acc::PrivateRecipeOp>(
privateClause->v, converter, semanticsContext, stmtCtx,
privateOperands, privatizationRecipes, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, &dataOperandSymbolPairs);
} else if (const auto *firstprivateClause =
std::get_if<Fortran::parser::AccClause::Firstprivate>(
&clause.u)) {
genPrivatizationRecipes<mlir::acc::FirstprivateRecipeOp>(
firstprivateClause->v, converter, semanticsContext, stmtCtx,
firstprivateOperands, firstPrivatizationRecipes, async,
- asyncDeviceTypes, asyncOnlyDeviceTypes);
+ asyncDeviceTypes, asyncOnlyDeviceTypes, &dataOperandSymbolPairs);
} else if (const auto *reductionClause =
std::get_if<Fortran::parser::AccClause::Reduction>(
&clause.u)) {
@@ -2846,7 +3009,8 @@ static Op createComputeOp(
converter, semanticsContext, stmtCtx, dataClauseOperands,
mlir::acc::DataClause::acc_reduction,
/*structured=*/true, /*implicit=*/true, async, asyncDeviceTypes,
- asyncOnlyDeviceTypes);
+ asyncOnlyDeviceTypes, /*setDeclareAttr=*/false,
+ &dataOperandSymbolPairs);
copyEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
dataClauseOperands.end());
}
@@ -2945,6 +3109,11 @@ static Op createComputeOp(
computeOp.setCombinedAttr(builder.getUnitAttr());
auto insPt = builder.saveInsertionPoint();
+
+ // Remap symbols from data clauses to use data operation results.
+ remapDataOperandSymbols(converter, builder, computeOp,
+ dataOperandSymbolPairs);
+
builder.setInsertionPointAfter(computeOp);
// Create the exit operations after the region.
@@ -4921,6 +5090,8 @@ mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
reductionOperands;
llvm::SmallVector<mlir::Attribute> privatizationRecipes;
llvm::SmallVector<mlir::Type> retTy;
+ llvm::SmallVector<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+ dataOperandSymbolPairs;
mlir::Value yieldValue;
uint64_t loopsToProcess = 1; // Single loop construct
@@ -4929,9 +5100,10 @@ mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
Fortran::lower::StatementContext stmtCtx;
auto loopOp = buildACCLoopOp(
converter, converter.getCurrentLocation(), semanticsContext, stmtCtx,
- doConstruct, eval, privateOperands, privatizationRecipes, gangOperands,
- workerNumOperands, vectorOperands, tileOperands, cacheOperands,
- reductionOperands, retTy, yieldValue, loopsToProcess);
+ doConstruct, eval, privateOperands, privatizationRecipes,
+ dataOperandSymbolPairs, gangOperands, workerNumOperands, vectorOperands,
+ tileOperands, cacheOperands, reductionOperands, retTy, yieldValue,
+ loopsToProcess);
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
if (!privatizationRecipes.empty())
diff --git a/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 b/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90
new file mode 100644
index 0000000..9d36f6a
--- /dev/null
+++ b/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90
@@ -0,0 +1,601 @@
+! Test remapping of variables appearing in OpenACC data clauses
+! to the related acc dialect data operation result.
+
+! This test checks how the hlfir.declare is recreated and used inside
+! the acc compute region.
+
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+module m
+interface
+subroutine takes_scalar(x)
+ real :: x
+end subroutine
+subroutine takes_scalar_character(c, l)
+ integer :: l
+ character(l) :: c
+end subroutine
+subroutine takes_explicit_cst_shape(x)
+ real :: x(100)
+end subroutine
+subroutine takes_explicit_shape(x, n)
+ real :: x(n)
+end subroutine
+subroutine takes_assumed_shape(x)
+ real :: x(:)
+end subroutine
+subroutine takes_pointer(x)
+ real, pointer :: x(:)
+end subroutine
+
+subroutine takes_optional_scalar(x)
+ real, optional :: x
+end subroutine
+subroutine takes_optional_explicit_cst_shape(x)
+ real, optional :: x(100)
+end subroutine
+subroutine takes_optional_explicit_shape(x, n)
+ real, optional :: x(n)
+end subroutine
+subroutine takes_optional_assumed_shape(x)
+ real, optional :: x(:)
+end subroutine
+subroutine takes_optional_pointer(x)
+ real, optional, pointer :: x(:)
+end subroutine
+end interface
+contains
+
+! ----------------------------- Test forwarding ------------------------------ !
+
+ subroutine test_scalar(x)
+ real :: x
+ !$acc parallel copy(x)
+ call takes_scalar(x)
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_scalar_character(c, l)
+ integer :: l
+ character(l) :: c
+ !$acc parallel copy(x)
+ call takes_scalar_character(c, len(c))
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_cst_shape(x)
+ real :: x(100)
+ !$acc parallel copy(x)
+ call takes_explicit_cst_shape(x)
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_explicit_shape(x, n)
+ real :: x(n)
+ !$acc parallel copy(x)
+ call takes_explicit_shape(x, size(x,dim=1))
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_assumed_shape(x, n)
+ real :: x(:)
+ !$acc parallel copy(x)
+ call takes_assumed_shape(x)
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_contiguous_assumed_shape(x, n)
+ real, contiguous :: x(:)
+ !$acc parallel copy(x)
+ call takes_explicit_shape(x, size(x,dim=1))
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_pointer(x, n)
+ real, pointer :: x(:)
+ !$acc parallel copy(x)
+ call takes_pointer(x)
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_using_both_results(x, n)
+ real :: x(n)
+ !$acc parallel copy(x)
+ ! using hlfir.declare result #0
+ call takes_assumed_shape(x)
+ ! using hlfir.declare result #1
+ call takes_explicit_shape(x, size(x,dim=1))
+ !$acc end parallel
+ end subroutine
+
+! ------------------------- Test array addressing ---------------------------- !
+
+ subroutine addressing_cst_shape(x)
+ real :: x(10, 20)
+ !$acc parallel copy(x)
+ call takes_scalar(x(2,3))
+ !$acc end parallel
+ end subroutine
+
+ subroutine addressing_explicit_shape(x, n, m)
+ real :: x(n, m)
+ !$acc parallel copy(x)
+ call takes_scalar(x(2,3))
+ !$acc end parallel
+ end subroutine
+
+ subroutine addressing_assumed_shape(x, n)
+ real :: x(:, :)
+ !$acc parallel copy(x)
+ call takes_scalar(x(2,3))
+ !$acc end parallel
+ end subroutine
+
+ subroutine addressing_contiguous_assumed_shape(x, n)
+ real, contiguous :: x(:, :)
+ !$acc parallel copy(x)
+ call takes_scalar(x(2,3))
+ !$acc end parallel
+ end subroutine
+
+ subroutine addressing_pointer(x)
+ real, pointer :: x(:, :)
+ !$acc parallel copy(x)
+ call takes_scalar(x(2,3))
+ !$acc end parallel
+ end subroutine
+
+! ------------------------ Test OPTIONAL handling ---------------------------- !
+
+ subroutine test_optional_scalar(x)
+ real, optional :: x
+ !$acc parallel copy(x)
+ call takes_optional_scalar(x)
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_optional_explicit_cst_shape(x)
+ real, optional :: x(100)
+ !$acc parallel copy(x)
+ call takes_optional_explicit_cst_shape(x)
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_optional_explicit_shape(x, n)
+ real, optional :: x(n)
+ !$acc parallel copy(x)
+ call takes_optional_explicit_shape(x, n)
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_optional_assumed_shape(x)
+ real, optional :: x(:)
+ !$acc parallel copy(x)
+ call takes_optional_assumed_shape(x)
+ !$acc end parallel
+ end subroutine
+
+ subroutine test_optional_pointer(x)
+ real, optional, pointer :: x(:)
+ !$acc parallel copy(x)
+ call takes_optional_pointer(x)
+ !$acc end parallel
+ end subroutine
+
+end module
+
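+! The checks below verify that, inside each compute region, the variable's
+! hlfir.declare is recreated from the acc data operation result, and that
+! declares taking a box input get skip_rebox.
+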
+! CHECK-LABEL: func.func @_QMmPtest_scalar(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<f32>) {
+! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: fir.call @_QPtakes_scalar(%[[VAL_4]]#0) fastmath<contract> : (!fir.ref<f32>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_2]] : !fir.ref<f32>) to varPtr(%[[VAL_1]]#0 : !fir.ref<f32>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_scalar_character(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "l"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_scalar_characterEl"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QMmFtest_scalar_characterEx"}
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QMmFtest_scalar_characterEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_7:.*]] = arith.cmpi sgt, %[[VAL_5]], %[[VAL_6]] : i32
+! CHECK: %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_5]], %[[VAL_6]] : i32
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_8]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_scalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, i32, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[VAL_10:.*]] = acc.copyin varPtr(%[[VAL_3]]#0 : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.ref<f32>) {
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QMmFtest_scalar_characterEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_12:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK: fir.call @_QPtakes_scalar_character(%[[VAL_9]]#0, %[[VAL_12]]#0) fastmath<contract> : (!fir.boxchar<1>, !fir.ref<i32>) -> ()
+! CHECK: hlfir.end_associate %[[VAL_12]]#1, %[[VAL_12]]#2 : !fir.ref<i32>, i1
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_10]] : !fir.ref<f32>) to varPtr(%[[VAL_3]]#0 : !fir.ref<f32>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_cst_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>> {fir.bindc_name = "x"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index
+! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
+! CHECK: %[[VAL_4:.*]] = acc.copyin varPtr(%[[VAL_3]]#0 : !fir.ref<!fir.array<100xf32>>) -> !fir.ref<!fir.array<100xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_4]] : !fir.ref<!fir.array<100xf32>>) {
+! CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
+! CHECK: fir.call @_QPtakes_explicit_cst_shape(%[[VAL_6]]#0) fastmath<contract> : (!fir.ref<!fir.array<100xf32>>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_4]] : !fir.ref<!fir.array<100xf32>>) to varPtr(%[[VAL_3]]#0 : !fir.ref<!fir.array<100xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_explicit_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64
+! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index
+! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index
+! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index
+! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[VAL_10:.*]] = acc.copyin var(%[[VAL_9]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.box<!fir.array<?xf32>>) {
+! CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+! CHECK: %[[VAL_12:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_7]] : (index) -> i64
+! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i64) -> i32
+! CHECK: %[[VAL_16:.*]]:3 = hlfir.associate %[[VAL_15]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK: fir.call @_QPtakes_explicit_shape(%[[VAL_13]]#1, %[[VAL_16]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>, !fir.ref<i32>) -> ()
+! CHECK: hlfir.end_associate %[[VAL_16]]#1, %[[VAL_16]]#2 : !fir.ref<i32>, i1
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accVar(%[[VAL_10]] : !fir.box<!fir.array<?xf32>>) to var(%[[VAL_9]]#0 : !fir.box<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_assumed_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_3:.*]] = acc.copyin var(%[[VAL_2]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_3]] : !fir.box<!fir.array<?xf32>>) {
+! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] skip_rebox {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: fir.call @_QPtakes_assumed_shape(%[[VAL_5]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accVar(%[[VAL_3]] : !fir.box<!fir.array<?xf32>>) to var(%[[VAL_2]]#0 : !fir.box<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_contiguous_assumed_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.contiguous},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[ARG0]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
+! CHECK: %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]], %[[VAL_4]]#1 : (index, index) -> !fir.shapeshift<1>
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_6]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[VAL_8:.*]] = acc.copyin var(%[[VAL_7]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_8]] : !fir.box<!fir.array<?xf32>>) {
+! CHECK: %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+! CHECK: %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_6]]) dummy_scope %[[VAL_10]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_4]]#1 : (index) -> i64
+! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i64) -> i32
+! CHECK: %[[VAL_14:.*]]:3 = hlfir.associate %[[VAL_13]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK: fir.call @_QPtakes_explicit_shape(%[[VAL_11]]#1, %[[VAL_14]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>, !fir.ref<i32>) -> ()
+! CHECK: hlfir.end_associate %[[VAL_14]]#1, %[[VAL_14]]#2 : !fir.ref<i32>, i1
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accVar(%[[VAL_8]] : !fir.box<!fir.array<?xf32>>) to var(%[[VAL_7]]#0 : !fir.box<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_pointer(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "x"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_pointerEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK: %[[VAL_3:.*]] = acc.copyin varPtr(%[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {
+! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK: fir.call @_QPtakes_pointer(%[[VAL_5]]#0) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) to varPtr(%[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_using_both_results(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_using_both_resultsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64
+! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index
+! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index
+! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index
+! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[VAL_10:.*]] = acc.copyin var(%[[VAL_9]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.box<!fir.array<?xf32>>) {
+! CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+! CHECK: %[[VAL_12:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: fir.call @_QPtakes_assumed_shape(%[[VAL_13]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> ()
+! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_7]] : (index) -> i64
+! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i64) -> i32
+! CHECK: %[[VAL_16:.*]]:3 = hlfir.associate %[[VAL_15]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK: fir.call @_QPtakes_explicit_shape(%[[VAL_13]]#1, %[[VAL_16]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>, !fir.ref<i32>) -> ()
+! CHECK: hlfir.end_associate %[[VAL_16]]#1, %[[VAL_16]]#2 : !fir.ref<i32>, i1
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accVar(%[[VAL_10]] : !fir.box<!fir.array<?xf32>>) to var(%[[VAL_9]]#0 : !fir.box<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPaddressing_cst_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<10x20xf32>> {fir.bindc_name = "x"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index
+! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index
+! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_3]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>)
+! CHECK: %[[VAL_5:.*]] = acc.copyin varPtr(%[[VAL_4]]#0 : !fir.ref<!fir.array<10x20xf32>>) -> !fir.ref<!fir.array<10x20xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_5]] : !fir.ref<!fir.array<10x20xf32>>) {
+! CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_3]]) dummy_scope %[[VAL_6]] {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>)
+! CHECK: %[[VAL_8:.*]] = arith.constant 2 : index
+! CHECK: %[[VAL_9:.*]] = arith.constant 3 : index
+! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_8]], %[[VAL_9]]) : (!fir.ref<!fir.array<10x20xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: fir.call @_QPtakes_scalar(%[[VAL_10]]) fastmath<contract> : (!fir.ref<f32>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_5]] : !fir.ref<!fir.array<10x20xf32>>) to varPtr(%[[VAL_4]]#0 : !fir.ref<!fir.array<10x20xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPaddressing_explicit_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?x?xf32>> {fir.bindc_name = "x"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"},
+! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "m"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_explicit_shapeEm"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64
+! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
+! CHECK: %[[VAL_6:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_7:.*]] = arith.cmpi sgt, %[[VAL_5]], %[[VAL_6]] : index
+! CHECK: %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_5]], %[[VAL_6]] : index
+! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
+! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i64) -> index
+! CHECK: %[[VAL_12:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_12]] : index
+! CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_11]], %[[VAL_12]] : index
+! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_8]], %[[VAL_14]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_15]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_17:.*]] = acc.copyin var(%[[VAL_16]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_17]] : !fir.box<!fir.array<?x?xf32>>) {
+! CHECK: %[[VAL_18:.*]] = fir.box_addr %[[VAL_17]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+! CHECK: %[[VAL_19:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]](%[[VAL_15]]) dummy_scope %[[VAL_19]] {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_21:.*]] = arith.constant 2 : index
+! CHECK: %[[VAL_22:.*]] = arith.constant 3 : index
+! CHECK: %[[VAL_23:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[VAL_21]], %[[VAL_22]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: fir.call @_QPtakes_scalar(%[[VAL_23]]) fastmath<contract> : (!fir.ref<f32>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accVar(%[[VAL_17]] : !fir.box<!fir.array<?x?xf32>>) to var(%[[VAL_16]]#0 : !fir.box<!fir.array<?x?xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPaddressing_assumed_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_3:.*]] = acc.copyin var(%[[VAL_2]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_3]] : !fir.box<!fir.array<?x?xf32>>) {
+! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] skip_rebox {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_6:.*]] = arith.constant 2 : index
+! CHECK: %[[VAL_7:.*]] = arith.constant 3 : index
+! CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_6]], %[[VAL_7]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: fir.call @_QPtakes_scalar(%[[VAL_8]]) fastmath<contract> : (!fir.ref<f32>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accVar(%[[VAL_3]] : !fir.box<!fir.array<?x?xf32>>) to var(%[[VAL_2]]#0 : !fir.box<!fir.array<?x?xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPaddressing_contiguous_assumed_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x", fir.contiguous},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[ARG0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
+! CHECK: %[[VAL_6:.*]] = arith.constant 1 : index
+! CHECK: %[[VAL_7:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_6]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+! CHECK: %[[VAL_8:.*]] = arith.constant 1 : index
+! CHECK: %[[VAL_9:.*]] = fir.shape_shift %[[VAL_5]], %[[VAL_4]]#1, %[[VAL_8]], %[[VAL_7]]#1 : (index, index, index, index) -> !fir.shapeshift<2>
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_9]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_11:.*]] = acc.copyin var(%[[VAL_10]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_11]] : !fir.box<!fir.array<?x?xf32>>) {
+! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
+! CHECK: %[[VAL_13:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_9]]) dummy_scope %[[VAL_13]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_15:.*]] = arith.constant 2 : index
+! CHECK: %[[VAL_16:.*]] = arith.constant 3 : index
+! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_14]]#0 (%[[VAL_15]], %[[VAL_16]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: fir.call @_QPtakes_scalar(%[[VAL_17]]) fastmath<contract> : (!fir.ref<f32>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accVar(%[[VAL_11]] : !fir.box<!fir.array<?x?xf32>>) to var(%[[VAL_10]]#0 : !fir.box<!fir.array<?x?xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPaddressing_pointer(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) {
+! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
+! CHECK: %[[VAL_6:.*]] = arith.constant 2 : index
+! CHECK: %[[VAL_7:.*]] = arith.constant 3 : index
+! CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_5]] (%[[VAL_6]], %[[VAL_7]]) : (!fir.box<!fir.ptr<!fir.array<?x?xf32>>>, index, index) -> !fir.ref<f32>
+! CHECK: fir.call @_QPtakes_scalar(%[[VAL_8]]) fastmath<contract> : (!fir.ref<f32>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) to varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_optional_scalar(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "x", fir.optional}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<f32>) {
+! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_4]]#0 : (!fir.ref<f32>) -> i1
+! CHECK: %[[VAL_6:.*]] = fir.if %[[VAL_5]] -> (!fir.ref<f32>) {
+! CHECK: fir.result %[[VAL_4]]#0 : !fir.ref<f32>
+! CHECK: } else {
+! CHECK: %[[VAL_7:.*]] = fir.absent !fir.ref<f32>
+! CHECK: fir.result %[[VAL_7]] : !fir.ref<f32>
+! CHECK: }
+! CHECK: fir.call @_QPtakes_optional_scalar(%[[VAL_6]]) fastmath<contract> : (!fir.ref<f32>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_2]] : !fir.ref<f32>) to varPtr(%[[VAL_1]]#0 : !fir.ref<f32>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_optional_explicit_cst_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>> {fir.bindc_name = "x", fir.optional}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index
+! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
+! CHECK: %[[VAL_4:.*]] = acc.copyin varPtr(%[[VAL_3]]#0 : !fir.ref<!fir.array<100xf32>>) -> !fir.ref<!fir.array<100xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_4]] : !fir.ref<!fir.array<100xf32>>) {
+! CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
+! CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_6]]#0 : (!fir.ref<!fir.array<100xf32>>) -> i1
+! CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (!fir.ref<!fir.array<100xf32>>) {
+! CHECK: fir.result %[[VAL_6]]#0 : !fir.ref<!fir.array<100xf32>>
+! CHECK: } else {
+! CHECK: %[[VAL_9:.*]] = fir.absent !fir.ref<!fir.array<100xf32>>
+! CHECK: fir.result %[[VAL_9]] : !fir.ref<!fir.array<100xf32>>
+! CHECK: }
+! CHECK: fir.call @_QPtakes_optional_explicit_cst_shape(%[[VAL_8]]) fastmath<contract> : (!fir.ref<!fir.array<100xf32>>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_4]] : !fir.ref<!fir.array<100xf32>>) to varPtr(%[[VAL_3]]#0 : !fir.ref<!fir.array<100xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_optional_explicit_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_optional_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64
+! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index
+! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index
+! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index
+! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[VAL_10:.*]] = acc.copyin varPtr(%[[VAL_9]]#1 : !fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.ref<!fir.array<?xf32>>) {
+! CHECK: %[[VAL_11:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_8]]) dummy_scope %[[VAL_11]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[VAL_13:.*]] = fir.is_present %[[VAL_12]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1
+! CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.ref<!fir.array<?xf32>>) {
+! CHECK: fir.result %[[VAL_12]]#1 : !fir.ref<!fir.array<?xf32>>
+! CHECK: } else {
+! CHECK: %[[VAL_15:.*]] = fir.absent !fir.ref<!fir.array<?xf32>>
+! CHECK: fir.result %[[VAL_15]] : !fir.ref<!fir.array<?xf32>>
+! CHECK: }
+! CHECK: fir.call @_QPtakes_optional_explicit_shape(%[[VAL_14]], %[[VAL_1]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>, !fir.ref<i32>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_10]] : !fir.ref<!fir.array<?xf32>>) to varPtr(%[[VAL_9]]#1 : !fir.ref<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_optional_assumed_shape(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]] = acc.copyin var(%[[VAL_1]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.box<!fir.array<?xf32>>) {
+! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] skip_rebox {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_4]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1
+! CHECK: %[[VAL_6:.*]] = fir.if %[[VAL_5]] -> (!fir.box<!fir.array<?xf32>>) {
+! CHECK: fir.result %[[VAL_4]]#0 : !fir.box<!fir.array<?xf32>>
+! CHECK: } else {
+! CHECK: %[[VAL_7:.*]] = fir.absent !fir.box<!fir.array<?xf32>>
+! CHECK: fir.result %[[VAL_7]] : !fir.box<!fir.array<?xf32>>
+! CHECK: }
+! CHECK: fir.call @_QPtakes_optional_assumed_shape(%[[VAL_6]]) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accVar(%[[VAL_2]] : !fir.box<!fir.array<?xf32>>) to var(%[[VAL_1]]#0 : !fir.box<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QMmPtest_optional_pointer(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "x", fir.optional}) {
+! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {
+! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK: fir.call @_QPtakes_optional_pointer(%[[VAL_4]]#0) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> ()
+! CHECK: acc.yield
+! CHECK: }
+! CHECK: acc.copyout accPtr(%[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) to varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {dataClause = #acc<data_clause acc_copy>, name = "x"}
+! CHECK: return
+! CHECK: }
diff --git a/flang/test/Lower/OpenACC/acc-firstprivate-derived-allocatable-component.f90 b/flang/test/Lower/OpenACC/acc-firstprivate-derived-allocatable-component.f90
index bc94837b..429f207 100644
--- a/flang/test/Lower/OpenACC/acc-firstprivate-derived-allocatable-component.f90
+++ b/flang/test/Lower/OpenACC/acc-firstprivate-derived-allocatable-component.f90
@@ -41,19 +41,21 @@ module m_firstprivate_derived_alloc_comp
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QMm_firstprivate_derived_alloc_compFtestEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_6:.*]] = acc.firstprivate varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.type<_QMm_firstprivate_derived_alloc_compTpoint{x:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.type<_QMm_firstprivate_derived_alloc_compTpoint{x:!fir.box<!fir.heap<!fir.array<?xf32>>>}>> {name = "a"}
! CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derived_alloc_compTpoint -> %[[VAL_6]] : !fir.ref<!fir.type<_QMm_firstprivate_derived_alloc_compTpoint{x:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) {
-! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_7]] {uniq_name = "_QMm_firstprivate_derived_alloc_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_alloc_compTpoint{x:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_alloc_compTpoint{x:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_alloc_compTpoint{x:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_10:.*]] = acc.private varPtr(%[[VAL_3]]#0 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QMm_firstprivate_derived_alloc_compFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_10]] : !fir.ref<i32>) control(%[[VAL_12:.*]] : i32) = (%[[VAL_7]] : i32) to (%[[VAL_8]] : i32) step (%[[VAL_9]] : i32) {
-! CHECK: fir.store %[[VAL_12]] to %[[VAL_11]]#0 : !fir.ref<i32>
-! CHECK: %[[VAL_13:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_1]]#0{"x"} {fortran_attrs = #fir.var_attrs<allocatable>} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_alloc_compTpoint{x:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[VAL_16:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_15]] (%[[VAL_16]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_17]] : f32, !fir.ref<f32>
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_12:.*]] = acc.private varPtr(%[[VAL_3]]#0 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_12]] : !fir.ref<i32>) control(%[[VAL_14:.*]] : i32) = (%[[VAL_9]] : i32) to (%[[VAL_10]] : i32) step (%[[VAL_11]] : i32) {
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QMm_firstprivate_derived_alloc_compFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[VAL_14]] to %[[VAL_13]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_15:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[VAL_16:.*]] = hlfir.designate %[[VAL_8]]#0{"x"} {fortran_attrs = #fir.var_attrs<allocatable>} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_alloc_compTpoint{x:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[VAL_18:.*]] = arith.constant 10 : index
+! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_19]] : f32, !fir.ref<f32>
! CHECK: acc.yield
! CHECK: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
! CHECK: acc.yield
diff --git a/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 b/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90
index f18d722..9ef4fe6 100644
--- a/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90
+++ b/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90
@@ -41,19 +41,21 @@ module m_firstprivate_derived_ptr_comp
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_6:.*]] = acc.firstprivate varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> {name = "a"}
! CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derived_ptr_compTpoint -> %[[VAL_6]] : !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) {
-! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_7]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_10:.*]] = acc.private varPtr(%[[VAL_3]]#0 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_10]] : !fir.ref<i32>) control(%[[VAL_12:.*]] : i32) = (%[[VAL_7]] : i32) to (%[[VAL_8]] : i32) step (%[[VAL_9]] : i32) {
-! CHECK: fir.store %[[VAL_12]] to %[[VAL_11]]#0 : !fir.ref<i32>
-! CHECK: %[[VAL_13:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_1]]#0{"x"} {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_14]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[VAL_16:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_15]] (%[[VAL_16]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_17]] : f32, !fir.ref<f32>
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_12:.*]] = acc.private varPtr(%[[VAL_3]]#0 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_12]] : !fir.ref<i32>) control(%[[VAL_14:.*]] : i32) = (%[[VAL_9]] : i32) to (%[[VAL_10]] : i32) step (%[[VAL_11]] : i32) {
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[VAL_14]] to %[[VAL_13]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_15:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[VAL_16:.*]] = hlfir.designate %[[VAL_8]]#0{"x"} {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+! CHECK: %[[VAL_18:.*]] = arith.constant 10 : index
+! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_19]] : f32, !fir.ref<f32>
! CHECK: acc.yield
! CHECK: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
! CHECK: acc.yield
diff --git a/flang/test/Lower/OpenACC/acc-firstprivate-derived-user-assign.f90 b/flang/test/Lower/OpenACC/acc-firstprivate-derived-user-assign.f90
index f389c46..e90ec32 100644
--- a/flang/test/Lower/OpenACC/acc-firstprivate-derived-user-assign.f90
+++ b/flang/test/Lower/OpenACC/acc-firstprivate-derived-user-assign.f90
@@ -55,16 +55,17 @@ module m_firstprivate_derived_user_def
! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QMm_firstprivate_derived_user_defFtestEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_7:.*]] = acc.firstprivate varPtr(%[[VAL_2]]#0 : !fir.ref<!fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}>>) -> !fir.ref<!fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}>> {name = "a"}
! CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derived_user_defTpoint -> %[[VAL_7]] : !fir.ref<!fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}>>) {
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_11:.*]] = acc.private varPtr(%[[VAL_4]]#0 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QMm_firstprivate_derived_user_defFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_11]] : !fir.ref<i32>) control(%[[VAL_13:.*]] : i32) = (%[[VAL_8]] : i32) to (%[[VAL_9]] : i32) step (%[[VAL_10]] : i32) {
-! CHECK: fir.store %[[VAL_13]] to %[[VAL_12]]#0 : !fir.ref<i32>
-! CHECK: %[[VAL_14:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[VAL_15:.*]] = hlfir.designate %[[VAL_2]]#0{"x"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}>>) -> !fir.ref<f32>
-! CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_15]] : f32, !fir.ref<f32>
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QMm_firstprivate_derived_user_defFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}>>) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}>>)
+! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_12:.*]] = acc.private varPtr(%[[VAL_4]]#0 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_12]] : !fir.ref<i32>) control(%[[VAL_14:.*]] : i32) = (%[[VAL_9]] : i32) to (%[[VAL_10]] : i32) step (%[[VAL_11]] : i32) {
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QMm_firstprivate_derived_user_defFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[VAL_14]] to %[[VAL_13]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_15:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[VAL_16:.*]] = hlfir.designate %[[VAL_8]]#0{"x"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_user_defTpoint{x:f32,y:f32,z:f32}>>) -> !fir.ref<f32>
+! CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_16]] : f32, !fir.ref<f32>
! CHECK: acc.yield
! CHECK: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
! CHECK: acc.yield
diff --git a/flang/test/Lower/OpenACC/acc-firstprivate-derived.f90 b/flang/test/Lower/OpenACC/acc-firstprivate-derived.f90
index 677c3ae..e91fc9b 100644
--- a/flang/test/Lower/OpenACC/acc-firstprivate-derived.f90
+++ b/flang/test/Lower/OpenACC/acc-firstprivate-derived.f90
@@ -41,16 +41,17 @@ module m_firstprivate_derived
! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QMm_firstprivate_derivedFtestEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_7:.*]] = acc.firstprivate varPtr(%[[VAL_2]]#0 : !fir.ref<!fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}>>) -> !fir.ref<!fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}>> {name = "a"}
! CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derivedTpoint -> %[[VAL_7]] : !fir.ref<!fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}>>) {
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_11:.*]] = acc.private varPtr(%[[VAL_4]]#0 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QMm_firstprivate_derivedFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_11]] : !fir.ref<i32>) control(%[[VAL_13:.*]] : i32) = (%[[VAL_8]] : i32) to (%[[VAL_9]] : i32) step (%[[VAL_10]] : i32) {
-! CHECK: fir.store %[[VAL_13]] to %[[VAL_12]]#0 : !fir.ref<i32>
-! CHECK: %[[VAL_14:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[VAL_15:.*]] = hlfir.designate %[[VAL_2]]#0{"x"} : (!fir.ref<!fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}>>) -> !fir.ref<f32>
-! CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_15]] : f32, !fir.ref<f32>
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QMm_firstprivate_derivedFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}>>) -> (!fir.ref<!fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}>>, !fir.ref<!fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}>>)
+! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_12:.*]] = acc.private varPtr(%[[VAL_4]]#0 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[VAL_12]] : !fir.ref<i32>) control(%[[VAL_14:.*]] : i32) = (%[[VAL_9]] : i32) to (%[[VAL_10]] : i32) step (%[[VAL_11]] : i32) {
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QMm_firstprivate_derivedFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.store %[[VAL_14]] to %[[VAL_13]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_15:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: %[[VAL_16:.*]] = hlfir.designate %[[VAL_8]]#0{"x"} : (!fir.ref<!fir.type<_QMm_firstprivate_derivedTpoint{x:f32,y:f32,z:f32}>>) -> !fir.ref<f32>
+! CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_16]] : f32, !fir.ref<f32>
! CHECK: acc.yield
! CHECK: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
! CHECK: acc.yield
diff --git a/flang/test/Lower/OpenACC/acc-loop-exit.f90 b/flang/test/Lower/OpenACC/acc-loop-exit.f90
index 85394e4..af11b34 100644
--- a/flang/test/Lower/OpenACC/acc-loop-exit.f90
+++ b/flang/test/Lower/OpenACC/acc-loop-exit.f90
@@ -16,8 +16,8 @@ end
! CHECK-LABEL: func.func @_QPsub1
! CHECK: %[[A:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFsub1Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[EXIT_COND:.*]] = acc.loop
+! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: ^bb{{.*}}:
! CHECK: ^bb{{.*}}:
! CHECK: %[[LOAD_I:.*]] = fir.load %[[I]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90
index 5ca08a3..d37eb8d 100644
--- a/flang/test/Lower/OpenACC/acc-private.f90
+++ b/flang/test/Lower/OpenACC/acc-private.f90
@@ -426,7 +426,7 @@ end
! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFacc_private_useEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.parallel
! CHECK: %[[PRIV_I:.*]] = acc.private varPtr(%[[DECL_I]]#0 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[DECL_PRIV_I:.*]]:2 = hlfir.declare %[[PRIV_I]] {uniq_name = "_QFacc_private_useEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop {{.*}} private(@privatization_ref_i32 -> %[[PRIV_I]] : !fir.ref<i32>) control(%[[IV0:.*]] : i32) = (%c1{{.*}} : i32) to (%c10{{.*}} : i32) step (%c1{{.*}} : i32)
+! CHECK: %[[DECL_PRIV_I:.*]]:2 = hlfir.declare %[[PRIV_I]] {uniq_name = "_QFacc_private_useEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %[[IV0]] to %[[DECL_PRIV_I]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[DECL_PRIV_I]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
index a75a022..eaf734f 100644
--- a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
+++ b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
@@ -19,8 +19,8 @@ subroutine basic_do_loop()
! CHECK: acc.kernels {
! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_loopEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_loopEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
@@ -48,8 +48,8 @@ subroutine basic_do_concurrent()
! CHECK: acc.kernels {
! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_concurrentEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_concurrentEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
@@ -77,8 +77,8 @@ subroutine basic_do_loop_parallel()
! CHECK: acc.parallel {
! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_loop_parallelEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_loop_parallelEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
@@ -106,8 +106,8 @@ subroutine basic_do_loop_serial()
! CHECK: acc.serial {
! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_loop_serialEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_loop_serialEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
@@ -135,8 +135,8 @@ subroutine basic_do_concurrent_parallel()
! CHECK: acc.parallel {
! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_concurrent_parallelEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_concurrent_parallelEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
@@ -164,8 +164,8 @@ subroutine basic_do_concurrent_serial()
! CHECK: acc.serial {
! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_concurrent_serialEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFbasic_do_concurrent_serialEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
@@ -195,10 +195,10 @@ subroutine multi_dimension_do_concurrent()
! CHECK-DAG: %[[PRIVATE_I:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
! CHECK-DAG: %[[PRIVATE_J:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "j"}
! CHECK-DAG: %[[PRIVATE_K:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "k"}
+! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_I]] : !fir.ref<i32>, @privatization_ref_i32 -> %[[PRIVATE_J]] : !fir.ref<i32>, @privatization_ref_i32 -> %[[PRIVATE_K]] : !fir.ref<i32>) control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32)
! CHECK-DAG: %[[PRIVATE_I_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I]] {uniq_name = "_QFmulti_dimension_do_concurrentEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK-DAG: %[[PRIVATE_J_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_J]] {uniq_name = "_QFmulti_dimension_do_concurrentEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK-DAG: %[[PRIVATE_K_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_K]] {uniq_name = "_QFmulti_dimension_do_concurrentEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_I]] : !fir.ref<i32>, @privatization_ref_i32 -> %[[PRIVATE_J]] : !fir.ref<i32>, @privatization_ref_i32 -> %[[PRIVATE_K]] : !fir.ref<i32>) control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
! CHECK: fir.store %{{.*}} to %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
! CHECK: fir.store %{{.*}} to %[[PRIVATE_K_DECLARE]]#0 : !fir.ref<i32>
@@ -235,12 +235,12 @@ subroutine nested_do_loops()
! CHECK: acc.kernels {
! CHECK-DAG: %[[PRIVATE_I:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK-DAG: %[[PRIVATE_I_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I]] {uniq_name = "_QFnested_do_loopsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_I]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK-DAG: %[[PRIVATE_I_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I]] {uniq_name = "_QFnested_do_loopsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
! CHECK-DAG: %[[PRIVATE_J:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "j"}
-! CHECK-DAG: %[[PRIVATE_J_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_J]] {uniq_name = "_QFnested_do_loopsEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_J]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK-DAG: %[[PRIVATE_J_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_J]] {uniq_name = "_QFnested_do_loopsEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
@@ -272,8 +272,8 @@ subroutine variable_bounds_and_step(n, start_val, step_val)
! CHECK: acc.kernels {
! CHECK: %[[PRIVATE_IV:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFvariable_bounds_and_stepEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_IV]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_IV]] {uniq_name = "_QFvariable_bounds_and_stepEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_DECLARE]]#0 : !fir.ref<i32>
@@ -315,22 +315,22 @@ subroutine different_iv_types()
! CHECK: acc.kernels {
! CHECK: %[[PRIVATE_I8:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i64>) -> !fir.ref<i64> {implicit = true, name = "i8"}
-! CHECK: %[[PRIVATE_I8_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I8]] {uniq_name = "_QFdifferent_iv_typesEi8"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: acc.loop private(@privatization_ref_i64 -> %[[PRIVATE_I8]] : !fir.ref<i64>) control(%{{.*}} : i64) = (%{{.*}} : i64) to (%{{.*}} : i64) step (%{{.*}} : i64)
+! CHECK: %[[PRIVATE_I8_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I8]] {uniq_name = "_QFdifferent_iv_typesEi8"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_I8_DECLARE]]#0 : !fir.ref<i64>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_I8_DECLARE]]#0 : !fir.ref<i64>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_I8_DECLARE]]#0 : !fir.ref<i64>
! CHECK: acc.kernels {
! CHECK: %[[PRIVATE_I4:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i4"}
-! CHECK: %[[PRIVATE_I4_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I4]] {uniq_name = "_QFdifferent_iv_typesEi4"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_I4]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_I4_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I4]] {uniq_name = "_QFdifferent_iv_typesEi4"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_I4_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_I4_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_I4_DECLARE]]#0 : !fir.ref<i32>
! CHECK: acc.kernels {
! CHECK: %[[PRIVATE_I2:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i16>) -> !fir.ref<i16> {implicit = true, name = "i2"}
-! CHECK: %[[PRIVATE_I2_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I2]] {uniq_name = "_QFdifferent_iv_typesEi2"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
! CHECK: acc.loop private(@privatization_ref_i16 -> %[[PRIVATE_I2]] : !fir.ref<i16>) control(%{{.*}} : i16) = (%{{.*}} : i16) to (%{{.*}} : i16) step (%{{.*}} : i16)
+! CHECK: %[[PRIVATE_I2_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I2]] {uniq_name = "_QFdifferent_iv_typesEi2"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_I2_DECLARE]]#0 : !fir.ref<i16>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_I2_DECLARE]]#0 : !fir.ref<i16>
! CHECK: %{{.*}} = fir.load %[[PRIVATE_I2_DECLARE]]#0 : !fir.ref<i16>
@@ -362,12 +362,12 @@ subroutine nested_loop_with_reduction(x, y)
! CHECK: %[[REDUCTION_X:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {name = "x"}
! CHECK: %[[REDUCTION_Y:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {name = "y"}
! CHECK: %[[PRIVATE_I:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
-! CHECK: %[[PRIVATE_I_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I]] {uniq_name = "_QFnested_loop_with_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_I]] : !fir.ref<i32>) reduction(@reduction_add_ref_i32 -> %[[REDUCTION_X]] : !fir.ref<i32>, @reduction_add_ref_i32 -> %[[REDUCTION_Y]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_I_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_I]] {uniq_name = "_QFnested_loop_with_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_I_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %[[PRIVATE_J:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "j"}
-! CHECK: %[[PRIVATE_J_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_J]] {uniq_name = "_QFnested_loop_with_reductionEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: acc.loop private(@privatization_ref_i32 -> %[[PRIVATE_J]] : !fir.ref<i32>) control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: %[[PRIVATE_J_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_J]] {uniq_name = "_QFnested_loop_with_reductionEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.store %{{.*}} to %[[PRIVATE_J_DECLARE]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : i32
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 72e201a..10379d7 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -17,12 +17,14 @@
#include <__bit/countr.h>
#include <__bit/invert_if.h>
#include <__config>
+#include <__cstddef/size_t.h>
#include <__functional/identity.h>
#include <__fwd/bit_reference.h>
#include <__iterator/segmented_iterator.h>
#include <__string/constexpr_c_functions.h>
#include <__type_traits/enable_if.h>
#include <__type_traits/invoke.h>
+#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_equality_comparable.h>
#include <__type_traits/is_integral.h>
#include <__type_traits/is_signed.h>
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index 2b246f8..74923ddb 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -44,6 +44,7 @@
#include <__utility/forward.h>
#include <__utility/move.h>
#include <__utility/pair.h>
+#include <__utility/scope_guard.h>
#include <__utility/swap.h>
#include <__utility/try_key_extraction.h>
#include <limits>
@@ -1317,23 +1318,14 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign(__hash_table& __u,
max_load_factor() = __u.max_load_factor();
if (bucket_count() != 0) {
__next_pointer __cache = __detach();
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- const_iterator __i = __u.begin();
- while (__cache != nullptr && __u.size() != 0) {
- __assign_value(__cache->__upcast()->__get_value(), std::move(__u.remove(__i++)->__get_value()));
- __next_pointer __next = __cache->__next_;
- __node_insert_multi(__cache->__upcast());
- __cache = __next;
- }
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __deallocate_node(__cache);
- throw;
+ auto __guard = std::__make_scope_guard([&] { __deallocate_node(__cache); });
+ const_iterator __i = __u.begin();
+ while (__cache != nullptr && __u.size() != 0) {
+ __assign_value(__cache->__upcast()->__get_value(), std::move(__u.remove(__i++)->__get_value()));
+ __next_pointer __next = __cache->__next_;
+ __node_insert_multi(__cache->__upcast());
+ __cache = __next;
}
-#endif // _LIBCPP_HAS_EXCEPTIONS
- __deallocate_node(__cache);
}
const_iterator __i = __u.begin();
while (__u.size() != 0)
@@ -1361,22 +1353,13 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_unique(_InputIterator __
if (bucket_count() != 0) {
__next_pointer __cache = __detach();
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- for (; __cache != nullptr && __first != __last; ++__first) {
- __assign_value(__cache->__upcast()->__get_value(), *__first);
- __next_pointer __next = __cache->__next_;
- __node_insert_unique(__cache->__upcast());
- __cache = __next;
- }
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __deallocate_node(__cache);
- throw;
+ auto __guard = std::__make_scope_guard([&] { __deallocate_node(__cache); });
+ for (; __cache != nullptr && __first != __last; ++__first) {
+ __assign_value(__cache->__upcast()->__get_value(), *__first);
+ __next_pointer __next = __cache->__next_;
+ __node_insert_unique(__cache->__upcast());
+ __cache = __next;
}
-#endif // _LIBCPP_HAS_EXCEPTIONS
- __deallocate_node(__cache);
}
for (; __first != __last; ++__first)
__emplace_unique(*__first);
@@ -1391,22 +1374,13 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_multi(_InputIterator __f
"__assign_multi may only be called with the containers value type or the nodes value type");
if (bucket_count() != 0) {
__next_pointer __cache = __detach();
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- for (; __cache != nullptr && __first != __last; ++__first) {
- __assign_value(__cache->__upcast()->__get_value(), *__first);
- __next_pointer __next = __cache->__next_;
- __node_insert_multi(__cache->__upcast());
- __cache = __next;
- }
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __deallocate_node(__cache);
- throw;
+ auto __guard = std::__make_scope_guard([&] { __deallocate_node(__cache); });
+ for (; __cache != nullptr && __first != __last; ++__first) {
+ __assign_value(__cache->__upcast()->__get_value(), *__first);
+ __next_pointer __next = __cache->__next_;
+ __node_insert_multi(__cache->__upcast());
+ __cache = __next;
}
-#endif // _LIBCPP_HAS_EXCEPTIONS
- __deallocate_node(__cache);
}
for (; __first != __last; ++__first)
__emplace_multi(*__first);
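
The three hunks above make the same change: the exception-only cleanup, previously written as an "#if _LIBCPP_HAS_EXCEPTIONS" try/catch that deallocated the node cache and rethrew, plus a duplicate deallocation call on the normal path, collapses into one scope guard whose callback runs unconditionally when the enclosing block exits, on both the success path and during stack unwinding. A minimal sketch of that idiom follows (C++17 for class template argument deduction; the names here are illustrative, the actual helper is the internal std::__make_scope_guard from <__utility/scope_guard.h>):

#include <cstdio>
#include <utility>

// Illustrative scope guard: the callback runs unconditionally in the
// destructor, so cleanup happens whether the guarded code returns
// normally or throws, with no explicit try/catch + rethrow needed.
template <class Fn>
class scope_guard {
  Fn fn_;

public:
  explicit scope_guard(Fn fn) : fn_(std::move(fn)) {}
  ~scope_guard() { fn_(); }
  scope_guard(const scope_guard&) = delete;
  scope_guard& operator=(const scope_guard&) = delete;
};

int main() {
  scope_guard guard([] { std::puts("cache deallocated"); });
  // ... splice nodes / assign values; a throw here still runs the guard ...
  return 0; // the guard also runs on this normal exit
}
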
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index 0cbd995..e90db58 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -54,6 +54,7 @@
#include <__type_traits/remove_extent.h>
#include <__type_traits/remove_reference.h>
#include <__utility/declval.h>
+#include <__utility/exception_guard.h>
#include <__utility/forward.h>
#include <__utility/move.h>
#include <__utility/swap.h>
@@ -352,23 +353,16 @@ public:
template <class _Yp, class _Dp, __enable_if_t<__shared_ptr_deleter_ctor_reqs<_Dp, _Yp, _Tp>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI shared_ptr(_Yp* __p, _Dp __d) : __ptr_(__p) {
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- typedef typename __shared_ptr_default_allocator<_Yp>::type _AllocT;
- typedef __shared_ptr_pointer<_Yp*, _Dp, _AllocT> _CntrlBlk;
+ auto __guard = std::__make_exception_guard([&] { __d(__p); });
+ typedef typename __shared_ptr_default_allocator<_Yp>::type _AllocT;
+ typedef __shared_ptr_pointer<_Yp*, _Dp, _AllocT> _CntrlBlk;
#ifndef _LIBCPP_CXX03_LANG
- __cntrl_ = new _CntrlBlk(__p, std::move(__d), _AllocT());
+ __cntrl_ = new _CntrlBlk(__p, std::move(__d), _AllocT());
#else
__cntrl_ = new _CntrlBlk(__p, __d, _AllocT());
#endif // not _LIBCPP_CXX03_LANG
- __enable_weak_this(__p, __p);
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __d(__p);
- throw;
- }
-#endif // _LIBCPP_HAS_EXCEPTIONS
+ __enable_weak_this(__p, __p);
+ __guard.__complete();
}
template <class _Yp,
@@ -376,28 +370,21 @@ public:
class _Alloc,
__enable_if_t<__shared_ptr_deleter_ctor_reqs<_Dp, _Yp, _Tp>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI shared_ptr(_Yp* __p, _Dp __d, _Alloc __a) : __ptr_(__p) {
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- typedef __shared_ptr_pointer<_Yp*, _Dp, _Alloc> _CntrlBlk;
- typedef typename __allocator_traits_rebind<_Alloc, _CntrlBlk>::type _A2;
- typedef __allocator_destructor<_A2> _D2;
- _A2 __a2(__a);
- unique_ptr<_CntrlBlk, _D2> __hold2(__a2.allocate(1), _D2(__a2, 1));
- ::new ((void*)std::addressof(*__hold2.get()))
+ auto __guard = std::__make_exception_guard([&] { __d(__p); });
+ typedef __shared_ptr_pointer<_Yp*, _Dp, _Alloc> _CntrlBlk;
+ typedef typename __allocator_traits_rebind<_Alloc, _CntrlBlk>::type _A2;
+ typedef __allocator_destructor<_A2> _D2;
+ _A2 __a2(__a);
+ unique_ptr<_CntrlBlk, _D2> __hold2(__a2.allocate(1), _D2(__a2, 1));
+ ::new ((void*)std::addressof(*__hold2.get()))
#ifndef _LIBCPP_CXX03_LANG
- _CntrlBlk(__p, std::move(__d), __a);
+ _CntrlBlk(__p, std::move(__d), __a);
#else
_CntrlBlk(__p, __d, __a);
#endif // not _LIBCPP_CXX03_LANG
- __cntrl_ = std::addressof(*__hold2.release());
- __enable_weak_this(__p, __p);
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __d(__p);
- throw;
- }
-#endif // _LIBCPP_HAS_EXCEPTIONS
+ __cntrl_ = std::addressof(*__hold2.release());
+ __enable_weak_this(__p, __p);
+ __guard.__complete();
}
template <class _Dp>
@@ -406,22 +393,15 @@ public:
_Dp __d,
__enable_if_t<__shared_ptr_nullptr_deleter_ctor_reqs<_Dp>::value, __nullptr_sfinae_tag> = __nullptr_sfinae_tag())
: __ptr_(nullptr) {
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- typedef typename __shared_ptr_default_allocator<_Tp>::type _AllocT;
- typedef __shared_ptr_pointer<nullptr_t, _Dp, _AllocT> _CntrlBlk;
+ auto __guard = std::__make_exception_guard([&] { __d(__p); });
+ typedef typename __shared_ptr_default_allocator<_Tp>::type _AllocT;
+ typedef __shared_ptr_pointer<nullptr_t, _Dp, _AllocT> _CntrlBlk;
#ifndef _LIBCPP_CXX03_LANG
- __cntrl_ = new _CntrlBlk(__p, std::move(__d), _AllocT());
+ __cntrl_ = new _CntrlBlk(__p, std::move(__d), _AllocT());
#else
__cntrl_ = new _CntrlBlk(__p, __d, _AllocT());
#endif // not _LIBCPP_CXX03_LANG
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __d(__p);
- throw;
- }
-#endif // _LIBCPP_HAS_EXCEPTIONS
+ __guard.__complete();
}
template <class _Dp, class _Alloc>
@@ -431,27 +411,20 @@ public:
_Alloc __a,
__enable_if_t<__shared_ptr_nullptr_deleter_ctor_reqs<_Dp>::value, __nullptr_sfinae_tag> = __nullptr_sfinae_tag())
: __ptr_(nullptr) {
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- typedef __shared_ptr_pointer<nullptr_t, _Dp, _Alloc> _CntrlBlk;
- typedef typename __allocator_traits_rebind<_Alloc, _CntrlBlk>::type _A2;
- typedef __allocator_destructor<_A2> _D2;
- _A2 __a2(__a);
- unique_ptr<_CntrlBlk, _D2> __hold2(__a2.allocate(1), _D2(__a2, 1));
- ::new ((void*)std::addressof(*__hold2.get()))
+ auto __guard = std::__make_exception_guard([&] { __d(__p); });
+ typedef __shared_ptr_pointer<nullptr_t, _Dp, _Alloc> _CntrlBlk;
+ typedef typename __allocator_traits_rebind<_Alloc, _CntrlBlk>::type _A2;
+ typedef __allocator_destructor<_A2> _D2;
+ _A2 __a2(__a);
+ unique_ptr<_CntrlBlk, _D2> __hold2(__a2.allocate(1), _D2(__a2, 1));
+ ::new ((void*)std::addressof(*__hold2.get()))
#ifndef _LIBCPP_CXX03_LANG
- _CntrlBlk(__p, std::move(__d), __a);
+ _CntrlBlk(__p, std::move(__d), __a);
#else
_CntrlBlk(__p, __d, __a);
#endif // not _LIBCPP_CXX03_LANG
- __cntrl_ = std::addressof(*__hold2.release());
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __d(__p);
- throw;
- }
-#endif // _LIBCPP_HAS_EXCEPTIONS
+ __cntrl_ = std::addressof(*__hold2.release());
+ __guard.__complete();
}
template <class _Yp>
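
Unlike the unconditional guard used in __hash_table above, these shared_ptr constructors switch to a commit/rollback guard: the rollback (invoking the deleter __d on __p) fires only when the constructor unwinds before reaching __guard.__complete(), which preserves the standard's requirement that the deleter be called if control-block allocation throws. A hedged sketch of the pattern, with hypothetical names standing in for the internal std::__make_exception_guard from <__utility/exception_guard.h>:

#include <utility>

// Illustrative commit/rollback guard: the rollback runs from the
// destructor only if complete() was never called, i.e. only when an
// exception escapes before the success point.
template <class Rollback>
class exception_guard {
  Rollback rollback_;
  bool completed_ = false;

public:
  explicit exception_guard(Rollback r) : rollback_(std::move(r)) {}
  void complete() noexcept { completed_ = true; }
  ~exception_guard() {
    if (!completed_)
      rollback_();
  }
  exception_guard(const exception_guard&) = delete;
  exception_guard& operator=(const exception_guard&) = delete;
};

// Usage mirroring shared_ptr(Y* p, D d): if building the control block
// throws, the deleter is still invoked on p. (adopt is a made-up name.)
template <class T, class D>
void adopt(T* p, D d) {
  exception_guard guard([&] { d(p); }); // requires C++17 (CTAD)
  // ... allocate/construct the control block; may throw ...
  guard.complete(); // success: ownership transferred, disarm the rollback
}
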
diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h
index e802366..34d065d 100644
--- a/libcxx/include/__memory/uninitialized_algorithms.h
+++ b/libcxx/include/__memory/uninitialized_algorithms.h
@@ -61,17 +61,10 @@ template <class _ValueType, class _InputIterator, class _Sentinel1, class _Forwa
inline _LIBCPP_HIDE_FROM_ABI pair<_InputIterator, _ForwardIterator> __uninitialized_copy(
_InputIterator __ifirst, _Sentinel1 __ilast, _ForwardIterator __ofirst, _EndPredicate __stop_copying) {
_ForwardIterator __idx = __ofirst;
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif
- for (; __ifirst != __ilast && !__stop_copying(__idx); ++__ifirst, (void)++__idx)
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(*__ifirst);
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__ofirst, __idx);
- throw;
- }
-#endif
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__ofirst, __idx); });
+ for (; __ifirst != __ilast && !__stop_copying(__idx); ++__ifirst, (void)++__idx)
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(*__ifirst);
+ __guard.__complete();
return pair<_InputIterator, _ForwardIterator>(std::move(__ifirst), std::move(__idx));
}
@@ -91,17 +84,10 @@ template <class _ValueType, class _InputIterator, class _Size, class _ForwardIte
inline _LIBCPP_HIDE_FROM_ABI pair<_InputIterator, _ForwardIterator>
__uninitialized_copy_n(_InputIterator __ifirst, _Size __n, _ForwardIterator __ofirst, _EndPredicate __stop_copying) {
_ForwardIterator __idx = __ofirst;
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif
- for (; __n > 0 && !__stop_copying(__idx); ++__ifirst, (void)++__idx, (void)--__n)
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(*__ifirst);
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__ofirst, __idx);
- throw;
- }
-#endif
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__ofirst, __idx); });
+ for (; __n > 0 && !__stop_copying(__idx); ++__ifirst, (void)++__idx, (void)--__n)
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(*__ifirst);
+ __guard.__complete();
return pair<_InputIterator, _ForwardIterator>(std::move(__ifirst), std::move(__idx));
}
@@ -121,17 +107,10 @@ template <class _ValueType, class _ForwardIterator, class _Sentinel, class _Tp>
inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator
__uninitialized_fill(_ForwardIterator __first, _Sentinel __last, const _Tp& __x) {
_ForwardIterator __idx = __first;
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif
- for (; __idx != __last; ++__idx)
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(__x);
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__first, __idx);
- throw;
- }
-#endif
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__first, __idx); });
+ for (; __idx != __last; ++__idx)
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(__x);
+ __guard.__complete();
return __idx;
}
@@ -149,17 +128,10 @@ template <class _ValueType, class _ForwardIterator, class _Size, class _Tp>
inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator
__uninitialized_fill_n(_ForwardIterator __first, _Size __n, const _Tp& __x) {
_ForwardIterator __idx = __first;
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif
- for (; __n > 0; ++__idx, (void)--__n)
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(__x);
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__first, __idx);
- throw;
- }
-#endif
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__first, __idx); });
+ for (; __n > 0; ++__idx, (void)--__n)
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(__x);
+ __guard.__complete();
return __idx;
}
@@ -178,18 +150,11 @@ uninitialized_fill_n(_ForwardIterator __first, _Size __n, const _Tp& __x) {
template <class _ValueType, class _ForwardIterator, class _Sentinel>
inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator
__uninitialized_default_construct(_ForwardIterator __first, _Sentinel __last) {
- auto __idx = __first;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif
- for (; __idx != __last; ++__idx)
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType;
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__first, __idx);
- throw;
- }
-# endif
+ auto __idx = __first;
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__first, __idx); });
+ for (; __idx != __last; ++__idx)
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType;
+ __guard.__complete();
return __idx;
}
@@ -205,17 +170,10 @@ inline _LIBCPP_HIDE_FROM_ABI void uninitialized_default_construct(_ForwardIterat
template <class _ValueType, class _ForwardIterator, class _Size>
inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator __uninitialized_default_construct_n(_ForwardIterator __first, _Size __n) {
auto __idx = __first;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif
- for (; __n > 0; ++__idx, (void)--__n)
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType;
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__first, __idx);
- throw;
- }
-# endif
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__first, __idx); });
+ for (; __n > 0; ++__idx, (void)--__n)
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType;
+ __guard.__complete();
return __idx;
}
@@ -231,18 +189,11 @@ inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator uninitialized_default_construct_n(
template <class _ValueType, class _ForwardIterator, class _Sentinel>
inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator
__uninitialized_value_construct(_ForwardIterator __first, _Sentinel __last) {
- auto __idx = __first;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif
- for (; __idx != __last; ++__idx)
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType();
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__first, __idx);
- throw;
- }
-# endif
+ auto __idx = __first;
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__first, __idx); });
+ for (; __idx != __last; ++__idx)
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType();
+ __guard.__complete();
return __idx;
}
@@ -258,17 +209,10 @@ inline _LIBCPP_HIDE_FROM_ABI void uninitialized_value_construct(_ForwardIterator
template <class _ValueType, class _ForwardIterator, class _Size>
inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator __uninitialized_value_construct_n(_ForwardIterator __first, _Size __n) {
auto __idx = __first;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif
- for (; __n > 0; ++__idx, (void)--__n)
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType();
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__first, __idx);
- throw;
- }
-# endif
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__first, __idx); });
+ for (; __n > 0; ++__idx, (void)--__n)
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType();
+ __guard.__complete();
return __idx;
}
@@ -293,19 +237,12 @@ inline _LIBCPP_HIDE_FROM_ABI pair<_InputIterator, _ForwardIterator> __uninitiali
_ForwardIterator __ofirst,
_EndPredicate __stop_moving,
_IterMove __iter_move) {
- auto __idx = __ofirst;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif
- for (; __ifirst != __ilast && !__stop_moving(__idx); ++__idx, (void)++__ifirst) {
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(__iter_move(__ifirst));
- }
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__ofirst, __idx);
- throw;
+ auto __idx = __ofirst;
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__ofirst, __idx); });
+ for (; __ifirst != __ilast && !__stop_moving(__idx); ++__idx, (void)++__ifirst) {
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(__iter_move(__ifirst));
}
-# endif
+ __guard.__complete();
return {std::move(__ifirst), std::move(__idx)};
}
@@ -331,18 +268,11 @@ template <class _ValueType,
class _IterMove>
inline _LIBCPP_HIDE_FROM_ABI pair<_InputIterator, _ForwardIterator> __uninitialized_move_n(
_InputIterator __ifirst, _Size __n, _ForwardIterator __ofirst, _EndPredicate __stop_moving, _IterMove __iter_move) {
- auto __idx = __ofirst;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif
- for (; __n > 0 && !__stop_moving(__idx); ++__idx, (void)++__ifirst, --__n)
- ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(__iter_move(__ifirst));
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- std::__destroy(__ofirst, __idx);
- throw;
- }
-# endif
+ auto __idx = __ofirst;
+ auto __guard = std::__make_exception_guard([&] { std::__destroy(__ofirst, __idx); });
+ for (; __n > 0 && !__stop_moving(__idx); ++__idx, (void)++__ifirst, --__n)
+ ::new (static_cast<void*>(std::addressof(*__idx))) _ValueType(__iter_move(__ifirst));
+ __guard.__complete();
return {std::move(__ifirst), std::move(__idx)};
}
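Every algorithm in this header follows the same shape, and the rewritten form is behaviorally identical to the try/catch it replaces. The subtle point is that the rollback lambda captures __idx by reference, so when a constructor throws it destroys exactly the already-constructed prefix [__ofirst, __idx) and nothing more. The equivalent hand-written form of one of these loops, as a self-contained sketch:

#include <new>

template <class T>
T* uninitialized_fill_sketch(T* first, T* last, const T& value) {
  T* idx = first;
  try {
    for (; idx != last; ++idx)
      ::new (static_cast<void*>(idx)) T(value); // each construction may throw
  } catch (...) {
    for (T* p = first; p != idx; ++p)
      p->~T(); // destroy only the constructed prefix [first, idx)
    throw;
  }
  return idx;
}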
diff --git a/libcxx/include/__utility/scope_guard.h b/libcxx/include/__utility/scope_guard.h
index 3972102..db4f0e4 100644
--- a/libcxx/include/__utility/scope_guard.h
+++ b/libcxx/include/__utility/scope_guard.h
@@ -43,6 +43,8 @@ public:
#endif
};
+_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__scope_guard);
+
template <class _Func>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __scope_guard<_Func> __make_scope_guard(_Func __func) {
return __scope_guard<_Func>(std::move(__func));
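The added _LIBCPP_CTAD_SUPPORTED_FOR_TYPE marker records that class template argument deduction is intentionally supported for __scope_guard, which is what lets the filesystem sources further down write __scope_guard guard([&] { ... }); without naming the lambda's unutterable type. The mechanism in ordinary terms, as a small compilable sketch:

#include <cstdio>
#include <utility>

template <class F>
struct scope_guard {
  explicit scope_guard(F f) : f_(std::move(f)) {}
  ~scope_guard() { f_(); } // runs unconditionally at scope exit
  F f_;
};

int main() {
  // CTAD deduces scope_guard<lambda-type> from the constructor argument.
  scope_guard guard([] { std::puts("cleanup"); });
  std::puts("work");
  return 0;
} // prints "work", then "cleanup"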
diff --git a/libcxx/include/__vector/vector_bool.h b/libcxx/include/__vector/vector_bool.h
index 7595427..6cb8f2a 100644
--- a/libcxx/include/__vector/vector_bool.h
+++ b/libcxx/include/__vector/vector_bool.h
@@ -963,21 +963,14 @@ vector<bool, _Allocator>::__insert_with_sentinel(const_iterator __position, _Inp
}
vector __v(get_allocator());
if (__first != __last) {
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- __v.__assign_with_sentinel(std::move(__first), std::move(__last));
- difference_type __old_size = static_cast<difference_type>(__old_end - begin());
- difference_type __old_p = __p - begin();
- reserve(__recommend(size() + __v.size()));
- __p = begin() + __old_p;
- __old_end = begin() + __old_size;
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- erase(__old_end, end());
- throw;
- }
-#endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { erase(__old_end, end()); });
+ __v.__assign_with_sentinel(std::move(__first), std::move(__last));
+ difference_type __old_size = static_cast<difference_type>(__old_end - begin());
+ difference_type __old_p = __p - begin();
+ reserve(__recommend(size() + __v.size()));
+ __p = begin() + __old_p;
+ __old_end = begin() + __old_size;
+ __guard.__complete();
}
__p = std::rotate(__p, __old_end, end());
insert(__p, __v.begin(), __v.end());
diff --git a/libcxx/include/deque b/libcxx/include/deque
index c8e1025..3e7ee8d85 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -235,6 +235,7 @@ template <class T, class Allocator, class Predicate>
# include <__type_traits/is_swappable.h>
# include <__type_traits/is_trivially_relocatable.h>
# include <__type_traits/type_identity.h>
+# include <__utility/exception_guard.h>
# include <__utility/forward.h>
# include <__utility/move.h>
# include <__utility/pair.h>
@@ -2135,22 +2136,17 @@ void deque<_Tp, _Allocator>::__add_front_capacity(size_type __n) {
size_type __ds = (__nb + __back_capacity) * __block_size - __map_.empty();
__split_buffer<pointer, __pointer_allocator&> __buf(
std::max<size_type>(2 * __map_.capacity(), __nb + __map_.size()), 0, __map_.__get_allocator());
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (; __nb > 0; --__nb) {
- __buf.emplace_back(__alloc_traits::allocate(__a, __block_size));
- // ASan: this is empty container, we have to poison whole block
- __annotate_poison_block(std::__to_address(__buf.back()), std::__to_address(__buf.back() + __block_size));
- }
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
+ auto __guard = std::__make_exception_guard([&] {
__annotate_delete();
for (__map_pointer __i = __buf.begin(); __i != __buf.end(); ++__i)
__alloc_traits::deallocate(__a, *__i, __block_size);
- throw;
+ });
+ for (; __nb > 0; --__nb) {
+ __buf.emplace_back(__alloc_traits::allocate(__a, __block_size));
+ // ASan: this is an empty container, we have to poison the whole block
+ __annotate_poison_block(std::__to_address(__buf.back()), std::__to_address(__buf.back() + __block_size));
}
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ __guard.__complete();
for (; __back_capacity > 0; --__back_capacity) {
__buf.emplace_back(__map_.back());
__map_.pop_back();
@@ -2254,22 +2250,17 @@ void deque<_Tp, _Allocator>::__add_back_capacity(size_type __n) {
std::max<size_type>(2 * __map_.capacity(), __nb + __map_.size()),
__map_.size() - __front_capacity,
__map_.__get_allocator());
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (; __nb > 0; --__nb) {
- __buf.emplace_back(__alloc_traits::allocate(__a, __block_size));
- // ASan: this is an empty container, we have to poison the whole block
- __annotate_poison_block(std::__to_address(__buf.back()), std::__to_address(__buf.back() + __block_size));
- }
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
+ auto __guard = std::__make_exception_guard([&] {
__annotate_delete();
for (__map_pointer __i = __buf.begin(); __i != __buf.end(); ++__i)
__alloc_traits::deallocate(__a, *__i, __block_size);
- throw;
+ });
+ for (; __nb > 0; --__nb) {
+ __buf.emplace_back(__alloc_traits::allocate(__a, __block_size));
+ // ASan: this is an empty container, we have to poison the whole block
+ __annotate_poison_block(std::__to_address(__buf.back()), std::__to_address(__buf.back() + __block_size));
}
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ __guard.__complete();
for (; __front_capacity > 0; --__front_capacity) {
__buf.emplace_back(__map_.front());
__map_.pop_front();
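In deque the rollback is resource release rather than element destruction: each iteration allocates one fixed-size map block, and on failure every block acquired so far must be handed back (after re-annotating for ASan). The allocate-all-or-free-all pattern in isolation, with illustrative names:

#include <cstddef>
#include <vector>

// Acquires `count` raw blocks of `block_size` bytes; if any allocation
// throws, frees the blocks already acquired and rethrows.
inline std::vector<void*> allocate_blocks(std::size_t count, std::size_t block_size) {
  std::vector<void*> blocks;
  blocks.reserve(count);
  try {
    for (std::size_t i = 0; i < count; ++i)
      blocks.push_back(::operator new(block_size)); // may throw bad_alloc
  } catch (...) {
    for (void* b : blocks)
      ::operator delete(b);
    throw;
  }
  return blocks;
}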
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index 05f22e3..df7da20 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -235,6 +235,7 @@ template <class T, class Allocator, class Predicate>
# include <__type_traits/is_swappable.h>
# include <__type_traits/remove_cv.h>
# include <__type_traits/type_identity.h>
+# include <__utility/exception_guard.h>
# include <__utility/forward.h>
# include <__utility/move.h>
# include <__utility/swap.h>
@@ -1180,22 +1181,17 @@ forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Ar
if (__n > 0) {
__node_pointer __first = this->__create_node(/* next = */ nullptr, std::forward<_Args>(__args)...);
__node_pointer __last = __first;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (--__n; __n != 0; --__n, __last = __last->__next_) {
- __last->__next_ = this->__create_node(/* next = */ nullptr, std::forward<_Args>(__args)...);
- }
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
+ auto __guard = std::__make_exception_guard([&] {
while (__first != nullptr) {
__node_pointer __next = __first->__next_;
this->__delete_node(__first);
__first = __next;
}
- throw;
+ });
+ for (--__n; __n != 0; --__n, __last = __last->__next_) {
+ __last->__next_ = this->__create_node(/* next = */ nullptr, std::forward<_Args>(__args)...);
}
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ __guard.__complete();
__last->__next_ = __r->__next_;
__r->__next_ = __first;
__r = std::__static_fancy_pointer_cast<__begin_node_pointer>(__last);
@@ -1220,22 +1216,17 @@ forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _Inp
__node_pointer __first = this->__create_node(/* next = */ nullptr, *__f);
__node_pointer __last = __first;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (++__f; __f != __l; ++__f, ((void)(__last = __last->__next_))) {
- __last->__next_ = this->__create_node(/* next = */ nullptr, *__f);
- }
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
+ auto __guard = std::__make_exception_guard([&] {
while (__first != nullptr) {
__node_pointer __next = __first->__next_;
this->__delete_node(__first);
__first = __next;
}
- throw;
+ });
+ for (++__f; __f != __l; ++__f, ((void)(__last = __last->__next_))) {
+ __last->__next_ = this->__create_node(/* next = */ nullptr, *__f);
}
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ __guard.__complete();
__last->__next_ = __r->__next_;
__r->__next_ = __first;
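For forward_list the partially built state is a detached chain of nodes, so the guard's rollback walks the chain from __first and deletes each node; only once the whole chain exists is it spliced in after __r. The build-then-splice shape reduced to a sketch (hypothetical Node type):

struct Node {
  Node* next = nullptr;
  int value = 0;
};

// Builds a detached chain of n > 0 copies of v; if an allocation throws,
// deletes the partial chain before rethrowing, leaving no leak.
inline Node* build_chain(int n, int v) {
  Node* first = new Node{nullptr, v};
  Node* last = first;
  try {
    for (--n; n != 0; --n, last = last->next)
      last->next = new Node{nullptr, v};
  } catch (...) {
    while (first != nullptr) {
      Node* next = first->next;
      delete first;
      first = next;
    }
    throw;
  }
  return first;
}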
diff --git a/libcxx/include/future b/libcxx/include/future
index 3df9dc9..4b7c098 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -403,6 +403,7 @@ template <class R, class Alloc> struct uses_allocator<packaged_task<R>, Alloc>;
# include <__type_traits/strip_signature.h>
# include <__type_traits/underlying_type.h>
# include <__utility/auto_cast.h>
+# include <__utility/exception_guard.h>
# include <__utility/forward.h>
# include <__utility/move.h>
# include <__utility/swap.h>
@@ -1815,16 +1816,9 @@ template <class _Rp, class _Fp>
_LIBCPP_HIDE_FROM_ABI future<_Rp> __make_async_assoc_state(_Fp&& __f) {
unique_ptr<__async_assoc_state<_Rp, _Fp>, __release_shared_count> __h(
new __async_assoc_state<_Rp, _Fp>(std::forward<_Fp>(__f)));
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif
- std::thread(&__async_assoc_state<_Rp, _Fp>::__execute, __h.get()).detach();
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __h->__make_ready();
- throw;
- }
-# endif
+ auto __guard = std::__make_exception_guard([&] { __h->__make_ready(); });
+ std::thread(&__async_assoc_state<_Rp, _Fp>::__execute, __h.get()).detach();
+ __guard.__complete();
return future<_Rp>(__h.get());
}
diff --git a/libcxx/include/list b/libcxx/include/list
index 996dbfd..c5c2a85 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -237,6 +237,7 @@ template <class T, class Allocator, class Predicate>
# include <__type_traits/is_pointer.h>
# include <__type_traits/is_same.h>
# include <__type_traits/type_identity.h>
+# include <__utility/exception_guard.h>
# include <__utility/forward.h>
# include <__utility/move.h>
# include <__utility/swap.h>
@@ -1233,14 +1234,7 @@ list<_Tp, _Alloc>::insert(const_iterator __p, size_type __n, const value_type& _
++__ds;
__r = iterator(__node->__as_link());
iterator __e = __r;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (--__n; __n != 0; --__n, (void)++__e, ++__ds) {
- __e.__ptr_->__next_ = this->__create_node(/* prev = */ __e.__ptr_, /* next = */ nullptr, __x)->__as_link();
- }
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
+ auto __guard = std::__make_exception_guard([&] {
while (true) {
__base_pointer __prev = __e.__ptr_->__prev_;
__node_pointer __current = __e.__ptr_->__as_node();
@@ -1249,9 +1243,11 @@ list<_Tp, _Alloc>::insert(const_iterator __p, size_type __n, const value_type& _
break;
__e = iterator(__prev);
}
- throw;
+ });
+ for (--__n; __n != 0; --__n, (void)++__e, ++__ds) {
+ __e.__ptr_->__next_ = this->__create_node(/* prev = */ __e.__ptr_, /* next = */ nullptr, __x)->__as_link();
}
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ __guard.__complete();
__link_nodes(__p.__ptr_, __r.__ptr_, __e.__ptr_);
this->__size_ += __ds;
}
@@ -1276,14 +1272,7 @@ list<_Tp, _Alloc>::__insert_with_sentinel(const_iterator __p, _Iterator __f, _Se
++__ds;
__r = iterator(__node->__as_link());
iterator __e = __r;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (++__f; __f != __l; ++__f, (void)++__e, ++__ds) {
- __e.__ptr_->__next_ = this->__create_node(/* prev = */ __e.__ptr_, /* next = */ nullptr, *__f)->__as_link();
- }
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
+ auto __guard = std::__make_exception_guard([&] {
while (true) {
__base_pointer __prev = __e.__ptr_->__prev_;
__node_pointer __current = __e.__ptr_->__as_node();
@@ -1292,9 +1281,11 @@ list<_Tp, _Alloc>::__insert_with_sentinel(const_iterator __p, _Iterator __f, _Se
break;
__e = iterator(__prev);
}
- throw;
+ });
+ for (++__f; __f != __l; ++__f, (void)++__e, ++__ds) {
+ __e.__ptr_->__next_ = this->__create_node(/* prev = */ __e.__ptr_, /* next = */ nullptr, *__f)->__as_link();
}
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ __guard.__complete();
__link_nodes(__p.__ptr_, __r.__ptr_, __e.__ptr_);
this->__size_ += __ds;
}
@@ -1452,14 +1443,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::resize(size_type __n) {
++__ds;
iterator __r = iterator(__node->__as_link());
iterator __e = __r;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (--__n; __n != 0; --__n, (void)++__e, ++__ds) {
- __e.__ptr_->__next_ = this->__create_node(/* prev = */ __e.__ptr_, /* next = */ nullptr)->__as_link();
- }
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
+ auto __guard = std::__make_exception_guard([&] {
while (true) {
__base_pointer __prev = __e.__ptr_->__prev_;
__node_pointer __current = __e.__ptr_->__as_node();
@@ -1468,9 +1452,11 @@ _LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::resize(size_type __n) {
break;
__e = iterator(__prev);
}
- throw;
+ });
+ for (--__n; __n != 0; --__n, (void)++__e, ++__ds) {
+ __e.__ptr_->__next_ = this->__create_node(/* prev = */ __e.__ptr_, /* next = */ nullptr)->__as_link();
}
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ __guard.__complete();
__link_nodes_at_back(__r.__ptr_, __e.__ptr_);
this->__size_ += __ds;
}
@@ -1488,14 +1474,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::resize(size_type __n, cons
__base_pointer __nl = __node->__as_link();
iterator __r = iterator(__nl);
iterator __e = __r;
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (--__n; __n != 0; --__n, (void)++__e, ++__ds) {
- __e.__ptr_->__next_ = this->__create_node(/* prev = */ __e.__ptr_, /* next = */ nullptr, __x)->__as_link();
- }
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
+ auto __guard = std::__make_exception_guard([&] {
while (true) {
__base_pointer __prev = __e.__ptr_->__prev_;
__node_pointer __current = __e.__ptr_->__as_node();
@@ -1504,9 +1483,11 @@ _LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::resize(size_type __n, cons
break;
__e = iterator(__prev);
}
- throw;
+ });
+ for (--__n; __n != 0; --__n, (void)++__e, ++__ds) {
+ __e.__ptr_->__next_ = this->__create_node(/* prev = */ __e.__ptr_, /* next = */ nullptr, __x)->__as_link();
}
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ __guard.__complete();
__link_nodes(__base::__end_as_link(), __r.__ptr_, __e.__ptr_);
this->__size_ += __ds;
}
diff --git a/libcxx/include/string b/libcxx/include/string
index 363f27a..f5e05d8 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -640,6 +640,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
# include <__type_traits/is_trivially_relocatable.h>
# include <__type_traits/remove_cvref.h>
# include <__utility/default_three_way_comparator.h>
+# include <__utility/exception_guard.h>
# include <__utility/forward.h>
# include <__utility/is_pointer_in_range.h>
# include <__utility/move.h>
@@ -2652,17 +2653,10 @@ basic_string<_CharT, _Traits, _Allocator>::__init_with_sentinel(_InputIterator _
__rep_ = __rep();
__annotate_new(0);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (; __first != __last; ++__first)
- push_back(*__first);
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __reset_internal_buffer();
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([this] { __reset_internal_buffer(); });
+ for (; __first != __last; ++__first)
+ push_back(*__first);
+ __guard.__complete();
}
template <class _CharT, class _Traits, class _Allocator>
@@ -2679,17 +2673,10 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
basic_string<_CharT, _Traits, _Allocator>::__init_with_size(_InputIterator __first, _Sentinel __last, size_type __sz) {
pointer __p = __init_internal_buffer(__sz);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- auto __end = __copy_non_overlapping_range(std::move(__first), std::move(__last), std::__to_address(__p));
- traits_type::assign(*__end, value_type());
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __reset_internal_buffer();
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([this] { __reset_internal_buffer(); });
+ auto __end = __copy_non_overlapping_range(std::move(__first), std::move(__last), std::__to_address(__p));
+ traits_type::assign(*__end, value_type());
+ __guard.__complete();
}
template <class _CharT, class _Traits, class _Allocator>
diff --git a/libcxx/include/valarray b/libcxx/include/valarray
index 96501ca..215811d 100644
--- a/libcxx/include/valarray
+++ b/libcxx/include/valarray
@@ -362,6 +362,7 @@ template <class T> unspecified2 end(const valarray<T>& v);
# include <__memory/uninitialized_algorithms.h>
# include <__type_traits/decay.h>
# include <__type_traits/remove_reference.h>
+# include <__utility/exception_guard.h>
# include <__utility/move.h>
# include <__utility/swap.h>
# include <cmath>
@@ -1992,17 +1993,10 @@ template <class _Tp>
inline valarray<_Tp>::valarray(size_t __n) : __begin_(nullptr), __end_(nullptr) {
if (__n) {
__begin_ = __end_ = allocator<value_type>().allocate(__n);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (size_t __n_left = __n; __n_left; --__n_left, ++__end_)
- ::new ((void*)__end_) value_type();
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __clear(__n);
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { __clear(__n); });
+ for (size_t __n_left = __n; __n_left; --__n_left, ++__end_)
+ ::new ((void*)__end_) value_type();
+ __guard.__complete();
}
}
@@ -2015,17 +2009,10 @@ template <class _Tp>
valarray<_Tp>::valarray(const value_type* __p, size_t __n) : __begin_(nullptr), __end_(nullptr) {
if (__n) {
__begin_ = __end_ = allocator<value_type>().allocate(__n);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (size_t __n_left = __n; __n_left; ++__end_, ++__p, --__n_left)
- ::new ((void*)__end_) value_type(*__p);
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __clear(__n);
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { __clear(__n); });
+ for (size_t __n_left = __n; __n_left; ++__end_, ++__p, --__n_left)
+ ::new ((void*)__end_) value_type(*__p);
+ __guard.__complete();
}
}
@@ -2033,17 +2020,10 @@ template <class _Tp>
valarray<_Tp>::valarray(const valarray& __v) : __begin_(nullptr), __end_(nullptr) {
if (__v.size()) {
__begin_ = __end_ = allocator<value_type>().allocate(__v.size());
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (value_type* __p = __v.__begin_; __p != __v.__end_; ++__end_, ++__p)
- ::new ((void*)__end_) value_type(*__p);
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __clear(__v.size());
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { __clear(__v.size()); });
+ for (value_type* __p = __v.__begin_; __p != __v.__end_; ++__end_, ++__p)
+ ::new ((void*)__end_) value_type(*__p);
+ __guard.__complete();
}
}
@@ -2059,18 +2039,11 @@ valarray<_Tp>::valarray(initializer_list<value_type> __il) : __begin_(nullptr),
const size_t __n = __il.size();
if (__n) {
__begin_ = __end_ = allocator<value_type>().allocate(__n);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- size_t __n_left = __n;
- for (const value_type* __p = __il.begin(); __n_left; ++__end_, ++__p, --__n_left)
- ::new ((void*)__end_) value_type(*__p);
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __clear(__n);
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { __clear(__n); });
+ size_t __n_left = __n;
+ for (const value_type* __p = __il.begin(); __n_left; ++__end_, ++__p, --__n_left)
+ ::new ((void*)__end_) value_type(*__p);
+ __guard.__complete();
}
}
@@ -2081,18 +2054,11 @@ valarray<_Tp>::valarray(const slice_array<value_type>& __sa) : __begin_(nullptr)
const size_t __n = __sa.__size_;
if (__n) {
__begin_ = __end_ = allocator<value_type>().allocate(__n);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- size_t __n_left = __n;
- for (const value_type* __p = __sa.__vp_; __n_left; ++__end_, __p += __sa.__stride_, --__n_left)
- ::new ((void*)__end_) value_type(*__p);
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __clear(__n);
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { __clear(__n); });
+ size_t __n_left = __n;
+ for (const value_type* __p = __sa.__vp_; __n_left; ++__end_, __p += __sa.__stride_, --__n_left)
+ ::new ((void*)__end_) value_type(*__p);
+ __guard.__complete();
}
}
@@ -2101,19 +2067,12 @@ valarray<_Tp>::valarray(const gslice_array<value_type>& __ga) : __begin_(nullptr
const size_t __n = __ga.__1d_.size();
if (__n) {
__begin_ = __end_ = allocator<value_type>().allocate(__n);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- typedef const size_t* _Ip;
- const value_type* __s = __ga.__vp_;
- for (_Ip __i = __ga.__1d_.__begin_, __e = __ga.__1d_.__end_; __i != __e; ++__i, ++__end_)
- ::new ((void*)__end_) value_type(__s[*__i]);
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __clear(__n);
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { __clear(__n); });
+ typedef const size_t* _Ip;
+ const value_type* __s = __ga.__vp_;
+ for (_Ip __i = __ga.__1d_.__begin_, __e = __ga.__1d_.__end_; __i != __e; ++__i, ++__end_)
+ ::new ((void*)__end_) value_type(__s[*__i]);
+ __guard.__complete();
}
}
@@ -2122,19 +2081,12 @@ valarray<_Tp>::valarray(const mask_array<value_type>& __ma) : __begin_(nullptr),
const size_t __n = __ma.__1d_.size();
if (__n) {
__begin_ = __end_ = allocator<value_type>().allocate(__n);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- typedef const size_t* _Ip;
- const value_type* __s = __ma.__vp_;
- for (_Ip __i = __ma.__1d_.__begin_, __e = __ma.__1d_.__end_; __i != __e; ++__i, ++__end_)
- ::new ((void*)__end_) value_type(__s[*__i]);
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __clear(__n);
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { __clear(__n); });
+ typedef const size_t* _Ip;
+ const value_type* __s = __ma.__vp_;
+ for (_Ip __i = __ma.__1d_.__begin_, __e = __ma.__1d_.__end_; __i != __e; ++__i, ++__end_)
+ ::new ((void*)__end_) value_type(__s[*__i]);
+ __guard.__complete();
}
}
@@ -2143,19 +2095,12 @@ valarray<_Tp>::valarray(const indirect_array<value_type>& __ia) : __begin_(nullp
const size_t __n = __ia.__1d_.size();
if (__n) {
__begin_ = __end_ = allocator<value_type>().allocate(__n);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- typedef const size_t* _Ip;
- const value_type* __s = __ia.__vp_;
- for (_Ip __i = __ia.__1d_.__begin_, __e = __ia.__1d_.__end_; __i != __e; ++__i, ++__end_)
- ::new ((void*)__end_) value_type(__s[*__i]);
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __clear(__n);
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { __clear(__n); });
+ typedef const size_t* _Ip;
+ const value_type* __s = __ia.__vp_;
+ for (_Ip __i = __ia.__1d_.__begin_, __e = __ia.__1d_.__end_; __i != __e; ++__i, ++__end_)
+ ::new ((void*)__end_) value_type(__s[*__i]);
+ __guard.__complete();
}
}
@@ -2644,17 +2589,10 @@ void valarray<_Tp>::resize(size_t __n, value_type __x) {
__clear(size());
if (__n) {
__begin_ = __end_ = allocator<value_type>().allocate(__n);
-# if _LIBCPP_HAS_EXCEPTIONS
- try {
-# endif // _LIBCPP_HAS_EXCEPTIONS
- for (size_t __n_left = __n; __n_left; --__n_left, ++__end_)
- ::new ((void*)__end_) value_type(__x);
-# if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- __clear(__n);
- throw;
- }
-# endif // _LIBCPP_HAS_EXCEPTIONS
+ auto __guard = std::__make_exception_guard([&] { __clear(__n); });
+ for (size_t __n_left = __n; __n_left; --__n_left, ++__end_)
+ ::new ((void*)__end_) value_type(__x);
+ __guard.__complete();
}
}
diff --git a/libcxx/src/filesystem/error.h b/libcxx/src/filesystem/error.h
index 52a18b2..db5d1ae 100644
--- a/libcxx/src/filesystem/error.h
+++ b/libcxx/src/filesystem/error.h
@@ -128,17 +128,8 @@ struct ErrorHandler {
T report(const error_code& ec, const char* msg, ...) const {
va_list ap;
va_start(ap, msg);
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- report_impl(ec, msg, ap);
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- va_end(ap);
- throw;
- }
-#endif // _LIBCPP_HAS_EXCEPTIONS
- va_end(ap);
+ __scope_guard guard([&] { va_end(ap); });
+ report_impl(ec, msg, ap);
return error_value<T>();
}
@@ -148,17 +139,8 @@ struct ErrorHandler {
T report(errc const& err, const char* msg, ...) const {
va_list ap;
va_start(ap, msg);
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- report_impl(make_error_code(err), msg, ap);
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- va_end(ap);
- throw;
- }
-#endif // _LIBCPP_HAS_EXCEPTIONS
- va_end(ap);
+ __scope_guard guard([&] { va_end(ap); });
+ report_impl(make_error_code(err), msg, ap);
return error_value<T>();
}
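va_end must execute on every path out of report(), including the exceptional one, and the old code duplicated it in a catch block; the scope guard (which, unlike the exception guard, always fires) centralizes the call. The same pattern as a standalone function, with illustrative names:

#include <cstdarg>
#include <cstdio>
#include <string>
#include <utility>

template <class F>
struct scope_guard { // minimal guard: runs f_ at scope exit, success or not
  explicit scope_guard(F f) : f_(std::move(f)) {}
  ~scope_guard() { f_(); }
  F f_;
};

inline std::string format_sketch(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  scope_guard guard([&] { va_end(ap); }); // va_end runs even if we throw
  char buf[256];
  std::vsnprintf(buf, sizeof(buf), fmt, ap);
  return std::string(buf); // constructing the string may throw bad_alloc
}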
diff --git a/libcxx/src/filesystem/format_string.h b/libcxx/src/filesystem/format_string.h
index e91475e..8d17b02 100644
--- a/libcxx/src/filesystem/format_string.h
+++ b/libcxx/src/filesystem/format_string.h
@@ -11,6 +11,7 @@
#include <__assert>
#include <__config>
+#include <__utility/scope_guard.h>
#include <array>
#include <cstdarg>
#include <cstddef>
@@ -55,17 +56,8 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) string format_string(const cha
string ret;
va_list ap;
va_start(ap, msg);
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- ret = detail::vformat_string(msg, ap);
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- va_end(ap);
- throw;
- }
-#endif // _LIBCPP_HAS_EXCEPTIONS
- va_end(ap);
+ __scope_guard guard([&] { va_end(ap); });
+ ret = detail::vformat_string(msg, ap);
return ret;
}
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index da73586..0f695d4 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -215,63 +215,58 @@ locale::__imp::__imp(size_t refs) : facet(refs), facets_(N), name_("C") {
}
locale::__imp::__imp(const string& name, size_t refs) : facet(refs), facets_(N), name_(name) {
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- facets_ = locale::classic().__locale_->facets_;
+ __exception_guard guard([&] {
for (unsigned i = 0; i < facets_.size(); ++i)
if (facets_[i])
- facets_[i]->__add_shared();
- install(new collate_byname<char>(name_));
+ facets_[i]->__release_shared();
+ });
+ facets_ = locale::classic().__locale_->facets_;
+ for (unsigned i = 0; i < facets_.size(); ++i)
+ if (facets_[i])
+ facets_[i]->__add_shared();
+ install(new collate_byname<char>(name_));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new collate_byname<wchar_t>(name_));
+ install(new collate_byname<wchar_t>(name_));
#endif
- install(new ctype_byname<char>(name_));
+ install(new ctype_byname<char>(name_));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new ctype_byname<wchar_t>(name_));
+ install(new ctype_byname<wchar_t>(name_));
#endif
- install(new codecvt_byname<char, char, mbstate_t>(name_));
+ install(new codecvt_byname<char, char, mbstate_t>(name_));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new codecvt_byname<wchar_t, char, mbstate_t>(name_));
+ install(new codecvt_byname<wchar_t, char, mbstate_t>(name_));
#endif
- _LIBCPP_SUPPRESS_DEPRECATED_PUSH
- install(new codecvt_byname<char16_t, char, mbstate_t>(name_));
- install(new codecvt_byname<char32_t, char, mbstate_t>(name_));
- _LIBCPP_SUPPRESS_DEPRECATED_POP
+ _LIBCPP_SUPPRESS_DEPRECATED_PUSH
+ install(new codecvt_byname<char16_t, char, mbstate_t>(name_));
+ install(new codecvt_byname<char32_t, char, mbstate_t>(name_));
+ _LIBCPP_SUPPRESS_DEPRECATED_POP
#if _LIBCPP_HAS_CHAR8_T
- install(new codecvt_byname<char16_t, char8_t, mbstate_t>(name_));
- install(new codecvt_byname<char32_t, char8_t, mbstate_t>(name_));
+ install(new codecvt_byname<char16_t, char8_t, mbstate_t>(name_));
+ install(new codecvt_byname<char32_t, char8_t, mbstate_t>(name_));
#endif
- install(new numpunct_byname<char>(name_));
+ install(new numpunct_byname<char>(name_));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new numpunct_byname<wchar_t>(name_));
+ install(new numpunct_byname<wchar_t>(name_));
#endif
- install(new moneypunct_byname<char, false>(name_));
- install(new moneypunct_byname<char, true>(name_));
+ install(new moneypunct_byname<char, false>(name_));
+ install(new moneypunct_byname<char, true>(name_));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new moneypunct_byname<wchar_t, false>(name_));
- install(new moneypunct_byname<wchar_t, true>(name_));
+ install(new moneypunct_byname<wchar_t, false>(name_));
+ install(new moneypunct_byname<wchar_t, true>(name_));
#endif
- install(new time_get_byname<char>(name_));
+ install(new time_get_byname<char>(name_));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new time_get_byname<wchar_t>(name_));
+ install(new time_get_byname<wchar_t>(name_));
#endif
- install(new time_put_byname<char>(name_));
+ install(new time_put_byname<char>(name_));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new time_put_byname<wchar_t>(name_));
+ install(new time_put_byname<wchar_t>(name_));
#endif
- install(new messages_byname<char>(name_));
+ install(new messages_byname<char>(name_));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new messages_byname<wchar_t>(name_));
+ install(new messages_byname<wchar_t>(name_));
#endif
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- for (unsigned i = 0; i < facets_.size(); ++i)
- if (facets_[i])
- facets_[i]->__release_shared();
- throw;
- }
-#endif // _LIBCPP_HAS_EXCEPTIONS
+ guard.__complete();
}
locale::__imp::__imp(const __imp& other) : facets_(max<size_t>(N, other.facets_.size())), name_(other.name_) {
@@ -287,71 +282,66 @@ locale::__imp::__imp(const __imp& other, const string& name, locale::category c)
for (unsigned i = 0; i < facets_.size(); ++i)
if (facets_[i])
facets_[i]->__add_shared();
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- if (c & locale::collate) {
- install(new collate_byname<char>(name));
+ __exception_guard guard([&] {
+ for (unsigned i = 0; i < facets_.size(); ++i)
+ if (facets_[i])
+ facets_[i]->__release_shared();
+ });
+ if (c & locale::collate) {
+ install(new collate_byname<char>(name));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new collate_byname<wchar_t>(name));
+ install(new collate_byname<wchar_t>(name));
#endif
- }
- if (c & locale::ctype) {
- install(new ctype_byname<char>(name));
+ }
+ if (c & locale::ctype) {
+ install(new ctype_byname<char>(name));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new ctype_byname<wchar_t>(name));
+ install(new ctype_byname<wchar_t>(name));
#endif
- install(new codecvt_byname<char, char, mbstate_t>(name));
+ install(new codecvt_byname<char, char, mbstate_t>(name));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new codecvt_byname<wchar_t, char, mbstate_t>(name));
+ install(new codecvt_byname<wchar_t, char, mbstate_t>(name));
#endif
- _LIBCPP_SUPPRESS_DEPRECATED_PUSH
- install(new codecvt_byname<char16_t, char, mbstate_t>(name));
- install(new codecvt_byname<char32_t, char, mbstate_t>(name));
- _LIBCPP_SUPPRESS_DEPRECATED_POP
+ _LIBCPP_SUPPRESS_DEPRECATED_PUSH
+ install(new codecvt_byname<char16_t, char, mbstate_t>(name));
+ install(new codecvt_byname<char32_t, char, mbstate_t>(name));
+ _LIBCPP_SUPPRESS_DEPRECATED_POP
#if _LIBCPP_HAS_CHAR8_T
- install(new codecvt_byname<char16_t, char8_t, mbstate_t>(name));
- install(new codecvt_byname<char32_t, char8_t, mbstate_t>(name));
+ install(new codecvt_byname<char16_t, char8_t, mbstate_t>(name));
+ install(new codecvt_byname<char32_t, char8_t, mbstate_t>(name));
#endif
- }
- if (c & locale::monetary) {
- install(new moneypunct_byname<char, false>(name));
- install(new moneypunct_byname<char, true>(name));
+ }
+ if (c & locale::monetary) {
+ install(new moneypunct_byname<char, false>(name));
+ install(new moneypunct_byname<char, true>(name));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new moneypunct_byname<wchar_t, false>(name));
- install(new moneypunct_byname<wchar_t, true>(name));
+ install(new moneypunct_byname<wchar_t, false>(name));
+ install(new moneypunct_byname<wchar_t, true>(name));
#endif
- }
- if (c & locale::numeric) {
- install(new numpunct_byname<char>(name));
+ }
+ if (c & locale::numeric) {
+ install(new numpunct_byname<char>(name));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new numpunct_byname<wchar_t>(name));
+ install(new numpunct_byname<wchar_t>(name));
#endif
- }
- if (c & locale::time) {
- install(new time_get_byname<char>(name));
+ }
+ if (c & locale::time) {
+ install(new time_get_byname<char>(name));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new time_get_byname<wchar_t>(name));
+ install(new time_get_byname<wchar_t>(name));
#endif
- install(new time_put_byname<char>(name));
+ install(new time_put_byname<char>(name));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new time_put_byname<wchar_t>(name));
+ install(new time_put_byname<wchar_t>(name));
#endif
- }
- if (c & locale::messages) {
- install(new messages_byname<char>(name));
+ }
+ if (c & locale::messages) {
+ install(new messages_byname<char>(name));
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install(new messages_byname<wchar_t>(name));
+ install(new messages_byname<wchar_t>(name));
#endif
- }
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- for (unsigned i = 0; i < facets_.size(); ++i)
- if (facets_[i])
- facets_[i]->__release_shared();
- throw;
}
-#endif // _LIBCPP_HAS_EXCEPTIONS
+ guard.__complete();
}
template <class F>
@@ -366,87 +356,83 @@ locale::__imp::__imp(const __imp& other, const __imp& one, locale::category c)
for (unsigned i = 0; i < facets_.size(); ++i)
if (facets_[i])
facets_[i]->__add_shared();
-#if _LIBCPP_HAS_EXCEPTIONS
- try {
-#endif // _LIBCPP_HAS_EXCEPTIONS
- if (c & locale::collate) {
- install_from<std::collate<char> >(one);
+ __exception_guard guard([&] {
+ for (unsigned i = 0; i < facets_.size(); ++i)
+ if (facets_[i])
+ facets_[i]->__release_shared();
+ });
+
+ if (c & locale::collate) {
+ install_from<std::collate<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<std::collate<wchar_t> >(one);
+ install_from<std::collate<wchar_t> >(one);
#endif
- }
- if (c & locale::ctype) {
- install_from<std::ctype<char> >(one);
+ }
+ if (c & locale::ctype) {
+ install_from<std::ctype<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<std::ctype<wchar_t> >(one);
+ install_from<std::ctype<wchar_t> >(one);
#endif
- install_from<std::codecvt<char, char, mbstate_t> >(one);
- _LIBCPP_SUPPRESS_DEPRECATED_PUSH
- install_from<std::codecvt<char16_t, char, mbstate_t> >(one);
- install_from<std::codecvt<char32_t, char, mbstate_t> >(one);
- _LIBCPP_SUPPRESS_DEPRECATED_POP
+ install_from<std::codecvt<char, char, mbstate_t> >(one);
+ _LIBCPP_SUPPRESS_DEPRECATED_PUSH
+ install_from<std::codecvt<char16_t, char, mbstate_t> >(one);
+ install_from<std::codecvt<char32_t, char, mbstate_t> >(one);
+ _LIBCPP_SUPPRESS_DEPRECATED_POP
#if _LIBCPP_HAS_CHAR8_T
- install_from<std::codecvt<char16_t, char8_t, mbstate_t> >(one);
- install_from<std::codecvt<char32_t, char8_t, mbstate_t> >(one);
+ install_from<std::codecvt<char16_t, char8_t, mbstate_t> >(one);
+ install_from<std::codecvt<char32_t, char8_t, mbstate_t> >(one);
#endif
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<std::codecvt<wchar_t, char, mbstate_t> >(one);
+ install_from<std::codecvt<wchar_t, char, mbstate_t> >(one);
#endif
- }
- if (c & locale::monetary) {
- install_from<moneypunct<char, false> >(one);
- install_from<moneypunct<char, true> >(one);
+ }
+ if (c & locale::monetary) {
+ install_from<moneypunct<char, false> >(one);
+ install_from<moneypunct<char, true> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<moneypunct<wchar_t, false> >(one);
- install_from<moneypunct<wchar_t, true> >(one);
+ install_from<moneypunct<wchar_t, false> >(one);
+ install_from<moneypunct<wchar_t, true> >(one);
#endif
- install_from<money_get<char> >(one);
+ install_from<money_get<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<money_get<wchar_t> >(one);
+ install_from<money_get<wchar_t> >(one);
#endif
- install_from<money_put<char> >(one);
+ install_from<money_put<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<money_put<wchar_t> >(one);
+ install_from<money_put<wchar_t> >(one);
#endif
- }
- if (c & locale::numeric) {
- install_from<numpunct<char> >(one);
+ }
+ if (c & locale::numeric) {
+ install_from<numpunct<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<numpunct<wchar_t> >(one);
+ install_from<numpunct<wchar_t> >(one);
#endif
- install_from<num_get<char> >(one);
+ install_from<num_get<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<num_get<wchar_t> >(one);
+ install_from<num_get<wchar_t> >(one);
#endif
- install_from<num_put<char> >(one);
+ install_from<num_put<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<num_put<wchar_t> >(one);
+ install_from<num_put<wchar_t> >(one);
#endif
- }
- if (c & locale::time) {
- install_from<time_get<char> >(one);
+ }
+ if (c & locale::time) {
+ install_from<time_get<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<time_get<wchar_t> >(one);
+ install_from<time_get<wchar_t> >(one);
#endif
- install_from<time_put<char> >(one);
+ install_from<time_put<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<time_put<wchar_t> >(one);
+ install_from<time_put<wchar_t> >(one);
#endif
- }
- if (c & locale::messages) {
- install_from<std::messages<char> >(one);
+ }
+ if (c & locale::messages) {
+ install_from<std::messages<char> >(one);
#if _LIBCPP_HAS_WIDE_CHARACTERS
- install_from<std::messages<wchar_t> >(one);
+ install_from<std::messages<wchar_t> >(one);
#endif
- }
-#if _LIBCPP_HAS_EXCEPTIONS
- } catch (...) {
- for (unsigned i = 0; i < facets_.size(); ++i)
- if (facets_[i])
- facets_[i]->__release_shared();
- throw;
}
-#endif // _LIBCPP_HAS_EXCEPTIONS
+ guard.__complete();
}
locale::__imp::__imp(const __imp& other, facet* f, long id)
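The locale constructors first take a reference on every facet copied from the source locale and then install the byname replacements; if any install() throws, the guard releases each reference taken, replacing a catch block that duplicated the release loop verbatim. The take-refs-then-rollback shape in isolation (hypothetical intrusively counted Facet):

#include <vector>

struct Facet {
  int refs = 0;
  void add_shared() { ++refs; }
  void release_shared() { --refs; } // the real code destroys at zero
};

inline void adopt_facets(std::vector<Facet*>& facets) {
  for (Facet* f : facets)
    if (f) f->add_shared();
  try {
    // ... install replacement facets; any step may throw ...
  } catch (...) {
    for (Facet* f : facets)
      if (f) f->release_shared();
    throw;
  }
}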
diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
index fc5625d..523c316 100644
--- a/libcxx/test/std/localization/codecvt_unicode.pass.cpp
+++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
@@ -2222,10 +2222,11 @@ void test_utf16_ucs2_codecvts() {
#endif
}
-int main() {
+int main(int, char**) {
test_utf8_utf32_codecvts();
test_utf8_utf16_codecvts();
test_utf8_ucs2_codecvts();
test_utf16_utf32_codecvts();
test_utf16_ucs2_codecvts();
+ return 0;
}
diff --git a/libcxx/test/std/ranges/range.access/include.iterator.pass.cpp b/libcxx/test/std/ranges/range.access/include.iterator.pass.cpp
index bb2cda0..947eb17 100644
--- a/libcxx/test/std/ranges/range.access/include.iterator.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/include.iterator.pass.cpp
@@ -62,7 +62,8 @@ constexpr bool test() {
return true;
}
-int main() {
+int main(int, char**) {
test();
static_assert(test());
+ return 0;
}
diff --git a/libcxx/utils/ci/BOT_OWNERS.txt b/libcxx/utils/ci/BOT_OWNERS.txt
index 90f8272..3a92ba2 100644
--- a/libcxx/utils/ci/BOT_OWNERS.txt
+++ b/libcxx/utils/ci/BOT_OWNERS.txt
@@ -21,3 +21,8 @@ N: Android libc++
E: pirama@google.com, sharjeelkhan@google.com
G: pirama-arumuga-nainar, Sharjeel-Khan
D: Emulator-based x86[-64] libc++ CI testing
+
+N: FreeBSD libc++
+E: emaste@freebsd.org
+G: emaste
+D: FreeBSD x86-64 libc++ CI testing
diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml
index 3f3e6b4..e7fda65 100644
--- a/libcxx/utils/ci/buildkite-pipeline.yml
+++ b/libcxx/utils/ci/buildkite-pipeline.yml
@@ -103,6 +103,7 @@ steps:
queue: libcxx-builders
os: aix
<<: *common
+ skip: "Until https://github.com/llvm/llvm-project/issues/162516 has been resolved"
- label: AIX (64-bit)
command: libcxx/utils/ci/run-buildbot aix
@@ -114,6 +115,7 @@ steps:
queue: libcxx-builders
os: aix
<<: *common
+ skip: "Until https://github.com/llvm/llvm-project/issues/162516 has been resolved"
- group: ':freebsd: FreeBSD'
steps:
diff --git a/libcxx/utils/libcxx/test/dsl.py b/libcxx/utils/libcxx/test/dsl.py
index 9a97e61..2d3a72c 100644
--- a/libcxx/utils/libcxx/test/dsl.py
+++ b/libcxx/utils/libcxx/test/dsl.py
@@ -296,7 +296,7 @@ def hasAnyLocale(config, locales):
+ name_string_literals
+ """, nullptr,
};
- int main() {
+ int main(int, char**) {
for (size_t i = 0; test_locale_names[i]; i++) {
if (::setlocale(LC_ALL, test_locale_names[i]) != NULL) {
return 0;
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 0070b76..7d6e78d 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -30,7 +30,7 @@ def _getAndroidDeviceApi(cfg):
r"""
#include <android/api-level.h>
#include <stdio.h>
- int main() {
+ int main(int, char**) {
printf("%d\n", android_get_device_api_level());
return 0;
}
@@ -66,7 +66,7 @@ def _mingwSupportsModules(cfg):
#else
// __MINGW64_VERSION_MAJOR > 12 should be ok.
#endif
- int main() { return 0; }
+ int main(int, char**) { return 0; }
""",
)
@@ -474,7 +474,7 @@ def _getLocaleFlagsAction(cfg, locale, alts, members):
#include <wchar.h>
// Print each requested locale conversion member on separate lines.
- int main() {
+ int main(int, char**) {
const char* locales[] = { %s };
for (int loc_i = 0; loc_i < %d; ++loc_i) {
if (!setlocale(LC_ALL, locales[loc_i])) {
@@ -629,7 +629,7 @@ DEFAULT_FEATURES += [
"""
#include <stdio.h>
#include <windows.h>
- int main() {
+ int main(int, char**) {
CHAR tempDirPath[MAX_PATH];
DWORD tempPathRet = GetTempPathA(MAX_PATH, tempDirPath);
if (tempPathRet == 0 || tempPathRet > MAX_PATH) {
diff --git a/libcxxabi/test/forced_unwind4.pass.cpp b/libcxxabi/test/forced_unwind4.pass.cpp
index 15efca8..fbc8128 100644
--- a/libcxxabi/test/forced_unwind4.pass.cpp
+++ b/libcxxabi/test/forced_unwind4.pass.cpp
@@ -17,9 +17,7 @@
// Android/Bionic does not support pthread_cancel.
#ifdef __BIONIC__
-int main() {
- return 0;
-}
+int main(int, char**) { return 0; }
#else
#include <chrono>
@@ -45,7 +43,7 @@ static void* test(void* arg) {
return (void*)1;
}
-int main() {
+int main(int, char**) {
pthread_t child_thread;
std::unique_lock<std::mutex> lk(cv_m);
pthread_create(&child_thread, 0, test, (void*)0);
diff --git a/libcxxabi/test/native/AArch64/ra_sign_state.pass.cpp b/libcxxabi/test/native/AArch64/ra_sign_state.pass.cpp
index 35c05fc..dec7aca 100644
--- a/libcxxabi/test/native/AArch64/ra_sign_state.pass.cpp
+++ b/libcxxabi/test/native/AArch64/ra_sign_state.pass.cpp
@@ -53,7 +53,7 @@ void bazz() {
}
}
-int main() {
+int main(int, char**) {
try {
bazz();
} catch (int i) {
diff --git a/libunwind/test/aarch64_vg_unwind.pass.cpp b/libunwind/test/aarch64_vg_unwind.pass.cpp
index b6bb969..1c139a7 100644
--- a/libunwind/test/aarch64_vg_unwind.pass.cpp
+++ b/libunwind/test/aarch64_vg_unwind.pass.cpp
@@ -62,4 +62,7 @@ __attribute__((noinline)) void foo() {
// smstop sm
}
-int main() { foo(); }
+int main(int, char **) {
+ foo();
+ return 0;
+}
diff --git a/libunwind/test/aix_signal_unwind.pass.sh.S b/libunwind/test/aix_signal_unwind.pass.sh.S
index 2c0cf14..bd0b8ac 100644
--- a/libunwind/test/aix_signal_unwind.pass.sh.S
+++ b/libunwind/test/aix_signal_unwind.pass.sh.S
@@ -126,10 +126,11 @@ extern "C" __attribute__((noinline)) void foo() {
bar();
}
-int main() {
+int main(int, char**) {
// Set signal handler for SIGSEGV.
signal(SIGSEGV, handler);
foo();
+ return 0;
}
#else // Assembly code for abc().
diff --git a/libunwind/test/bad_unwind_info.pass.cpp b/libunwind/test/bad_unwind_info.pass.cpp
index b3284e8..272a83f 100644
--- a/libunwind/test/bad_unwind_info.pass.cpp
+++ b/libunwind/test/bad_unwind_info.pass.cpp
@@ -77,4 +77,7 @@ extern "C" void stepper() {
assert(unw_step(&cursor) <= 0);
}
-int main() { bad_unwind_info(); }
+int main(int, char **) {
+ bad_unwind_info();
+ return 0;
+}
diff --git a/libunwind/test/eh_frame_fde_pc_range.pass.cpp b/libunwind/test/eh_frame_fde_pc_range.pass.cpp
index 852612b..795ce66 100644
--- a/libunwind/test/eh_frame_fde_pc_range.pass.cpp
+++ b/libunwind/test/eh_frame_fde_pc_range.pass.cpp
@@ -21,7 +21,7 @@
// RUN: %{build}
// RUN: %{objcopy} --dump-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe
-// RUN: echo -ne '\xFF' | dd of=%t_ehf_hdr.bin bs=1 seek=2 count=2 conv=notrunc status=none
+// RUN: echo -ne '\xFF' | dd of=%t_ehf_hdr.bin bs=1 seek=2 count=2 conv=notrunc status=none
// RUN: %{objcopy} --update-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe
// RUN: %{exec} %t.exe
@@ -53,7 +53,7 @@ void f() {
assert(fde_fpc == fde_fpc1);
}
-int main() {
+int main(int, char **) {
f();
return 0;
}
diff --git a/libunwind/test/floatregister.pass.cpp b/libunwind/test/floatregister.pass.cpp
index ce4481b..018b792 100644
--- a/libunwind/test/floatregister.pass.cpp
+++ b/libunwind/test/floatregister.pass.cpp
@@ -52,7 +52,7 @@ __attribute__((noinline)) void foo() {
_Unwind_Backtrace(frame_handler, NULL);
}
-__attribute__((section("main_func"))) int main() {
+__attribute__((section("main_func"))) int main(int, char **) {
foo();
return -2;
}
diff --git a/libunwind/test/forceunwind.pass.cpp b/libunwind/test/forceunwind.pass.cpp
index 344034e..9e032fc 100644
--- a/libunwind/test/forceunwind.pass.cpp
+++ b/libunwind/test/forceunwind.pass.cpp
@@ -72,7 +72,7 @@ __attribute__((noinline)) void foo() {
_Unwind_ForcedUnwind(e, stop, (void *)&foo);
}
-__attribute__((section("main_func"))) int main() {
+__attribute__((section("main_func"))) int main(int, char **) {
foo();
return -2;
}
diff --git a/libunwind/test/unw_resume.pass.cpp b/libunwind/test/unw_resume.pass.cpp
index 2b7470b..e1f40b4 100644
--- a/libunwind/test/unw_resume.pass.cpp
+++ b/libunwind/test/unw_resume.pass.cpp
@@ -25,7 +25,7 @@ __attribute__((noinline)) void test_unw_resume() {
unw_resume(&cursor);
}
-int main() {
+int main(int, char **) {
test_unw_resume();
return 0;
}
diff --git a/libunwind/test/unwind_scalable_vectors.pass.cpp b/libunwind/test/unwind_scalable_vectors.pass.cpp
index a5c5947..57ef4d7 100644
--- a/libunwind/test/unwind_scalable_vectors.pass.cpp
+++ b/libunwind/test/unwind_scalable_vectors.pass.cpp
@@ -34,7 +34,10 @@ __attribute__((noinline)) static void foo() {
asm volatile("" ::"vr"(v)); // Dummy inline asm to use v.
}
-int main() { foo(); }
+int main(int, char **) {
+ foo();
+ return 0;
+}
#else
-int main() { return 0; }
+int main(int, char **) { return 0; }
#endif
diff --git a/lldb/source/Target/StructuredDataPlugin.cpp b/lldb/source/Target/StructuredDataPlugin.cpp
index 8e3ceb0..8ce7f9f 100644
--- a/lldb/source/Target/StructuredDataPlugin.cpp
+++ b/lldb/source/Target/StructuredDataPlugin.cpp
@@ -52,7 +52,7 @@ void StructuredDataPlugin::InitializeBasePluginForDebugger(Debugger &debugger) {
if (!parent_command)
return;
- // Create the structured-data ommand object.
+ // Create the structured-data command object.
auto command_name = "structured-data";
auto command_sp = CommandObjectSP(new CommandStructuredData(interpreter));
diff --git a/lldb/test/API/commands/expression/import-std-module/deque-basic/TestDequeFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/deque-basic/TestDequeFromStdModule.py
index 0fb6e88..5e819b7 100644
--- a/lldb/test/API/commands/expression/import-std-module/deque-basic/TestDequeFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/deque-basic/TestDequeFromStdModule.py
@@ -11,6 +11,9 @@ class TestBasicDeque(TestBase):
@add_test_categories(["libc++"])
@skipIf(compiler=no_match("clang"))
@skipIf(macos_version=["<", "15.0"])
+ @skipIf(
+ bugnumber="ASTImport of lambdas not supported: https://github.com/llvm/llvm-project/issues/149477"
+ )
def test(self):
self.build()
diff --git a/lldb/test/API/commands/expression/import-std-module/deque-dbg-info-content/TestDbgInfoContentDequeFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/deque-dbg-info-content/TestDbgInfoContentDequeFromStdModule.py
index e631a87..e8676b2 100644
--- a/lldb/test/API/commands/expression/import-std-module/deque-dbg-info-content/TestDbgInfoContentDequeFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/deque-dbg-info-content/TestDbgInfoContentDequeFromStdModule.py
@@ -12,6 +12,9 @@ class TestDbgInfoContentDeque(TestBase):
@skipIf(compiler=no_match("clang"))
@skipIf(compiler="clang", compiler_version=["<", "18.0"])
@skipIf(macos_version=["<", "15.0"])
+ @skipIf(
+ bugnumber="ASTImport of lambdas not supported: https://github.com/llvm/llvm-project/issues/149477"
+ )
def test(self):
self.build()
diff --git a/lldb/test/API/commands/expression/import-std-module/forward_list/TestForwardListFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/forward_list/TestForwardListFromStdModule.py
index a6ba081..9581155 100644
--- a/lldb/test/API/commands/expression/import-std-module/forward_list/TestForwardListFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/forward_list/TestForwardListFromStdModule.py
@@ -11,6 +11,9 @@ class TestBasicForwardList(TestBase):
@add_test_categories(["libc++"])
@skipIf(compiler=no_match("clang"))
@skipIf(macos_version=["<", "15.0"])
+ @skipIf(
+ bugnumber="ASTImport of lambdas not supported: https://github.com/llvm/llvm-project/issues/149477"
+ )
def test(self):
self.build()
diff --git a/lldb/test/API/commands/expression/import-std-module/list-dbg-info-content/TestDbgInfoContentListFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/list-dbg-info-content/TestDbgInfoContentListFromStdModule.py
index b26bd7d..923551c 100644
--- a/lldb/test/API/commands/expression/import-std-module/list-dbg-info-content/TestDbgInfoContentListFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/list-dbg-info-content/TestDbgInfoContentListFromStdModule.py
@@ -13,6 +13,9 @@ class TestDbgInfoContentList(TestBase):
@skipIf(compiler=no_match("clang"))
@skipIf(compiler="clang", compiler_version=["<", "12.0"])
@skipIf(macos_version=["<", "15.0"])
+ @skipIf(
+ bugnumber="ASTImport of lambdas not supported: https://github.com/llvm/llvm-project/issues/149477"
+ )
def test(self):
self.build()
diff --git a/lldb/test/API/commands/expression/import-std-module/list/TestListFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/list/TestListFromStdModule.py
index 6253a35..d6c8892 100644
--- a/lldb/test/API/commands/expression/import-std-module/list/TestListFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/list/TestListFromStdModule.py
@@ -11,6 +11,9 @@ class TestBasicList(TestBase):
@add_test_categories(["libc++"])
@skipIf(compiler=no_match("clang"))
@skipIf(macos_version=["<", "15.0"])
+ @skipIf(
+ bugnumber="ASTImport of lambdas not supported: https://github.com/llvm/llvm-project/issues/149477"
+ )
def test(self):
self.build()
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 01ca8da..1694a33 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -42,6 +42,7 @@ __OMP_TYPE(Double)
OMP_TYPE(SizeTy, M.getDataLayout().getIntPtrType(Ctx))
OMP_TYPE(Int63, Type::getIntNTy(Ctx, 63))
+OMP_TYPE(FuncPtrTy, PointerType::get(Ctx, M.getDataLayout().getProgramAddressSpace()))
__OMP_PTR_TYPE(VoidPtr)
__OMP_PTR_TYPE(VoidPtrPtr)
@@ -471,7 +472,7 @@ __OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentPtr, KernelLaunchEn
__OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
- VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
+ FuncPtrTy, VoidPtr, VoidPtrPtr, SizeTy)
__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
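
The new FuncPtrTy routes the outlined-function argument of __kmpc_parallel_51 through the module's program address space instead of assuming address space 0. A minimal sketch of how such a type is built, assuming an existing Module M (the helper name is illustrative):

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // On targets where code and data live in different address spaces,
    // DataLayout::getProgramAddressSpace() reports the one used for functions.
    static PointerType *getFunctionPtrTy(Module &M) {
      unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
      return PointerType::get(M.getContext(), ProgramAS);
    }
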
diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index 6529412..f3839c9 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -729,7 +729,8 @@ namespace llvm {
/// \param Subscripts Subscripts.
LLVM_ABI DICompositeType *createVectorType(uint64_t Size,
uint32_t AlignInBits, DIType *Ty,
- DINodeArray Subscripts);
+ DINodeArray Subscripts,
+ Metadata *BitStride = nullptr);
/// Create debugging information entry for an
/// enumeration.
diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h
index 5f7225e..a426fb0 100644
--- a/llvm/include/llvm/IR/DiagnosticInfo.h
+++ b/llvm/include/llvm/IR/DiagnosticInfo.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
@@ -555,6 +556,7 @@ public:
Argument(StringRef Key, bool B) : Key(Key), Val(B ? "true" : "false") {}
LLVM_ABI Argument(StringRef Key, DebugLoc dl);
LLVM_ABI Argument(StringRef Key, InstructionCost C);
+ LLVM_ABI Argument(StringRef Key, BranchProbability P);
};
/// \p PassName is the name of the pass emitting this diagnostic. \p
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 96da698..8856eda 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1983,16 +1983,16 @@ def int_experimental_vector_match : DefaultAttrsIntrinsic<
[ llvm_anyvector_ty,
llvm_anyvector_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], // Mask
- [ IntrNoMem ]>;
+ [ IntrNoMem, IntrSpeculatable ]>;
// Extract based on mask bits
def int_experimental_vector_extract_last_active:
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- LLVMVectorElementType<0>], [IntrNoMem]>;
+ LLVMVectorElementType<0>], [IntrNoMem, IntrSpeculatable]>;
// Operators
-let IntrProperties = [IntrNoMem] in {
+let IntrProperties = [IntrNoMem, IntrSpeculatable] in {
// Integer arithmetic
def int_vp_add : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
[ LLVMMatchType<0>,
@@ -2039,26 +2039,6 @@ let IntrProperties = [IntrNoMem] in {
LLVMMatchType<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_i32_ty]>;
- def int_vp_sdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
- [ LLVMMatchType<0>,
- LLVMMatchType<0>,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_i32_ty]>;
- def int_vp_udiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
- [ LLVMMatchType<0>,
- LLVMMatchType<0>,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_i32_ty]>;
- def int_vp_srem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
- [ LLVMMatchType<0>,
- LLVMMatchType<0>,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_i32_ty]>;
- def int_vp_urem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
- [ LLVMMatchType<0>,
- LLVMMatchType<0>,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_i32_ty]>;
def int_vp_abs : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
[ LLVMMatchType<0>,
llvm_i1_ty,
@@ -2390,7 +2370,29 @@ let IntrProperties = [IntrNoMem] in {
llvm_i32_ty]>;
}
-let IntrProperties = [IntrNoMem, ImmArg<ArgIndex<1>>] in {
+// Integer VP division and remainder: not speculatable.
+def int_vp_sdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty], [IntrNoMem]>;
+def int_vp_udiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty], [IntrNoMem]>;
+def int_vp_srem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty], [IntrNoMem]>;
+def int_vp_urem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty], [IntrNoMem]>;
+
+let IntrProperties = [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>] in {
def int_vp_ctlz : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
[ LLVMMatchType<0>,
llvm_i1_ty,
@@ -2422,18 +2424,18 @@ def int_loop_dependence_war_mask:
def int_get_active_lane_mask:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_anyint_ty, LLVMMatchType<1>],
- [IntrNoMem]>;
+ [IntrNoMem, IntrSpeculatable]>;
def int_experimental_get_vector_length:
DefaultAttrsIntrinsic<[llvm_i32_ty],
[llvm_anyint_ty, llvm_i32_ty, llvm_i1_ty],
- [IntrNoMem,
+ [IntrNoMem, IntrSpeculatable,
ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
def int_experimental_cttz_elts:
DefaultAttrsIntrinsic<[llvm_anyint_ty],
[llvm_anyvector_ty, llvm_i1_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>;
def int_experimental_vp_splice:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
@@ -2442,21 +2444,21 @@ def int_experimental_vp_splice:
llvm_i32_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+ [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]>;
def int_experimental_vp_reverse:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, IntrSpeculatable]>;
def int_experimental_vp_splat:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMVectorElementType<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_i32_ty],
- [IntrNoMem]>;
+ [IntrNoMem, IntrSpeculatable]>;
def int_vp_is_fpclass:
DefaultAttrsIntrinsic<[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
@@ -2753,16 +2755,22 @@ def int_preserve_static_offset : DefaultAttrsIntrinsic<[llvm_ptr_ty],
def int_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>],
- [IntrNoMem]>;
+ [IntrNoMem,
+ IntrSpeculatable]>;
def int_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
LLVMMatchType<0>,
llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+ [IntrNoMem,
+ IntrSpeculatable,
+ ImmArg<ArgIndex<2>>]>;
//===---------- Intrinsics to query properties of scalable vectors --------===//
-def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
+def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty],
+ [],
+ [IntrNoMem,
+ IntrSpeculatable]>;
//===---------- Intrinsics to perform subvector insertion/extraction ------===//
def int_vector_insert : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
@@ -2776,18 +2784,22 @@ def int_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
foreach n = 2...8 in {
def int_vector_interleave#n : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
!listsplat(LLVMOneNthElementsVectorType<0, n>, n),
- [IntrNoMem]>;
+ [IntrNoMem,
+ IntrSpeculatable]>;
def int_vector_deinterleave#n : DefaultAttrsIntrinsic<!listsplat(LLVMOneNthElementsVectorType<0, n>, n),
[llvm_anyvector_ty],
- [IntrNoMem]>;
+ [IntrNoMem,
+ IntrSpeculatable]>;
}
//===-------------- Intrinsics to perform partial reduction ---------------===//
def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
- [llvm_anyvector_ty, llvm_anyvector_ty],
- [IntrNoMem]>;
+ [llvm_anyvector_ty,
+ llvm_anyvector_ty],
+ [IntrNoMem,
+ IntrSpeculatable]>;
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
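
These TableGen changes add IntrSpeculatable to intrinsics whose evaluation cannot trap or produce side effects; the VP integer division and remainder intrinsics are deliberately carved out above because they can trap on a zero divisor. A sketch of how a transform might observe the resulting attribute, assuming an existing instruction (isSafeToSpeculativelyExecute from ValueTracking honors speculatable):

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // True for calls to speculatable intrinsics (e.g. llvm.vscale after this
    // change), false for e.g. llvm.vp.sdiv, so hoisting out of a conditional
    // block is only legal for the former.
    static bool canHoistAboveBranch(const Instruction &I) {
      return isSafeToSpeculativelyExecute(&I);
    }
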
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index b744537..45c889c 100755
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -329,6 +329,7 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
// Look through ptr->int and ptr->ptr casts.
if (CE->getOpcode() == Instruction::PtrToInt ||
+ CE->getOpcode() == Instruction::PtrToAddr ||
CE->getOpcode() == Instruction::BitCast)
return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL,
DSOEquiv);
@@ -1495,22 +1496,22 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
default:
llvm_unreachable("Missing case");
case Instruction::PtrToAddr:
- // TODO: Add some of the ptrtoint folds here as well.
- break;
case Instruction::PtrToInt:
if (auto *CE = dyn_cast<ConstantExpr>(C)) {
Constant *FoldedValue = nullptr;
- // If the input is a inttoptr, eliminate the pair. This requires knowing
+ // If the input is an inttoptr, eliminate the pair. This requires knowing
// the width of a pointer, so it can't be done in ConstantExpr::getCast.
if (CE->getOpcode() == Instruction::IntToPtr) {
- // zext/trunc the inttoptr to pointer size.
- FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0),
- DL.getIntPtrType(CE->getType()),
+ // zext/trunc the inttoptr to pointer/address size.
+        Type *MidTy = Opcode == Instruction::PtrToAddr
+ ? DL.getAddressType(CE->getType())
+ : DL.getIntPtrType(CE->getType());
+ FoldedValue = ConstantFoldIntegerCast(CE->getOperand(0), MidTy,
/*IsSigned=*/false, DL);
} else if (auto *GEP = dyn_cast<GEPOperator>(CE)) {
// If we have GEP, we can perform the following folds:
- // (ptrtoint (gep null, x)) -> x
- // (ptrtoint (gep (gep null, x), y) -> x + y, etc.
+ // (ptrtoint/ptrtoaddr (gep null, x)) -> x
+ // (ptrtoint/ptrtoaddr (gep (gep null, x), y) -> x + y, etc.
unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
APInt BaseOffset(BitWidth, 0);
auto *Base = cast<Constant>(GEP->stripAndAccumulateConstantOffsets(
@@ -1518,7 +1519,8 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
if (Base->isNullValue()) {
FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset);
} else {
- // ptrtoint (gep i8, Ptr, (sub 0, V)) -> sub (ptrtoint Ptr), V
+ // ptrtoint/ptrtoaddr (gep i8, Ptr, (sub 0, V))
+ // -> sub (ptrtoint/ptrtoaddr Ptr), V
if (GEP->getNumIndices() == 1 &&
GEP->getSourceElementType()->isIntegerTy(8)) {
auto *Ptr = cast<Constant>(GEP->getPointerOperand());
@@ -1528,12 +1530,13 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
Sub->getOpcode() == Instruction::Sub &&
Sub->getOperand(0)->isNullValue())
FoldedValue = ConstantExpr::getSub(
- ConstantExpr::getPtrToInt(Ptr, IntIdxTy), Sub->getOperand(1));
+ ConstantExpr::getCast(Opcode, Ptr, IntIdxTy),
+ Sub->getOperand(1));
}
}
}
if (FoldedValue) {
- // Do a zext or trunc to get to the ptrtoint dest size.
+ // Do a zext or trunc to get to the ptrtoint/ptrtoaddr dest size.
return ConstantFoldIntegerCast(FoldedValue, DestTy, /*IsSigned=*/false,
DL);
}
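
The ptrtoint folds are extended to ptrtoaddr, with the intermediate width taken from the address type rather than the pointer-sized integer type. A hedged sketch of driving the fold through the public API (the helper and its operands are illustrative, and assume a DataLayout where pointer and address widths may differ):

    #include "llvm/Analysis/ConstantFolding.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // Folds ptrtoaddr (inttoptr X) into zext/trunc of X at the address width
    // of PtrTy, then to DestTy, eliminating the cast pair.
    static Constant *foldPtrToAddrOfIntToPtr(Constant *IntVal, PointerType *PtrTy,
                                             Type *DestTy, const DataLayout &DL) {
      Constant *P = ConstantExpr::getIntToPtr(IntVal, PtrTy);
      return ConstantFoldCastOperand(Instruction::PtrToAddr, P, DestTy, DL);
    }
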
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 4c2e1fe..54f55b2 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -812,7 +812,9 @@ static bool isPointerUseReplacable(const Use &U) {
auto *User = Worklist.pop_back_val();
if (!Visited.insert(User).second)
continue;
- if (isa<ICmpInst, PtrToIntInst>(User))
+ // FIXME: The PtrToIntInst case here is not strictly correct, as it
+ // changes which provenance is exposed.
+ if (isa<ICmpInst, PtrToIntInst, PtrToAddrInst>(User))
continue;
if (isa<PHINode, SelectInst>(User))
Worklist.append(User->user_begin(), User->user_end());
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 6f6776c..30bcff7 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -15749,51 +15749,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
return RewriteMap.lookup_or(S, S);
};
- // Check for the SCEV expression (A /u B) * B while B is a constant, inside
- // \p Expr. The check is done recuresively on \p Expr, which is assumed to
- // be a composition of Min/Max SCEVs. Return whether the SCEV expression (A
- // /u B) * B was found, and return the divisor B in \p DividesBy. For
- // example, if Expr = umin (umax ((A /u 8) * 8, 16), 64), return true since
- // (A /u 8) * 8 matched the pattern, and return the constant SCEV 8 in \p
- // DividesBy.
- std::function<bool(const SCEV *, const SCEV *&)> HasDivisibiltyInfo =
- [&](const SCEV *Expr, const SCEV *&DividesBy) {
- if (auto *Mul = dyn_cast<SCEVMulExpr>(Expr)) {
- if (Mul->getNumOperands() != 2)
- return false;
- auto *MulLHS = Mul->getOperand(0);
- auto *MulRHS = Mul->getOperand(1);
- if (isa<SCEVConstant>(MulLHS))
- std::swap(MulLHS, MulRHS);
- if (auto *Div = dyn_cast<SCEVUDivExpr>(MulLHS))
- if (Div->getOperand(1) == MulRHS) {
- DividesBy = MulRHS;
- return true;
- }
- }
- if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr))
- return HasDivisibiltyInfo(MinMax->getOperand(0), DividesBy) ||
- HasDivisibiltyInfo(MinMax->getOperand(1), DividesBy);
- return false;
- };
-
- // Return true if Expr known to divide by \p DividesBy.
- std::function<bool(const SCEV *, const SCEV *&)> IsKnownToDivideBy =
- [&](const SCEV *Expr, const SCEV *DividesBy) {
- if (SE.getURemExpr(Expr, DividesBy)->isZero())
- return true;
- if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr))
- return IsKnownToDivideBy(MinMax->getOperand(0), DividesBy) &&
- IsKnownToDivideBy(MinMax->getOperand(1), DividesBy);
- return false;
- };
-
const SCEV *RewrittenLHS = GetMaybeRewritten(LHS);
const SCEV *DividesBy = nullptr;
- if (HasDivisibiltyInfo(RewrittenLHS, DividesBy))
- // Check that the whole expression is divided by DividesBy
- DividesBy =
- IsKnownToDivideBy(RewrittenLHS, DividesBy) ? DividesBy : nullptr;
+ const APInt &Multiple = SE.getConstantMultiple(RewrittenLHS);
+ if (!Multiple.isOne())
+ DividesBy = SE.getConstant(Multiple);
// Collect rewrites for LHS and its transitive operands based on the
// condition.
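
The removed recursive helpers are subsumed by ScalarEvolution's getConstantMultiple, which returns the largest constant the expression is known to be a multiple of (and handles min/max expressions internally). A sketch of equivalent use, assuming a ScalarEvolution &SE (helper name illustrative):

    #include "llvm/Analysis/ScalarEvolution.h"

    using namespace llvm;

    // For Expr = umin(umax((A /u 8) * 8, 16), 64) this yields the SCEV
    // constant 8; a multiple of 1 means "no divisibility information".
    static const SCEV *getKnownDivisor(ScalarEvolution &SE, const SCEV *Expr) {
      const APInt &Multiple = SE.getConstantMultiple(Expr);
      return Multiple.isOne() ? nullptr : SE.getConstant(Multiple);
    }
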
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 3a9651c..89ed4da 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -110,6 +110,7 @@ STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
+STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
@@ -193,6 +194,13 @@ static cl::opt<bool>
MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
cl::desc("Use the MVE code generator for software pipelining"));
+/// A command line argument to limit the number of store instructions in the
+/// target basic block.
+static cl::opt<unsigned> SwpMaxNumStores(
+ "pipeliner-max-num-stores",
+ cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
+ cl::init(200));
+
namespace llvm {
// A command line option to enable the CopyToPhi DAG mutation.
@@ -544,6 +552,23 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
return false;
}
+ unsigned NumStores = 0;
+ for (MachineInstr &MI : *L.getHeader())
+ if (MI.mayStore())
+ ++NumStores;
+ if (NumStores > SwpMaxNumStores) {
+ LLVM_DEBUG(dbgs() << "Too many stores\n");
+ NumFailTooManyStores++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Too many store instructions in the loop: "
+ << ore::NV("NumStores", NumStores) << " > "
+ << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
+ });
+ return false;
+ }
+
// Remove any subregisters from inputs to phi nodes.
preprocessPhiNodes(*L.getHeader());
return true;
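
The new cap rejects loops whose header holds more than pipeliner-max-num-stores (default 200) store instructions before any scheduling work is done, bounding the time spent on memory-dependence analysis. The counting step, restated in isolation (mirrors the patch, assuming a loop header block):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstr.h"

    using namespace llvm;

    // Counts instructions that may write to memory; canPipelineLoop compares
    // this against the -pipeliner-max-num-stores threshold and bails out.
    static unsigned countStores(const MachineBasicBlock &Header) {
      unsigned NumStores = 0;
      for (const MachineInstr &MI : Header)
        if (MI.mayStore())
          ++NumStores;
      return NumStores;
    }
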
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5980ee3..286ed03 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3623,7 +3623,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
// 1. Build a list of reduction variables.
// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
auto Size = ReductionInfos.size();
- Type *PtrTy = PointerType::getUnqual(Ctx);
+ Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
+ Type *FuncPtrTy =
+ Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
Type *RedArrayTy = ArrayType::get(PtrTy, Size);
CodeGenIP = Builder.saveIP();
Builder.restoreIP(AllocaIP);
@@ -3667,9 +3669,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
Builder.getInt64(MaxDataSize * ReductionInfos.size());
if (!IsTeamsReduction) {
Value *SarFuncCast =
- Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
+ Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy);
Value *WcFuncCast =
- Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
+ Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
WcFuncCast};
Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
@@ -10072,13 +10074,14 @@ void OpenMPIRBuilder::initializeTypes(Module &M) {
LLVMContext &Ctx = M.getContext();
StructType *T;
unsigned DefaultTargetAS = Config.getDefaultTargetAS();
+ unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
- VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
+ VarName##Ptr = PointerType::get(Ctx, ProgramAS);
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
T = StructType::getTypeByName(Ctx, StructName); \
if (!T) \
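
Only the OMP_FUNCTION_TYPE pointer types move to the program address space; data pointers keep the default target address space. The same distinction shows up in createReductionsGPU, where the shuffle/reduce function addresses are cast before being passed to the runtime. A sketch of that cast, assuming an IRBuilder<> and an outlined Function (names illustrative):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Cast a function's address into the program address space expected by
    // the runtime signature; a no-op when the address spaces already match.
    static Value *castToRuntimeFnPtr(IRBuilder<> &Builder, Function *Fn) {
      Type *FuncPtrTy = Builder.getPtrTy(
          Fn->getParent()->getDataLayout().getProgramAddressSpace());
      return Builder.CreatePointerBitCastOrAddrSpaceCast(Fn, FuncPtrTy);
    }
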
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 1ae20a9f..07a870f 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -715,11 +715,20 @@ DICompositeType *DIBuilder::createArrayType(
DICompositeType *DIBuilder::createVectorType(uint64_t Size,
uint32_t AlignInBits, DIType *Ty,
- DINodeArray Subscripts) {
- auto *R = DICompositeType::get(VMContext, dwarf::DW_TAG_array_type, "",
- nullptr, 0, nullptr, Ty, Size, AlignInBits, 0,
- DINode::FlagVector, Subscripts, 0,
- /*EnumKind=*/std::nullopt, nullptr);
+ DINodeArray Subscripts,
+ Metadata *BitStride) {
+ auto *R = DICompositeType::get(
+ VMContext, dwarf::DW_TAG_array_type, /*Name=*/"",
+ /*File=*/nullptr, /*Line=*/0, /*Scope=*/nullptr, /*BaseType=*/Ty,
+ /*SizeInBits=*/Size, /*AlignInBits=*/AlignInBits, /*OffsetInBits=*/0,
+ /*Flags=*/DINode::FlagVector, /*Elements=*/Subscripts,
+ /*RuntimeLang=*/0, /*EnumKind=*/std::nullopt, /*VTableHolder=*/nullptr,
+ /*TemplateParams=*/nullptr, /*Identifier=*/"",
+ /*Discriminator=*/nullptr, /*DataLocation=*/nullptr,
+ /*Associated=*/nullptr, /*Allocated=*/nullptr, /*Rank=*/nullptr,
+ /*Annotations=*/nullptr, /*Specification=*/nullptr,
+ /*NumExtraInhabitants=*/0,
+ /*BitStride=*/BitStride);
trackIfUnresolved(R);
return R;
}
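
createVectorType gains a defaulted BitStride operand, forwarded into DICompositeType for vectors whose elements are strided rather than tightly packed. A hedged sketch of a caller, assuming a DIBuilder and an element type; encoding the stride as a ConstantAsMetadata-wrapped i64 is an assumption here, and the sizes are illustrative:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DIBuilder.h"
    #include "llvm/IR/Metadata.h"

    using namespace llvm;

    // A 16-bit vector debug type of 16 elements, each one bit apart
    // (the shape needed for predicate-like vectors).
    static DICompositeType *makeStridedVectorDI(DIBuilder &DIB, LLVMContext &Ctx,
                                                DIType *ElemTy) {
      DINodeArray Subs =
          DIB.getOrCreateArray({DIB.getOrCreateSubrange(/*Lo=*/0, /*Count=*/16)});
      Metadata *BitStride =
          ConstantAsMetadata::get(ConstantInt::get(Type::getInt64Ty(Ctx), 1));
      return DIB.createVectorType(/*Size=*/16, /*AlignInBits=*/16, ElemTy, Subs,
                                  BitStride);
    }
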
diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index 4f37624..8e6d654 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -273,6 +273,13 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
C.print(OS);
}
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
+ BranchProbability P)
+ : Key(std::string(Key)) {
+ raw_string_ostream OS(Val);
+ P.print(OS);
+}
+
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc)
: Key(std::string(Key)), Loc(Loc) {
if (Loc) {
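
The new constructor stringifies the probability with BranchProbability::print, so remark arguments can carry probabilities directly through ore::NV. A sketch of a caller, assuming an OptimizationRemarkEmitter and a Function (pass and remark names are illustrative):

    #include "llvm/Analysis/OptimizationRemarkEmitter.h"
    #include "llvm/Support/BranchProbability.h"

    using namespace llvm;

    // Prob renders as a percentage (e.g. "50.00%") in the remark output.
    static void emitColdCallRemark(OptimizationRemarkEmitter &ORE, Function *F,
                                   BranchProbability Prob) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed("my-pass", "ColdCall", F)
               << "call probability is " << ore::NV("Prob", Prob);
      });
    }
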
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 91e64e6..bd0a17d 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -315,6 +315,8 @@ public:
}
void setStackSizeSVE(uint64_t ZPR, uint64_t PPR) {
+ assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) &&
+ "expected SVE stack sizes to be aligned to 16-bytes");
StackSizeZPR = ZPR;
StackSizePPR = PPR;
HasCalculatedStackSizeSVE = true;
@@ -425,6 +427,8 @@ public:
// Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes'
void setSVECalleeSavedStackSize(unsigned ZPR, unsigned PPR) {
+ assert(isAligned(Align(16), ZPR) && isAligned(Align(16), PPR) &&
+ "expected SVE callee-save sizes to be aligned to 16-bytes");
ZPRCalleeSavedStackSize = ZPR;
PPRCalleeSavedStackSize = PPR;
HasSVECalleeSavedStackSize = true;
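
Both setters now assert that the SVE size bookkeeping is 16-byte aligned, so mismatched accounting is caught at the point of recording rather than as a corrupt frame later. isAligned comes from llvm/Support/Alignment.h; a minimal sketch:

    #include "llvm/Support/Alignment.h"
    #include <cassert>

    using namespace llvm;

    // SVE save areas are kept 16-byte aligned so that stack-pointer
    // adjustments derived from them stay aligned as well.
    static void recordSVESize(uint64_t Bytes) {
      assert(isAligned(Align(16), Bytes) && "size must be 16-byte aligned");
      (void)Bytes; // Silence unused-variable warnings when asserts are off.
    }
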
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 1568161..f110558 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -60,7 +60,6 @@ static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) {
case AArch64::PTRUE_C_B:
return I->getFlag(MachineInstr::FrameSetup) ||
I->getFlag(MachineInstr::FrameDestroy);
- case AArch64::SEH_SavePReg:
case AArch64::SEH_SaveZReg:
return true;
}
@@ -75,6 +74,8 @@ static bool isPartOfPPRCalleeSaves(MachineBasicBlock::iterator I) {
case AArch64::LDR_PXI:
return I->getFlag(MachineInstr::FrameSetup) ||
I->getFlag(MachineInstr::FrameDestroy);
+ case AArch64::SEH_SavePReg:
+ return true;
}
}
@@ -94,6 +95,26 @@ AArch64PrologueEpilogueCommon::AArch64PrologueEpilogueCommon(
HasFP = AFL.hasFP(MF);
NeedsWinCFI = AFL.needsWinCFI(MF);
+
+ // Windows unwind can't represent the required stack adjustments if we have
+ // both SVE callee-saves and dynamic stack allocations, and the frame pointer
+ // is before the SVE spills. The allocation of the frame pointer must be the
+ // last instruction in the prologue so the unwinder can restore the stack
+ // pointer correctly. (And there isn't any unwind opcode for `addvl sp, x29,
+ // -17`.)
+ //
+ // Because of this, we do spills in the opposite order on Windows: first SVE,
+ // then GPRs. The main side-effect of this is that it makes accessing
+ // parameters passed on the stack more expensive.
+ //
+ // We could consider rearranging the spills for simpler cases.
+ if (Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize()) {
+ if (AFI->hasStackHazardSlotIndex())
+ reportFatalUsageError("SME hazard padding is not supported on Windows");
+ SVELayout = SVEStackLayout::CalleeSavesAboveFrameRecord;
+ } else if (AFI->hasSplitSVEObjects()) {
+ SVELayout = SVEStackLayout::Split;
+ }
}
MachineBasicBlock::iterator
@@ -334,6 +355,55 @@ bool AArch64PrologueEpilogueCommon::shouldCombineCSRLocalStackBump(
return true;
}
+SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const {
+ StackOffset PPRCalleeSavesSize =
+ StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
+ StackOffset ZPRCalleeSavesSize =
+ StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
+ StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize;
+ StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize;
+ if (SVELayout == SVEStackLayout::Split)
+ return {{PPRCalleeSavesSize, PPRLocalsSize},
+ {ZPRCalleeSavesSize, ZPRLocalsSize}};
+ // For simplicity, attribute all locals to ZPRs when split SVE is disabled.
+ return {{PPRCalleeSavesSize, StackOffset{}},
+ {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}};
+}
+
+struct SVEPartitions {
+ struct {
+ MachineBasicBlock::iterator Begin, End;
+ } PPR, ZPR;
+};
+
+static SVEPartitions partitionSVECS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ StackOffset PPRCalleeSavesSize,
+ StackOffset ZPRCalleeSavesSize,
+ bool IsEpilogue) {
+ MachineBasicBlock::iterator PPRsI = MBBI;
+ MachineBasicBlock::iterator End =
+ IsEpilogue ? MBB.begin() : MBB.getFirstTerminator();
+ auto AdjustI = [&](auto MBBI) { return IsEpilogue ? std::prev(MBBI) : MBBI; };
+ // Process the SVE CS to find the starts/ends of the ZPR and PPR areas.
+ if (PPRCalleeSavesSize) {
+ PPRsI = AdjustI(PPRsI);
+ assert(isPartOfPPRCalleeSaves(*PPRsI) && "Unexpected instruction");
+ while (PPRsI != End && isPartOfPPRCalleeSaves(AdjustI(PPRsI)))
+ IsEpilogue ? (--PPRsI) : (++PPRsI);
+ }
+ MachineBasicBlock::iterator ZPRsI = PPRsI;
+ if (ZPRCalleeSavesSize) {
+ ZPRsI = AdjustI(ZPRsI);
+ assert(isPartOfZPRCalleeSaves(*ZPRsI) && "Unexpected instruction");
+ while (ZPRsI != End && isPartOfZPRCalleeSaves(AdjustI(ZPRsI)))
+ IsEpilogue ? (--ZPRsI) : (++ZPRsI);
+ }
+ if (IsEpilogue)
+ return {{PPRsI, MBBI}, {ZPRsI, PPRsI}};
+ return {{MBBI, PPRsI}, {PPRsI, ZPRsI}};
+}
+
AArch64PrologueEmitter::AArch64PrologueEmitter(MachineFunction &MF,
MachineBasicBlock &MBB,
const AArch64FrameLowering &AFL)
@@ -613,30 +683,12 @@ void AArch64PrologueEmitter::emitPrologue() {
bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
- // Windows unwind can't represent the required stack adjustments if we have
- // both SVE callee-saves and dynamic stack allocations, and the frame
- // pointer is before the SVE spills. The allocation of the frame pointer
- // must be the last instruction in the prologue so the unwinder can restore
- // the stack pointer correctly. (And there isn't any unwind opcode for
- // `addvl sp, x29, -17`.)
- //
- // Because of this, we do spills in the opposite order on Windows: first SVE,
- // then GPRs. The main side-effect of this is that it makes accessing
- // parameters passed on the stack more expensive.
- //
- // We could consider rearranging the spills for simpler cases.
- bool FPAfterSVECalleeSaves =
- Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
-
- if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex())
- reportFatalUsageError("SME hazard padding is not supported on Windows");
-
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// All of the remaining stack allocations are for locals.
determineLocalsStackSize(NumBytes, PrologueSaveSize);
MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI;
- if (FPAfterSVECalleeSaves) {
+ if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
// If we're doing SVE saves first, we need to immediately allocate space
// for fixed objects, then space for the SVE callee saves.
//
@@ -712,110 +764,66 @@ void AArch64PrologueEmitter::emitPrologue() {
if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding))
emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding);
- StackOffset PPRCalleeSavesSize =
- StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
- StackOffset ZPRCalleeSavesSize =
- StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
- StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize;
- StackOffset PPRLocalsSize = AFL.getPPRStackSize(MF) - PPRCalleeSavesSize;
- StackOffset ZPRLocalsSize = AFL.getZPRStackSize(MF) - ZPRCalleeSavesSize;
-
- std::optional<MachineBasicBlock::iterator> ZPRCalleeSavesBegin,
- ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd;
-
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
+ StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes);
StackOffset CFAOffset =
- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
+ StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize;
+
MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI;
- if (!FPAfterSVECalleeSaves) {
- // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR
- // areas.
- PPRCalleeSavesBegin = AfterGPRSavesI;
- if (PPRCalleeSavesSize) {
- LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = "
- << PPRCalleeSavesSize.getScalable() << "\n");
-
- assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) &&
- "Unexpected instruction");
- while (isPartOfPPRCalleeSaves(AfterSVESavesI) &&
- AfterSVESavesI != MBB.getFirstTerminator())
- ++AfterSVESavesI;
+ // Allocate space for the callee saves and PPR locals (if any).
+ if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ auto [PPRRange, ZPRRange] =
+ partitionSVECS(MBB, AfterGPRSavesI, PPR.CalleeSavesSize,
+ ZPR.CalleeSavesSize, /*IsEpilogue=*/false);
+ AfterSVESavesI = ZPRRange.End;
+ if (EmitAsyncCFI)
+ emitCalleeSavedSVELocations(AfterSVESavesI);
+
+ StackOffset AllocateBeforePPRs = SVECalleeSavesSize;
+ StackOffset AllocateAfterPPRs = PPR.LocalsSize;
+ if (SVELayout == SVEStackLayout::Split) {
+ AllocateBeforePPRs = PPR.CalleeSavesSize;
+ AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize;
}
- PPRCalleeSavesEnd = ZPRCalleeSavesBegin = AfterSVESavesI;
- if (ZPRCalleeSavesSize) {
- LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = "
- << ZPRCalleeSavesSize.getScalable() << "\n");
- assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) &&
- "Unexpected instruction");
- while (isPartOfZPRCalleeSaves(AfterSVESavesI) &&
- AfterSVESavesI != MBB.getFirstTerminator())
- ++AfterSVESavesI;
- }
- ZPRCalleeSavesEnd = AfterSVESavesI;
- }
-
- if (EmitAsyncCFI)
- emitCalleeSavedSVELocations(AfterSVESavesI);
-
- if (AFI->hasSplitSVEObjects()) {
- assert(!FPAfterSVECalleeSaves &&
- "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects");
- assert(!AFL.canUseRedZone(MF) &&
- "Cannot use redzone with aarch64-split-sve-objects");
- // TODO: Handle HasWinCFI/NeedsWinCFI?
- assert(!NeedsWinCFI &&
- "WinCFI with aarch64-split-sve-objects is not supported");
-
- // Split ZPR and PPR allocation.
- // Allocate PPR callee saves
- allocateStackSpace(*PPRCalleeSavesBegin, 0, PPRCalleeSavesSize,
+ allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs,
EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || ZPRCalleeSavesSize ||
- ZPRLocalsSize || PPRLocalsSize);
- CFAOffset += PPRCalleeSavesSize;
-
- // Allocate PPR locals + ZPR callee saves
- assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin &&
+ MFI.hasVarSizedObjects() || AllocateAfterPPRs ||
+ ZPR.LocalsSize || NonSVELocalsSize);
+ CFAOffset += AllocateBeforePPRs;
+ assert(PPRRange.End == ZPRRange.Begin &&
"Expected ZPR callee saves after PPR locals");
- allocateStackSpace(*PPRCalleeSavesEnd, RealignmentPadding,
- PPRLocalsSize + ZPRCalleeSavesSize,
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || ZPRLocalsSize);
- CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize;
-
- // Allocate ZPR locals
- allocateStackSpace(*ZPRCalleeSavesEnd, RealignmentPadding,
- ZPRLocalsSize + StackOffset::getFixed(NumBytes),
+ allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs,
EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects());
+ MFI.hasVarSizedObjects() || ZPR.LocalsSize ||
+ NonSVELocalsSize);
+ CFAOffset += AllocateAfterPPRs;
} else {
- // Allocate space for the callee saves (if any).
- StackOffset LocalsSize =
- PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes);
- if (!FPAfterSVECalleeSaves)
- allocateStackSpace(AfterGPRSavesI, 0, SVECalleeSavesSize,
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects() || LocalsSize);
+ assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord);
+    // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been
+    // allocated (separate PPR locals are not supported; all SVE locals, both
+    // PPR and ZPR, are within the ZPR locals area).
+ assert(!PPR.LocalsSize && "Unexpected PPR locals!");
CFAOffset += SVECalleeSavesSize;
+ }
- // Allocate space for the rest of the frame including SVE locals. Align the
- // stack as necessary.
- assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
- "Cannot use redzone with stack realignment");
- if (!AFL.canUseRedZone(MF)) {
- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
- // the correct value here, as NumBytes also includes padding bytes,
- // which shouldn't be counted here.
- StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize;
- allocateStackSpace(AfterSVESavesI, RealignmentPadding,
- SVELocalsSize + StackOffset::getFixed(NumBytes),
- EmitAsyncCFI && !HasFP, CFAOffset,
- MFI.hasVarSizedObjects());
- }
+ // Allocate space for the rest of the frame including ZPR locals. Align the
+ // stack as necessary.
+ assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
+ "Cannot use redzone with stack realignment");
+ if (!AFL.canUseRedZone(MF)) {
+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the
+ // correct value here, as NumBytes also includes padding bytes, which
+ // shouldn't be counted here.
+ allocateStackSpace(
+ AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize,
+ EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects());
}
// If we need a base pointer, set it up here. It's whatever the value of the
- // stack pointer is at this point. Any variable size objects will be allocated
- // after this, so we can still use the base pointer to reference locals.
+ // stack pointer is at this point. Any variable size objects will be
+ // allocated after this, so we can still use the base pointer to reference
+ // locals.
//
// FIXME: Clarify FrameSetup flags here.
// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
@@ -1270,7 +1278,9 @@ void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
StackOffset::getScalable(MFI.getObjectOffset(FI)) -
StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
- if (AFI->hasSplitSVEObjects() &&
+  // With split SVE objects, the scalable vectors are below (at a lower
+  // address than) the scalable predicates, so we must subtract the size of
+  // the predicates.
+ if (SVELayout == SVEStackLayout::Split &&
MFI.getStackID(FI) == TargetStackID::ScalableVector)
Offset -= PPRStackSize;
@@ -1349,13 +1359,10 @@ void AArch64EpilogueEmitter::emitEpilogue() {
return;
}
- bool FPAfterSVECalleeSaves =
- Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
-
bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes);
// Assume we can't combine the last pop with the sp restore.
bool CombineAfterCSRBump = false;
- if (FPAfterSVECalleeSaves) {
+ if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
AfterCSRPopSize += FixedObject;
} else if (!CombineSPBump && PrologueSaveSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
@@ -1390,7 +1397,8 @@ void AArch64EpilogueEmitter::emitEpilogue() {
while (FirstGPRRestoreI != Begin) {
--FirstGPRRestoreI;
if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) ||
- (!FPAfterSVECalleeSaves && isPartOfSVECalleeSaves(FirstGPRRestoreI))) {
+ (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord &&
+ isPartOfSVECalleeSaves(FirstGPRRestoreI))) {
++FirstGPRRestoreI;
break;
} else if (CombineSPBump)
@@ -1414,13 +1422,9 @@ void AArch64EpilogueEmitter::emitEpilogue() {
if (HasFP && AFI->hasSwiftAsyncContext())
emitSwiftAsyncContextFramePointer(EpilogueEndI, DL);
- StackOffset ZPRStackSize = AFL.getZPRStackSize(MF);
- StackOffset PPRStackSize = AFL.getPPRStackSize(MF);
- StackOffset SVEStackSize = ZPRStackSize + PPRStackSize;
-
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
- assert(!SVEStackSize && "Cannot combine SP bump with SVE");
+ assert(!AFI->hasSVEStackSize() && "Cannot combine SP bump with SVE");
// When we are about to restore the CSRs, the CFA register is SP again.
if (EmitCFI && HasFP)
@@ -1437,188 +1441,122 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
- if (!AFI->hasSplitSVEObjects()) {
- // Process the SVE callee-saves to determine what space needs to be
- // deallocated.
- StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
- MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI,
- RestoreEnd = FirstGPRRestoreI;
- int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize();
- int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize();
- int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize;
-
- if (SVECalleeSavedSize) {
- if (FPAfterSVECalleeSaves)
- RestoreEnd = MBB.getFirstTerminator();
-
- RestoreBegin = std::prev(RestoreEnd);
- while (RestoreBegin != MBB.begin() &&
- isPartOfSVECalleeSaves(std::prev(RestoreBegin)))
- --RestoreBegin;
-
- assert(isPartOfSVECalleeSaves(RestoreBegin) &&
- isPartOfSVECalleeSaves(std::prev(RestoreEnd)) &&
- "Unexpected instruction");
-
- StackOffset CalleeSavedSizeAsOffset =
- StackOffset::getScalable(SVECalleeSavedSize);
- DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
- DeallocateAfter = CalleeSavedSizeAsOffset;
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ auto [PPRRange, ZPRRange] = partitionSVECS(
+ MBB,
+ SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
+ ? MBB.getFirstTerminator()
+ : FirstGPRRestoreI,
+ PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
+
+ StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
+ StackOffset SVEStackSize =
+ SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
+ MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin;
+ MachineBasicBlock::iterator RestoreEnd = PPRRange.End;
+
+ // Deallocate the SVE area.
+ if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize;
+ // If the callee-save area is before FP, restoring the FP implicitly
+ // deallocates non-callee-save SVE allocations. Otherwise, deallocate them
+ // explicitly.
+ if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
+ emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
+ SVELocalsSize, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI);
}
- // Deallocate the SVE area.
- if (FPAfterSVECalleeSaves) {
- // If the callee-save area is before FP, restoring the FP implicitly
- // deallocates non-callee-save SVE allocations. Otherwise, deallocate
- // them explicitly.
- if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
- emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
- DeallocateBefore, TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI, &HasWinCFI);
- }
+ // Deallocate callee-save non-SVE registers.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
- // Deallocate callee-save non-SVE registers.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(AFI->getCalleeSavedStackSize()),
- TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI);
-
- // Deallocate fixed objects.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(FixedObject), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI);
-
- // Deallocate callee-save SVE registers.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI);
- } else if (SVEStackSize) {
- int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize();
- // If we have stack realignment or variable-sized objects we must use the
- // FP to restore SVE callee saves (as there is an unknown amount of
- // data/padding between the SP and SVE CS area).
- Register BaseForSVEDealloc =
- (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
- : AArch64::SP;
- if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) {
- Register CalleeSaveBase = AArch64::FP;
- if (int64_t CalleeSaveBaseOffset =
- AFI->getCalleeSaveBaseToFrameRecordOffset()) {
- // If we have have an non-zero offset to the non-SVE CS base we need
- // to compute the base address by subtracting the offest in a
- // temporary register first (to avoid briefly deallocating the SVE
- // CS).
- CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
- &AArch64::GPR64RegClass);
- emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
- StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
- MachineInstr::FrameDestroy);
- }
- // The code below will deallocate the stack space space by moving the
- // SP to the start of the SVE callee-save area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
- StackOffset::getScalable(-SVECalleeSavedSize), TII,
+ // Deallocate fixed objects.
+ emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(FixedObject), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
+
+ // Deallocate callee-save SVE registers.
+ emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI);
+ } else if (AFI->hasSVEStackSize()) {
+    // If we have stack realignment or variable-sized objects, we must use the
+    // FP to restore SVE callee saves (as there is an unknown amount of
+ // to restore SVE callee saves (as there is an unknown amount of
+ // data/padding between the SP and SVE CS area).
+ Register BaseForSVEDealloc =
+ (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
+ : AArch64::SP;
+ if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) {
+      // TODO: Support stack realignment and variable-sized objects.
+ assert(
+ SVELayout != SVEStackLayout::Split &&
+ "unexpected stack realignment or variable sized objects with split "
+ "SVE stack objects");
+
+ Register CalleeSaveBase = AArch64::FP;
+ if (int64_t CalleeSaveBaseOffset =
+ AFI->getCalleeSaveBaseToFrameRecordOffset()) {
+      // If we have a non-zero offset to the non-SVE CS base we need to
+      // compute the base address by subtracting the offset in a temporary
+ // register first (to avoid briefly deallocating the SVE CS).
+ CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
+ emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
+ StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
MachineInstr::FrameDestroy);
- } else if (BaseForSVEDealloc == AArch64::SP) {
- if (SVECalleeSavedSize) {
- // Deallocate the non-SVE locals first before we can deallocate (and
- // restore callee saves) from the SVE area.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(NumBytes), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI,
- &HasWinCFI, EmitCFI && !HasFP,
- SVEStackSize + StackOffset::getFixed(
- NumBytes + PrologueSaveSize));
- NumBytes = 0;
- }
-
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- DeallocateBefore, TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
- SVEStackSize +
- StackOffset::getFixed(NumBytes + PrologueSaveSize));
-
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP,
- DeallocateAfter +
- StackOffset::getFixed(NumBytes + PrologueSaveSize));
+ }
+      // The code below will deallocate the stack space by moving the SP
+ // to the start of the SVE callee-save area.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
+ -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy);
+ } else if (BaseForSVEDealloc == AArch64::SP) {
+ auto CFAOffset =
+ SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
+
+ if (SVECalleeSavesSize) {
+ // Deallocate the non-SVE locals first before we can deallocate (and
+ // restore callee saves) from the SVE area.
+ auto NonSVELocals = StackOffset::getFixed(NumBytes);
+ emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP,
+ NonSVELocals, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= NonSVELocals;
+ NumBytes = 0;
}
- if (EmitCFI)
- emitCalleeSavedSVERestores(RestoreEnd);
- }
- } else if (AFI->hasSplitSVEObjects() && SVEStackSize) {
- // TODO: Support stack realigment and variable-sized objects.
- assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() &&
- "unexpected stack realignment or variable sized objects with split "
- "SVE stack objects");
- // SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR
- // areas.
- auto ZPRCalleeSavedSize =
- StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize());
- auto PPRCalleeSavedSize =
- StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize());
- StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize;
- StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize;
-
- MachineBasicBlock::iterator PPRRestoreBegin = FirstGPRRestoreI,
- PPRRestoreEnd = FirstGPRRestoreI;
- if (PPRCalleeSavedSize) {
- PPRRestoreBegin = std::prev(PPRRestoreEnd);
- while (PPRRestoreBegin != MBB.begin() &&
- isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin)))
- --PPRRestoreBegin;
- }
-
- MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin,
- ZPRRestoreEnd = PPRRestoreBegin;
- if (ZPRCalleeSavedSize) {
- ZPRRestoreBegin = std::prev(ZPRRestoreEnd);
- while (ZPRRestoreBegin != MBB.begin() &&
- isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin)))
- --ZPRRestoreBegin;
- }
-
- auto CFAOffset =
- SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);
- if (PPRCalleeSavedSize || ZPRCalleeSavedSize) {
- // Deallocate the non-SVE locals first before we can deallocate (and
- // restore callee saves) from the SVE area.
- auto NonSVELocals = StackOffset::getFixed(NumBytes);
- emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
- NonSVELocals, TII, MachineInstr::FrameDestroy, false,
- false, nullptr, EmitCFI && !HasFP, CFAOffset);
- NumBytes = 0;
- CFAOffset -= NonSVELocals;
- }
+ if (ZPR.LocalsSize) {
+ emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP,
+ ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= ZPR.LocalsSize;
+ }
- if (ZPRLocalsSize) {
- emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
- ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false,
- false, nullptr, EmitCFI && !HasFP, CFAOffset);
- CFAOffset -= ZPRLocalsSize;
- }
+ StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize;
+ if (SVELayout == SVEStackLayout::Split &&
+ (PPR.LocalsSize || ZPR.CalleeSavesSize)) {
+ assert(PPRRange.Begin == ZPRRange.End &&
+ "Expected PPR restores after ZPR");
+ emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP,
+ PPR.LocalsSize + ZPR.CalleeSavesSize, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
+ CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize;
+ SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize;
+ }
- if (PPRLocalsSize || ZPRCalleeSavedSize) {
- assert(PPRRestoreBegin == ZPRRestoreEnd &&
- "Expected PPR restores after ZPR");
- emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP,
- PPRLocalsSize + ZPRCalleeSavedSize, TII,
- MachineInstr::FrameDestroy, false, false, nullptr,
- EmitCFI && !HasFP, CFAOffset);
- CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize;
- }
- if (PPRCalleeSavedSize) {
- emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP,
- PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy,
- false, false, nullptr, EmitCFI && !HasFP, CFAOffset);
+      // If split SVE is enabled, this deallocates PPRs; otherwise ZPRs + PPRs:
+ if (SVECalleeSavesToDealloc)
+ emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP,
+ SVECalleeSavesToDealloc, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI, EmitCFI && !HasFP, CFAOffset);
}
- // We only emit CFI information for ZPRs so emit CFI after the ZPR restores.
if (EmitCFI)
- emitCalleeSavedSVERestores(ZPRRestoreEnd);
+ emitCalleeSavedSVERestores(
+ SVELayout == SVEStackLayout::Split ? ZPRRange.End : PPRRange.End);
}
if (!HasFP) {
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
index a1c9b34..bccadda 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
@@ -27,11 +27,23 @@ class AArch64Subtarget;
class AArch64FunctionInfo;
class AArch64FrameLowering;
+struct SVEFrameSizes {
+ struct {
+ StackOffset CalleeSavesSize, LocalsSize;
+ } PPR, ZPR;
+};
+
class AArch64PrologueEpilogueCommon {
public:
AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB,
const AArch64FrameLowering &AFL);
+ enum class SVEStackLayout {
+ Default,
+ Split,
+ CalleeSavesAboveFrameRecord,
+ };
+
protected:
bool requiresGetVGCall() const;
@@ -53,6 +65,8 @@ protected:
bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const;
+ SVEFrameSizes getSVEStackFrameSizes() const;
+
MachineFunction &MF;
MachineBasicBlock &MBB;
@@ -68,6 +82,7 @@ protected:
bool IsFunclet = false; // Note: Set in derived constructors.
bool NeedsWinCFI = false; // Note: Can be changed in emitFramePointerSetup.
bool HomPrologEpilog = false; // Note: Set in derived constructors.
+ SVEStackLayout SVELayout = SVEStackLayout::Default;
// Note: "HasWinCFI" is mutable as it can change in any "emit" function.
mutable bool HasWinCFI = false;
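
SVELayout folds the old FPAfterSVECalleeSaves flag and hasSplitSVEObjects() query into one three-way state, computed once in the common constructor. The selection logic, restated as a standalone sketch (parameter names are illustrative):

    enum class SVEStackLayout { Default, Split, CalleeSavesAboveFrameRecord };

    // Windows with SVE callee-saves must place those saves above the frame
    // record; otherwise split SVE objects select the Split layout.
    static SVEStackLayout pickLayout(bool IsTargetWindows, bool HasSVECalleeSaves,
                                     bool HasSplitSVEObjects) {
      if (IsTargetWindows && HasSVECalleeSaves)
        return SVEStackLayout::CalleeSavesAboveFrameRecord;
      if (HasSplitSVEObjects)
        return SVEStackLayout::Split;
      return SVEStackLayout::Default;
    }
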
diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td
index 6d0529f..fb0928b8 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/llvm/lib/Target/Hexagon/Hexagon.td
@@ -110,8 +110,6 @@ def FeatureSmallData: SubtargetFeature<"small-data", "UseSmallData", "true",
"Allow GP-relative addressing of global variables">;
def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true",
"Enable generation of duplex instruction">;
-def FeatureUnsafeFP: SubtargetFeature<"unsafe-fp", "UseUnsafeMath", "true",
- "Use unsafe FP math">;
def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19",
"true", "Reserve register R19">;
def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim",
@@ -167,7 +165,6 @@ def UseHVXQFloat : Predicate<"HST->useHVXQFloatOps()">,
def UseHVXFloatingPoint: Predicate<"HST->useHVXFloatingPoint()">;
def HasMemNoShuf : Predicate<"HST->hasMemNoShuf()">,
AssemblerPredicate<(all_of FeatureMemNoShuf)>;
-def UseUnsafeMath : Predicate<"HST->useUnsafeMath()">;
def NotOptTinyCore : Predicate<"!HST->isTinyCore() ||"
"MF->getFunction().hasOptSize()"> {
let RecomputePerFunction = 1;
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 4b23670..a0acfcf 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -1611,8 +1611,11 @@ def DfMpy: OutPatFrag<(ops node:$Rs, node:$Rt),
$Rt, $Rs),
$Rs, $Rt)>;
-let Predicates = [HasV67,UseUnsafeMath], AddedComplexity = 50 in {
- def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>;
+def fmul_afn : PatFrag<(ops node:$a, node:$b), (fmul node:$a, node:$b), [{
+ return N->getFlags().hasApproximateFuncs();
+}]>;
+let Predicates = [HasV67], AddedComplexity = 50 in {
+ def : Pat<(fmul_afn F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>;
}
let Predicates = [HasV67] in {
def: OpR_RR_pat<F2_dfmin, pf2<fminimumnum>, f64, F64>;
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index b111471..7430567 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -54,7 +54,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
bool UseNewValueJumps = false;
bool UseNewValueStores = false;
bool UseSmallData = false;
- bool UseUnsafeMath = false;
bool UseZRegOps = false;
bool UseHVXIEEEFPOps = false;
bool UseHVXQFloatOps = false;
@@ -234,7 +233,6 @@ public:
bool useNewValueJumps() const { return UseNewValueJumps; }
bool useNewValueStores() const { return UseNewValueStores; }
bool useSmallData() const { return UseSmallData; }
- bool useUnsafeMath() const { return UseUnsafeMath; }
bool useZRegOps() const { return UseZRegOps; }
bool useCabac() const { return UseCabac; }
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 0afa04a..f5d8b69 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -250,13 +250,6 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
std::string FS =
FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
- // Append the preexisting target features last, so that +mattr overrides
- // the "unsafe-fp-math" function attribute.
- // Creating a separate target feature is not strictly necessary, it only
- // exists to make "unsafe-fp-math" force creating a new subtarget.
-
- if (F.getFnAttribute("unsafe-fp-math").getValueAsBool())
- FS = FS.empty() ? "+unsafe-fp" : "+unsafe-fp," + FS;
auto &I = SubtargetMap[CPU + FS];
if (!I) {
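
With the unsafe-fp subtarget feature gone, DfMpy selection keys off the per-operation afn (approximate functions) fast-math flag, which reaches SelectionDAG as node flags and is tested by the fmul_afn PatFrag's hasApproximateFuncs() check. A sketch of IR that would now select DfMpy on V67, assuming an IRBuilder<> and two double operands:

    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Attach the afn flag to a single fmul; unlike the removed function-wide
    // "unsafe-fp-math" attribute, this is scoped to one operation.
    static Value *buildApproxFMul(IRBuilder<> &B, Value *X, Value *Y) {
      FastMathFlags FMF;
      FMF.setApproxFunc();
      B.setFastMathFlags(FMF);
      return B.CreateFMul(X, Y);
    }
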
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index a29b7dd..e519b72 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -702,13 +702,13 @@ def : Pat<(binop_allwusers<or>
(PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
def : Pat<(binop_allwusers<or>
(or (zexti16 (XLenVT GPR:$rs1)),
- (shl GPR:$op1rs1, (XLenVT 24))),
- (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
+ (shl GPR:$op1rs2, (XLenVT 24))),
+ (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
(PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
def : Pat<(i64 (or (or (zexti16 (XLenVT GPR:$rs1)),
- (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 16))),
- (sext_inreg (shl GPR:$op1rs1, (XLenVT 24)), i32))),
+ (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+ (sext_inreg (shl GPR:$op1rs2, (XLenVT 24)), i32))),
(PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
// Match a pattern of 2 halfwords being inserted into bits [63:32], with bits
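
The swap corrects which source register lands in each byte lane: PACKH packs its first operand into the low byte of the halfword, and PACKW then inserts that halfword at bits [31:16], so the operand shifted by 16 must be $op1rs1 and the one shifted by 24 must be $op1rs2. The corrected pattern computes, in plain C++ terms (function name illustrative):

    #include <cstdint>

    // Byte layout matched by PACKW rs1, (PACKH op1rs1, op1rs2):
    //   bits [15:0]  = rs1 & 0xFFFF
    //   bits [23:16] = op1rs1 & 0xFF
    //   bits [31:24] = op1rs2 (low byte, shifted to the top)
    static uint32_t packwPackh(uint32_t rs1, uint32_t op1rs1, uint32_t op1rs2) {
      return (rs1 & 0xFFFF) | ((op1rs1 & 0xFF) << 16) | (op1rs2 << 24);
    }
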
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
index af79070..275165d 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
@@ -184,8 +184,8 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI,
// Output the TLS marker if present.
if ((unsigned)OpNum + 1 < MI->getNumOperands()) {
const MCOperand &MO = MI->getOperand(OpNum + 1);
- const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr());
- switch (refExp.getSpecifier()) {
+ const MCSymbolRefExpr &RefExp = cast<MCSymbolRefExpr>(*MO.getExpr());
+ switch (RefExp.getSpecifier()) {
case SystemZ::S_TLSGD:
O << ":tls_gdcall:";
break;
@@ -195,7 +195,7 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI,
default:
llvm_unreachable("Unexpected symbol kind");
}
- O << refExp.getSymbol().getName();
+ O << RefExp.getSymbol().getName();
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index fce6393..8c31579 100644
--- a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -13,10 +13,9 @@
using namespace llvm;
-SystemZConstantPoolValue::
-SystemZConstantPoolValue(const GlobalValue *gv,
- SystemZCP::SystemZCPModifier modifier)
- : MachineConstantPoolValue(gv->getType()), GV(gv), Modifier(modifier) {}
+SystemZConstantPoolValue::SystemZConstantPoolValue(
+ const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier)
+ : MachineConstantPoolValue(GV->getType()), GV(GV), Modifier(Modifier) {}
SystemZConstantPoolValue *
SystemZConstantPoolValue::Create(const GlobalValue *GV,
diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index 34d58e0..5313fba 100644
--- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -352,10 +352,9 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
// Similarly, a group-ending SU may either fit well (last in group), or
// end the group prematurely.
if (SC->EndGroup) {
- unsigned resultingGroupSize =
- (CurrGroupSize + getNumDecoderSlots(SU));
- if (resultingGroupSize < 3)
- return (3 - resultingGroupSize);
+ unsigned ResultingGroupSize = (CurrGroupSize + getNumDecoderSlots(SU));
+ if (ResultingGroupSize < 3)
+ return (3 - ResultingGroupSize);
return -1;
}
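The renaming above leaves the logic untouched, but that logic is worth restating: SystemZ decode groups hold three slots, so a group-ending SU that leaves the group short is charged the wasted slots, while one that ends the group exactly full is rewarded. A stand-alone restatement (illustrative, detached from the real hazard recognizer):

```cpp
// Decode groups hold 3 slots. For a group-ending SU, the cost is the number
// of slots it would waste; ending the group exactly full is rewarded (-1).
int endGroupCost(unsigned CurrGroupSize, unsigned NumDecoderSlots) {
  unsigned ResultingGroupSize = CurrGroupSize + NumDecoderSlots;
  if (ResultingGroupSize < 3)
    return 3 - ResultingGroupSize; // e.g. 1 used + 1 new slot -> cost 1
  return -1;                       // last instruction of a full group
}
```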
diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
index 9115946..f166fef 100644
--- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
@@ -24,6 +24,9 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -33,6 +36,11 @@ using namespace llvm;
#define DEBUG_TYPE "coro-annotation-elide"
+static cl::opt<float> CoroElideBranchRatio(
+ "coro-elide-branch-ratio", cl::init(0.55), cl::Hidden,
+ cl::desc("Minimum BranchProbability to consider a elide a coroutine."));
+extern cl::opt<unsigned> MinBlockCounterExecution;
+
static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) {
for (Instruction &I : F->getEntryBlock())
if (!isa<AllocaInst>(&I))
@@ -145,6 +153,30 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine();
bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe);
if (IsCallerPresplitCoroutine && HasAttr) {
+ BranchProbability MinBranchProbability(
+ static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution),
+ MinBlockCounterExecution);
+
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
+
+ auto Prob = BranchProbability::getBranchProbability(
+ BFI.getBlockFreq(CB->getParent()).getFrequency(),
+ BFI.getEntryFreq().getFrequency());
+
+ if (Prob < MinBranchProbability) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller)
+ << "'" << ore::NV("callee", Callee->getName())
+ << "' not elided in '"
+ << ore::NV("caller", Caller->getName())
+ << "' because of low probability: "
+ << ore::NV("probability", Prob) << " (threshold: "
+ << ore::NV("threshold", MinBranchProbability) << ")";
+ });
+ continue;
+ }
+
auto *CallerN = CG.lookup(*Caller);
auto *CallerC = CallerN ? CG.lookupSCC(*CallerN) : nullptr;
// If CallerC is nullptr, it means LazyCallGraph hasn't visited Caller
@@ -156,7 +188,7 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller)
<< "'" << ore::NV("callee", Callee->getName())
<< "' elided in '" << ore::NV("caller", Caller->getName())
- << "'";
+ << "' (probability: " << ore::NV("probability", Prob) << ")";
});
FAM.invalidate(*Caller, PreservedAnalyses::none());
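The new guard converts the call site's block frequency into a branch probability relative to the function entry and refuses to elide below the `coro-elide-branch-ratio` threshold; note how the float ratio is quantized into integer `BranchProbability` arithmetic by using `MinBlockCounterExecution` as the denominator. A self-contained sketch of the same gate, with plain floating point standing in for `BranchProbability`/`BlockFrequencyInfo` (names illustrative):

```cpp
#include <cstdint>
#include <iostream>

// Returns true when the call site is too cold to elide; mirrors
//   Prob(CallBlockFreq / EntryFreq) < CoroElideBranchRatio
// from the hunk above.
bool tooColdToElide(uint64_t CallBlockFreq, uint64_t EntryFreq,
                    double BranchRatio = 0.55) {
  if (EntryFreq == 0)
    return true; // degenerate profile; stay conservative in this sketch
  return static_cast<double>(CallBlockFreq) / EntryFreq < BranchRatio;
}

int main() {
  std::cout << tooColdToElide(40, 100) << '\n'; // 1: 40% < 55%, skip elision
  std::cout << tooColdToElide(90, 100) << '\n'; // 0: hot enough to elide
}
```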
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 2583249..1a00d17 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -109,7 +109,7 @@ static cl::opt<float> MinRegionSizeRatio(
"outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
 // block to the cold edge, i.e., a confidence interval.
-static cl::opt<unsigned>
+cl::opt<unsigned>
MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
cl::desc("Minimum block executions to consider "
"its BranchProbabilityInfo valid"));
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 5b8ea15..b74a070 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -1084,8 +1084,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
auto ThenTerm = SplitBlockAndInsertIfThen(
IRB.CreateIsNull(Load), &*IP, false,
MDBuilder(IRB.getContext()).createUnlikelyBranchWeights());
- IRBuilder<> ThenIRB(ThenTerm);
+ InstrumentationIRBuilder ThenIRB(ThenTerm);
auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr);
+ if (EntryLoc)
+ Store->setDebugLoc(EntryLoc);
Load->setNoSanitizeMetadata();
Store->setNoSanitizeMetadata();
}
@@ -1131,7 +1133,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
EstimatedStackSize >= Options.StackDepthCallbackMin) {
if (InsertBefore)
IRB.SetInsertPoint(InsertBefore);
- IRB.CreateCall(SanCovStackDepthCallback)->setCannotMerge();
+ auto Call = IRB.CreateCall(SanCovStackDepthCallback);
+ if (EntryLoc)
+ Call->setDebugLoc(EntryLoc);
+ Call->setCannotMerge();
}
} else {
// Check stack depth. If it's the deepest so far, record it.
@@ -1144,8 +1149,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
auto ThenTerm = SplitBlockAndInsertIfThen(
IsStackLower, &*IP, false,
MDBuilder(IRB.getContext()).createUnlikelyBranchWeights());
- IRBuilder<> ThenIRB(ThenTerm);
+ InstrumentationIRBuilder ThenIRB(ThenTerm);
auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack);
+ if (EntryLoc)
+ Store->setDebugLoc(EntryLoc);
LowestStack->setNoSanitizeMetadata();
Store->setNoSanitizeMetadata();
}
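All three hunks share one shape: build the instrumentation with `InstrumentationIRBuilder` and, when the function entry carries a source location (`EntryLoc`), stamp it onto the freshly created store or call so the inserted instruction is attributed to the function entry rather than having no location at all. The guard reduced to a toy sketch (stand-in types, not the LLVM API):

```cpp
#include <optional>

struct DebugLoc { unsigned Line = 0; };
struct Instr {
  std::optional<DebugLoc> Loc;
  void setDebugLoc(DebugLoc L) { Loc = L; }
};

// Mirrors `if (EntryLoc) Store->setDebugLoc(EntryLoc);`: only stamp a
// location when the enclosing function actually has one to offer.
void stampEntryLoc(Instr &NewInst, const std::optional<DebugLoc> &EntryLoc) {
  if (EntryLoc)
    NewInst.setDebugLoc(*EntryLoc);
}
```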
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index e448230..3f7003d 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -61,6 +61,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DomTreeUpdater.h"
@@ -382,16 +383,9 @@ typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap;
typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap;
inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) {
- OS << "< ";
- for (const BasicBlock *BB : Path) {
- std::string BBName;
- if (BB->hasName())
- raw_string_ostream(BBName) << BB->getName();
- else
- raw_string_ostream(BBName) << BB;
- OS << BBName << " ";
- }
- OS << ">";
+ auto BBNames = llvm::map_range(
+ Path, [](const BasicBlock *BB) { return BB->getNameOrAsOperand(); });
+ OS << "< " << llvm::join(BBNames, ", ") << " >";
return OS;
}
@@ -423,7 +417,7 @@ struct ThreadingPath {
}
void print(raw_ostream &OS) const {
- OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]";
+ OS << Path << " [ " << ExitVal << ", " << DBB->getNameOrAsOperand() << " ]";
}
private:
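The rewritten printer leans on `llvm::map_range` (from STLExtras.h) to lazily project each block to its name and on `llvm::join` (the reason StringExtras.h is now included) to interpose the separators. The same reduction in portable standard C++, for readers without the LLVM ADT headers at hand:

```cpp
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

int main() {
  // Stand-ins for BB->getNameOrAsOperand() over a ThreadingPath.
  std::vector<std::string> Path = {"entry", "while.body", "exit"};
  // join(map_range(Path, ...), ", ") boils down to this fold
  // (assumes a non-empty path for brevity):
  std::string Joined = std::accumulate(
      std::next(Path.begin()), Path.end(), Path.front(),
      [](std::string Acc, const std::string &BB) { return Acc + ", " + BB; });
  std::cout << "< " << Joined << " >\n"; // prints: < entry, while.body, exit >
}
```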
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll
index 8d091a0..d380104 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll
@@ -61,7 +61,7 @@ define void @umin(i32 noundef %a, i32 noundef %b) {
; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + ((2 * %a) umin (4 * %b)))
; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is i32 2147483646
; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + ((2 * %a) umin (4 * %b)))
-; CHECK-NEXT: Loop %for.body: Trip multiple is 1
+; CHECK-NEXT: Loop %for.body: Trip multiple is 2
;
; void umin(unsigned a, unsigned b) {
; a *= 2;
@@ -157,7 +157,7 @@ define void @smin(i32 noundef %a, i32 noundef %b) {
; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + ((2 * %a)<nsw> smin (4 * %b)<nsw>))
; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is i32 2147483646
; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + ((2 * %a)<nsw> smin (4 * %b)<nsw>))
-; CHECK-NEXT: Loop %for.body: Trip multiple is 1
+; CHECK-NEXT: Loop %for.body: Trip multiple is 2
;
; void smin(signed a, signed b) {
; a *= 2;
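Both expectation updates encode the same arithmetic: the trip count is the backedge-taken count plus one, and each umin/smin operand is an even multiple, so the trip count is always divisible by two:

```latex
\mathrm{TC} = \mathrm{BTC} + 1
            = \bigl(\min(2a,\,4b) - 1\bigr) + 1
            = \min(2a,\,4b),
\qquad 2 \mid 2a,\; 2 \mid 4b \;\Longrightarrow\; 2 \mid \mathrm{TC}.
```

SCEV can therefore report a trip multiple of 2 instead of the trivially safe 1.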
diff --git a/llvm/test/CodeGen/Hexagon/fmul-v67.ll b/llvm/test/CodeGen/Hexagon/fmul-v67.ll
index 49098cd..fc0b7f7 100644
--- a/llvm/test/CodeGen/Hexagon/fmul-v67.ll
+++ b/llvm/test/CodeGen/Hexagon/fmul-v67.ll
@@ -29,7 +29,7 @@ b2:
; CHECK: [[R22]] += dfmpylh([[R20]],[[R21]])
; CHECK: [[R22]] += dfmpylh([[R21]],[[R20]])
; CHECK: [[R22]] += dfmpyhh([[R20]],[[R21]])
-define double @test_02(double %a0, double %a1) #2 {
+define double @test_02(double %a0, double %a1) #1 {
b2:
%v3 = fmul double %a0, %a1
ret double %v3
@@ -40,13 +40,11 @@ b2:
; CHECK: [[R30]] += dfmpylh(r1:0,r3:2)
; CHECK: [[R30]] += dfmpylh(r3:2,r1:0)
; CHECK: [[R30]] += dfmpyhh(r1:0,r3:2)
-define double @test_03(double %a0, double %a1) #3 {
+define double @test_03(double %a0, double %a1) #1 {
b2:
- %v3 = fmul double %a0, %a1
+ %v3 = fmul afn double %a0, %a1
ret double %v3
}
attributes #0 = { nounwind }
attributes #1 = { nounwind "target-cpu"="hexagonv67" }
-attributes #2 = { nounwind "target-cpu"="hexagonv67" "unsafe-fp-math"="false" }
-attributes #3 = { nounwind "target-cpu"="hexagonv67" "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
index 41a0e81..1edb387 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
@@ -12,63 +12,104 @@ declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols)
declare void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %addr, i32 %ncols)
declare void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %addr, i32 %ncols)
-; CHECK-LABEL: test_tcgen05_alloc
-define void @test_tcgen05_alloc(ptr %addr, i32 %ncols) {
-; CHECK_PTX64-LABEL: test_tcgen05_alloc(
+define void @test_tcgen05_alloc_cg1(ptr %addr, i32 %ncols) {
+; CHECK_PTX64-LABEL: test_tcgen05_alloc_cg1(
; CHECK_PTX64: {
; CHECK_PTX64-NEXT: .reg .b32 %r<2>;
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
; CHECK_PTX64-EMPTY:
; CHECK_PTX64-NEXT: // %bb.0:
-; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_param_0];
-; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_param_1];
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_cg1_param_0];
+; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_cg1_param_1];
; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1;
-; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1;
; CHECK_PTX64-NEXT: ret;
;
-; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc(
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_cg1(
; CHECK_PTX64_SHARED32: {
; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>;
; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>;
; CHECK_PTX64_SHARED32-EMPTY:
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_param_0];
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_param_1];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_cg1_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_cg1_param_1];
; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1;
-; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1;
; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols)
- call void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols)
+ ret void
+}
+define void @test_tcgen05_alloc_cg2(ptr %addr, i32 %ncols) {
+; CHECK_PTX64-LABEL: test_tcgen05_alloc_cg2(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_cg2_param_0];
+; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_cg2_param_1];
+; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_cg2(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_cg2_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_cg2_param_1];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1;
+; CHECK_PTX64_SHARED32-NEXT: ret;
+ call void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols)
ret void
}
-; CHECK-LABEL: test_tcgen05_alloc_shared
-define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) {
-; CHECK_PTX64-LABEL: test_tcgen05_alloc_shared(
+define void @test_tcgen05_alloc_shared_cg1(ptr addrspace(3) %addr, i32 %ncols) {
+; CHECK_PTX64-LABEL: test_tcgen05_alloc_shared_cg1(
; CHECK_PTX64: {
; CHECK_PTX64-NEXT: .reg .b32 %r<2>;
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
; CHECK_PTX64-EMPTY:
; CHECK_PTX64-NEXT: // %bb.0:
-; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_shared_param_0];
-; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_param_1];
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_shared_cg1_param_0];
+; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_cg1_param_1];
; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%rd1], %r1;
-; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%rd1], %r1;
; CHECK_PTX64-NEXT: ret;
;
-; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_shared(
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_shared_cg1(
; CHECK_PTX64_SHARED32: {
; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>;
; CHECK_PTX64_SHARED32-EMPTY:
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_param_0];
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_alloc_shared_param_1];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_cg1_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_alloc_shared_cg1_param_1];
; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%r1], %r2;
-; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%r1], %r2;
; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %addr, i32 %ncols)
+ ret void
+}
+define void @test_tcgen05_alloc_shared_cg2(ptr addrspace(3) %addr, i32 %ncols) {
+; CHECK_PTX64-LABEL: test_tcgen05_alloc_shared_cg2(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_shared_cg2_param_0];
+; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_cg2_param_1];
+; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%rd1], %r1;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_shared_cg2(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_cg2_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_alloc_shared_cg2_param_1];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%r1], %r2;
+; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %addr, i32 %ncols)
ret void
}
@@ -76,31 +117,50 @@ define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) {
declare void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols)
declare void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols)
-; CHECK-LABEL: test_tcgen05_dealloc
-define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) {
-; CHECK_PTX64-LABEL: test_tcgen05_dealloc(
+define void @test_tcgen05_dealloc_cg1(ptr addrspace(6) %tmem_addr, i32 %ncols) {
+; CHECK_PTX64-LABEL: test_tcgen05_dealloc_cg1(
; CHECK_PTX64: {
; CHECK_PTX64-NEXT: .reg .b32 %r<3>;
; CHECK_PTX64-EMPTY:
; CHECK_PTX64-NEXT: // %bb.0:
-; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_param_0];
-; CHECK_PTX64-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_param_1];
+; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_cg1_param_0];
+; CHECK_PTX64-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_cg1_param_1];
; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2;
-; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2;
; CHECK_PTX64-NEXT: ret;
;
-; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_dealloc(
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_dealloc_cg1(
; CHECK_PTX64_SHARED32: {
; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>;
; CHECK_PTX64_SHARED32-EMPTY:
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_param_0];
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_param_1];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_cg1_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_cg1_param_1];
; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2;
-; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2;
; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols)
+ ret void
+}
+define void @test_tcgen05_dealloc_cg2(ptr addrspace(6) %tmem_addr, i32 %ncols) {
+; CHECK_PTX64-LABEL: test_tcgen05_dealloc_cg2(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<3>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_cg2_param_0];
+; CHECK_PTX64-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_cg2_param_1];
+; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_dealloc_cg2(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_cg2_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_cg2_param_1];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2;
+; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols)
ret void
}
@@ -108,27 +168,42 @@ define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) {
declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1()
declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2()
-; CHECK-LABEL: test_tcgen05_relinquish_alloc_permit
-define void @test_tcgen05_relinquish_alloc_permit() {
-; CHECK_PTX64-LABEL: test_tcgen05_relinquish_alloc_permit(
+define void @test_tcgen05_relinquish_alloc_permit_cg1() {
+; CHECK_PTX64-LABEL: test_tcgen05_relinquish_alloc_permit_cg1(
; CHECK_PTX64: {
; CHECK_PTX64-EMPTY:
; CHECK_PTX64-EMPTY:
; CHECK_PTX64-NEXT: // %bb.0:
; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;
-; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;
; CHECK_PTX64-NEXT: ret;
;
-; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_relinquish_alloc_permit(
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_relinquish_alloc_permit_cg1(
; CHECK_PTX64_SHARED32: {
; CHECK_PTX64_SHARED32-EMPTY:
; CHECK_PTX64_SHARED32-EMPTY:
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;
-; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;
; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1()
+ ret void
+}
+define void @test_tcgen05_relinquish_alloc_permit_cg2() {
+; CHECK_PTX64-LABEL: test_tcgen05_relinquish_alloc_permit_cg2(
+; CHECK_PTX64: {
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_relinquish_alloc_permit_cg2(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;
+; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2()
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
index 7981feb..2e80c4c 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
@@ -11,57 +11,93 @@ declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr)
declare void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %bar_addr)
declare void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %bar_addr)
-; CHECK-LABEL: test_tcgen05_commit
-define void @test_tcgen05_commit(ptr %bar_addr) {
-; CHECK_PTX64-LABEL: test_tcgen05_commit(
+define void @test_tcgen05_commit_cg1(ptr %bar_addr) {
+; CHECK_PTX64-LABEL: test_tcgen05_commit_cg1(
; CHECK_PTX64: {
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
; CHECK_PTX64-EMPTY:
; CHECK_PTX64-NEXT: // %bb.0:
-; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_param_0];
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_cg1_param_0];
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
-; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
; CHECK_PTX64-NEXT: ret;
;
-; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit(
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_cg1(
; CHECK_PTX64_SHARED32: {
; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>;
; CHECK_PTX64_SHARED32-EMPTY:
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_cg1_param_0];
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
-; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr)
+ ret void
+}
+
+define void @test_tcgen05_commit_cg2(ptr %bar_addr) {
+; CHECK_PTX64-LABEL: test_tcgen05_commit_cg2(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_cg2_param_0];
+; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_cg2(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_cg2_param_0];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
+; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr)
ret void
}
-; CHECK-LABEL: test_tcgen05_commit_shared
-define void @test_tcgen05_commit_shared(ptr addrspace(3) %bar_addr) {
-; CHECK_PTX64-LABEL: test_tcgen05_commit_shared(
+define void @test_tcgen05_commit_shared_cg1(ptr addrspace(3) %bar_addr) {
+; CHECK_PTX64-LABEL: test_tcgen05_commit_shared_cg1(
; CHECK_PTX64: {
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
; CHECK_PTX64-EMPTY:
; CHECK_PTX64-NEXT: // %bb.0:
-; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_shared_param_0];
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_shared_cg1_param_0];
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
-; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
; CHECK_PTX64-NEXT: ret;
;
-; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_shared(
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_shared_cg1(
; CHECK_PTX64_SHARED32: {
; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>;
; CHECK_PTX64_SHARED32-EMPTY:
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_shared_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_shared_cg1_param_0];
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%r1];
-; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%r1];
; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %bar_addr)
+ ret void
+}
+
+define void @test_tcgen05_commit_shared_cg2(ptr addrspace(3) %bar_addr) {
+; CHECK_PTX64-LABEL: test_tcgen05_commit_shared_cg2(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_shared_cg2_param_0];
+; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_shared_cg2(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_shared_cg2_param_0];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%r1];
+; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %bar_addr)
ret void
@@ -72,66 +108,106 @@ declare void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %bar_addr, i16 %cta_mask)
declare void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask)
declare void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask)
-; CHECK-LABEL: test_tcgen05_commit_mc
-define void @test_tcgen05_commit_mc(ptr %bar_addr, i16 %cta_mask) {
-; CHECK_PTX64-LABEL: test_tcgen05_commit_mc(
+define void @test_tcgen05_commit_mc_cg1(ptr %bar_addr, i16 %cta_mask) {
+; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_cg1(
; CHECK_PTX64: {
; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
; CHECK_PTX64-EMPTY:
; CHECK_PTX64-NEXT: // %bb.0:
-; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_param_0];
-; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_param_1];
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_cg1_param_0];
+; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_cg1_param_1];
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
-; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
; CHECK_PTX64-NEXT: ret;
;
-; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc(
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_cg1(
; CHECK_PTX64_SHARED32: {
; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>;
; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>;
; CHECK_PTX64_SHARED32-EMPTY:
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_param_0];
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_param_1];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_cg1_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_cg1_param_1];
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
-; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.commit.mc.cg1(ptr %bar_addr, i16 %cta_mask)
+ ret void
+}
+define void @test_tcgen05_commit_mc_cg2(ptr %bar_addr, i16 %cta_mask) {
+; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_cg2(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_cg2_param_0];
+; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_cg2_param_1];
+; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_cg2(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_cg2_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_cg2_param_1];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
+; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %bar_addr, i16 %cta_mask)
-
ret void
}
-; CHECK-LABEL: test_tcgen05_commit_mc_shared
-define void @test_tcgen05_commit_mc_shared(ptr addrspace(3) %bar_addr, i16 %cta_mask) {
-; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_shared(
+define void @test_tcgen05_commit_mc_shared_cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask) {
+; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_shared_cg1(
; CHECK_PTX64: {
; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
; CHECK_PTX64-EMPTY:
; CHECK_PTX64-NEXT: // %bb.0:
-; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_shared_param_0];
-; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_param_1];
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_shared_cg1_param_0];
+; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_cg1_param_1];
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
-; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
; CHECK_PTX64-NEXT: ret;
;
-; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_shared(
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_shared_cg1(
; CHECK_PTX64_SHARED32: {
; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>;
; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>;
; CHECK_PTX64_SHARED32-EMPTY:
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_mc_shared_param_0];
-; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_param_1];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_mc_shared_cg1_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_cg1_param_1];
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1;
-; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1;
; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask)
+ ret void
+}
+define void @test_tcgen05_commit_mc_shared_cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask) {
+; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_shared_cg2(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_shared_cg2_param_0];
+; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_cg2_param_1];
+; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_shared_cg2(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_mc_shared_cg2_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_cg2_param_1];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1;
+; CHECK_PTX64_SHARED32-NEXT: ret;
call void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask)
-
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
index c540f78..817b1d5 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
@@ -4,346 +4,580 @@
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
-; CHECK-LABEL: test_tcgen05_cp_64x128_v1
-define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_64x128_v1(
+define void @test_tcgen05_cp_64x128_v1_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_64x128_v1_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_64x128_v2
-define void @test_tcgen05_cp_64x128_v2(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_64x128_v2(
+define void @test_tcgen05_cp_64x128_v2_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v2_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_64x128_v2_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v2_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_32x128
-define void @test_tcgen05_cp_32x128(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_32x128(
+define void @test_tcgen05_cp_32x128_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_32x128_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_32x128_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_32x128_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_128x128b
-define void @test_tcgen05_cp_128x128b(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_128x128b(
+define void @test_tcgen05_cp_128x128b_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x128b_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x128b.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_128x128b_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x128b_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x128b.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_128x256b
-define void @test_tcgen05_cp_128x256b(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_128x256b(
+define void @test_tcgen05_cp_128x256b_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x256b_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_128x256b_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x256b_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_4x256b
-define void @test_tcgen05_cp_4x256b(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_4x256b(
+define void @test_tcgen05_cp_4x256b_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_4x256b_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.4x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_4x256b_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_4x256b_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.4x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
; With src_fmt as b6x16_p32
-; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32
-define void @test_tcgen05_cp_128x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32(
+define void @test_tcgen05_cp_128x256b_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_128x256b_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32
-define void @test_tcgen05_cp_4x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32(
+define void @test_tcgen05_cp_4x256b_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_4x256b_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32
-define void @test_tcgen05_cp_128x128b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32(
+define void @test_tcgen05_cp_128x128b_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_128x128b_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32
-define void @test_tcgen05_cp_64x128_v1_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32(
+define void @test_tcgen05_cp_64x128_v1_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_64x128_v1_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32
-define void @test_tcgen05_cp_64x128_v2_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32(
+define void @test_tcgen05_cp_64x128_v2_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_64x128_v2_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32
-define void @test_tcgen05_cp_32x128_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32(
+define void @test_tcgen05_cp_32x128_b6x16_p32_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_32x128_b6x16_p32_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
; With src_fmt as b4x16_p64
-; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64
-define void @test_tcgen05_cp_128x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64(
+define void @test_tcgen05_cp_128x256b_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_128x256b_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64
-define void @test_tcgen05_cp_4x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64(
+define void @test_tcgen05_cp_4x256b_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_4x256b_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64
-define void @test_tcgen05_cp_128x128b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64(
+define void @test_tcgen05_cp_128x128b_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_128x128b_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64
-define void @test_tcgen05_cp_64x128_v1_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64(
+define void @test_tcgen05_cp_64x128_v1_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_64x128_v1_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64
-define void @test_tcgen05_cp_64x128_v2_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64(
+define void @test_tcgen05_cp_64x128_v2_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_64x128_v2_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
}
-; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64
-define void @test_tcgen05_cp_32x128_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) {
-; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64(
+define void @test_tcgen05_cp_32x128_b4x16_p64_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0];
-; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_cg1_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_cg1_param_1];
; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1;
-; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc)
+
+ ret void
+}
+
+define void @test_tcgen05_cp_32x128_b4x16_p64_cg2(ptr addrspace(6) %addr, i64 %sdesc) {
+; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_cg2_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_cg2_param_1];
+; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1;
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc)
ret void
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
index 8ca6a2a0..bf2adac 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
@@ -7,18 +7,29 @@
declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr)
declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr)
-; CHECK-LABEL: test_tcgen05_shift
-define void @test_tcgen05_shift(ptr addrspace(6) %tmem_addr) {
-; CHECK-LABEL: test_tcgen05_shift(
+define void @test_tcgen05_shift_cg1(ptr addrspace(6) %tmem_addr) {
+; CHECK-LABEL: test_tcgen05_shift_cg1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_shift_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_shift_cg1_param_0];
; CHECK-NEXT: tcgen05.shift.cta_group::1.down [%r1];
-; CHECK-NEXT: tcgen05.shift.cta_group::2.down [%r1];
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr)
+
+ ret void
+}
+
+define void @test_tcgen05_shift_cg2(ptr addrspace(6) %tmem_addr) {
+; CHECK-LABEL: test_tcgen05_shift_cg2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_shift_cg2_param_0];
+; CHECK-NEXT: tcgen05.shift.cta_group::2.down [%r1];
+; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr)
ret void
diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
index 4537d18..b2ad8d7 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
@@ -441,7 +441,7 @@ define void @pack_lo_packh_hi_packh_2(i8 zeroext %0, i8 zeroext %1, i8 zeroext %
; RV64ZBKB-LABEL: pack_lo_packh_hi_packh_2:
; RV64ZBKB: # %bb.0:
; RV64ZBKB-NEXT: packh a0, a0, a1
-; RV64ZBKB-NEXT: packh a1, a3, a2
+; RV64ZBKB-NEXT: packh a1, a2, a3
; RV64ZBKB-NEXT: packw a0, a0, a1
; RV64ZBKB-NEXT: sw a0, 0(a4)
; RV64ZBKB-NEXT: ret
@@ -477,7 +477,7 @@ define void @pack_lo_packh_hi_packh_3(i8 %0, i8 %1, i8 %2, i8 %3, ptr %p) nounwi
; RV64ZBKB-LABEL: pack_lo_packh_hi_packh_3:
; RV64ZBKB: # %bb.0:
; RV64ZBKB-NEXT: packh a0, a0, a1
-; RV64ZBKB-NEXT: packh a1, a3, a2
+; RV64ZBKB-NEXT: packh a1, a2, a3
; RV64ZBKB-NEXT: packw a0, a0, a1
; RV64ZBKB-NEXT: sw a0, 0(a4)
; RV64ZBKB-NEXT: ret
@@ -509,7 +509,7 @@ define i32 @pack_lo_packh_hi_packh_4(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2
; RV64ZBKB-LABEL: pack_lo_packh_hi_packh_4:
; RV64ZBKB: # %bb.0:
; RV64ZBKB-NEXT: packh a0, a0, a1
-; RV64ZBKB-NEXT: packh a1, a3, a2
+; RV64ZBKB-NEXT: packh a1, a2, a3
; RV64ZBKB-NEXT: packw a0, a0, a1
; RV64ZBKB-NEXT: ret
%a = zext i8 %0 to i32
diff --git a/llvm/test/Instrumentation/AddressSanitizer/alloca-offset-lifetime.ll b/llvm/test/Instrumentation/AddressSanitizer/alloca-offset-lifetime.ll
deleted file mode 100644
index a4846176..0000000
--- a/llvm/test/Instrumentation/AddressSanitizer/alloca-offset-lifetime.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; Test that ASAN will not instrument lifetime markers on alloca offsets.
-;
-; RUN: opt < %s -passes=asan --asan-use-after-scope -S | FileCheck %s
-
-target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.15.0"
-
-%t = type { ptr, ptr, %sub, i64 }
-%sub = type { i32 }
-
-define void @foo() sanitize_address {
-entry:
- %0 = alloca %t, align 8
- %x = getelementptr inbounds %t, ptr %0, i64 0, i32 2
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %x)
- call void @bar(ptr nonnull %x)
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %x) #3
- ret void
-}
-
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-declare void @bar(ptr)
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
-
-; CHECK: store i64 %[[STACK_BASE:.+]], ptr %asan_local_stack_base, align 8
-; CHECK-NOT: store i8 0
-; CHECK: call void @bar(ptr nonnull %x)
diff --git a/llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll b/llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll
index 0859a7e..d7204e6 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/calls-only-smallfn.ll
@@ -9,15 +9,15 @@ define void @foo() #0 {
entry:
%array01 = alloca [1 x i8], align 1
%array02 = alloca [2 x i8], align 1
-; OUTLINE: call void @__asan_set_shadow_f1(i64 %23, i64 4)
-; OUTLINE: call void @__asan_set_shadow_01(i64 %24, i64 1)
-; OUTLINE: call void @__asan_set_shadow_f2(i64 %25, i64 1)
-; OUTLINE: call void @__asan_set_shadow_02(i64 %26, i64 1)
-; OUTLINE: call void @__asan_set_shadow_f3(i64 %27, i64 1)
-; OUTLINE: call void @__asan_stack_free_0(i64 %7, i64 64)
-; OUTLINE: call void @__asan_set_shadow_00(i64 %55, i64 8)
-; INLINE: store i64 -935919682371587599, ptr %24, align 1
-; INLINE: store i64 -723401728380766731, ptr %52, align 1
+; OUTLINE: call void @__asan_set_shadow_f1(i64 %{{.+}}, i64 4)
+; OUTLINE: call void @__asan_set_shadow_01(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_f2(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_02(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_f3(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_stack_free_0(i64 %{{.+}}, i64 64)
+; OUTLINE: call void @__asan_set_shadow_00(i64 %{{.+}}, i64 8)
+; INLINE: store i64 -935919682371587599, ptr %{{.+}}, align 1
+; INLINE: store i64 -723401728380766731, ptr %{{.+}}, align 1
%arrayidx = getelementptr inbounds [1 x i8], ptr %array01, i64 0, i64 1
store i8 1, ptr %arrayidx, align 1
%arrayidx1 = getelementptr inbounds [2 x i8], ptr %array02, i64 0, i64 2
diff --git a/llvm/test/Instrumentation/AddressSanitizer/calls-only.ll b/llvm/test/Instrumentation/AddressSanitizer/calls-only.ll
index 5f122ad..6f52289 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/calls-only.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/calls-only.ll
@@ -14,26 +14,26 @@ entry:
%array05 = alloca [5 x i8], align 1
%array06 = alloca [6 x i8], align 1
%array07 = alloca [7 x i8], align 1
-; OUTLINE: call void @__asan_set_shadow_f1(i64 %33, i64 4)
-; OUTLINE: call void @__asan_set_shadow_01(i64 %34, i64 1)
-; OUTLINE: call void @__asan_set_shadow_f2(i64 %35, i64 1)
-; OUTLINE: call void @__asan_set_shadow_02(i64 %36, i64 1)
-; OUTLINE: call void @__asan_set_shadow_f2(i64 %37, i64 1)
-; OUTLINE: call void @__asan_set_shadow_03(i64 %38, i64 1)
-; OUTLINE: call void @__asan_set_shadow_f2(i64 %39, i64 1)
-; OUTLINE: call void @__asan_set_shadow_04(i64 %40, i64 1)
-; OUTLINE: call void @__asan_set_shadow_f2(i64 %41, i64 1)
-; OUTLINE: call void @__asan_set_shadow_05(i64 %42, i64 1)
-; OUTLINE: call void @__asan_set_shadow_f2(i64 %43, i64 3)
-; OUTLINE: call void @__asan_set_shadow_06(i64 %44, i64 1)
-; OUTLINE: call void @__asan_set_shadow_f2(i64 %45, i64 3)
-; OUTLINE: call void @__asan_set_shadow_07(i64 %46, i64 1)
-; OUTLINE: call void @__asan_set_shadow_f3(i64 %47, i64 3)
-; OUTLINE: call void @__asan_stack_free_2(i64 %7, i64 192)
-; OUTLINE: call void @__asan_set_shadow_00(i64 %135, i64 24)
-; INLINE: store i64 -1007977276409515535, ptr %34, align 1
-; INLINE: store i64 -940423264817843709, ptr %36, align 1
-; INLINE: store i64 -868083087686045178, ptr %38, align 1
+; OUTLINE: call void @__asan_set_shadow_f1(i64 %{{.+}}, i64 4)
+; OUTLINE: call void @__asan_set_shadow_01(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_f2(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_02(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_f2(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_03(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_f2(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_04(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_f2(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_05(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_f2(i64 %{{.+}}, i64 3)
+; OUTLINE: call void @__asan_set_shadow_06(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_f2(i64 %{{.+}}, i64 3)
+; OUTLINE: call void @__asan_set_shadow_07(i64 %{{.+}}, i64 1)
+; OUTLINE: call void @__asan_set_shadow_f3(i64 %{{.+}}, i64 3)
+; OUTLINE: call void @__asan_stack_free_2(i64 %{{.+}}, i64 192)
+; OUTLINE: call void @__asan_set_shadow_00(i64 %{{.+}}, i64 24)
+; INLINE: store i64 -1007977276409515535, ptr %{{.+}}, align 1
+; INLINE: store i64 -940423264817843709, ptr %{{.+}}, align 1
+; INLINE: store i64 -868083087686045178, ptr %{{.+}}, align 1
%arrayidx = getelementptr inbounds [1 x i8], ptr %array01, i64 0, i64 1
store i8 1, ptr %arrayidx, align 1
%arrayidx1 = getelementptr inbounds [2 x i8], ptr %array02, i64 0, i64 2
@@ -48,7 +48,7 @@ entry:
store i8 6, ptr %arrayidx5, align 1
%arrayidx6 = getelementptr inbounds [7 x i8], ptr %array07, i64 0, i64 7
store i8 7, ptr %arrayidx6, align 1
-; CHECK-NOT: store i64 -723401728380766731, ptr %126, align 1
+; CHECK-NOT: store i64 -723401728380766731, ptr %{{.+}}, align 1
ret void
}
attributes #0 = { noinline nounwind optnone sanitize_address ssp uwtable(sync) "frame-pointer"="non-leaf" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
diff --git a/llvm/test/Instrumentation/SanitizerCoverage/missing_dbg.ll b/llvm/test/Instrumentation/SanitizerCoverage/missing_dbg.ll
index 3568434..07b9a1c 100644
--- a/llvm/test/Instrumentation/SanitizerCoverage/missing_dbg.ll
+++ b/llvm/test/Instrumentation/SanitizerCoverage/missing_dbg.ll
@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=2 -S | FileCheck %s
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -sanitizer-coverage-stack-depth-callback-min=1 -S | FileCheck %s --check-prefix=CHECK-STACK-CALLBACK
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -S | FileCheck %s --check-prefix=CHECK-STACK-DEPTH
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -55,6 +57,86 @@ entry:
ret i32 %t
}
+define i32 @with_dbg_stack_callback(ptr %a) !dbg !8 {
+; CHECK-STACK-CALLBACK-LABEL: define i32 @with_dbg_stack_callback(
+; CHECK-STACK-CALLBACK-SAME: ptr [[A:%.*]]) !dbg [[DBG8:![0-9]+]] {
+; CHECK-STACK-CALLBACK-NEXT: entry:
+; CHECK-STACK-CALLBACK-NEXT: [[BUF:%.*]] = alloca [64 x i8], align 1
+; CHECK-STACK-CALLBACK-NEXT: call void @__sanitizer_cov_stack_depth() #[[ATTR1:[0-9]+]], !dbg [[DBG9:![0-9]+]]
+; CHECK-STACK-CALLBACK-NEXT: %t = load i32, ptr [[A]], align 4
+; CHECK-STACK-CALLBACK-NEXT: call void @external_func()
+; CHECK-STACK-CALLBACK-NEXT: ret i32 %t
+;
+entry:
+ %buf = alloca [64 x i8], align 1
+ %t = load i32, ptr %a, align 4
+ call void @external_func()
+ ret i32 %t
+}
+
+define i32 @with_dbg_stack_depth(ptr %a) !dbg !10 {
+; CHECK-STACK-DEPTH-LABEL: define i32 @with_dbg_stack_depth(
+; CHECK-STACK-DEPTH-SAME: ptr [[A:%.*]]) !dbg [[DBG10:![0-9]+]] {
+; CHECK-STACK-DEPTH-NEXT: entry:
+; CHECK-STACK-DEPTH-NEXT: [[BUF:%.*]] = alloca [64 x i8], align 1
+; CHECK-STACK-DEPTH-NEXT: [[TMP1:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
+; CHECK-STACK-DEPTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-STACK-DEPTH-NEXT: [[TMP3:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8
+; CHECK-STACK-DEPTH-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP2]], [[TMP3]]
+; CHECK-STACK-DEPTH-NEXT: br i1 [[TMP4]], label {{%.*}}, label {{%.*}}
+; CHECK-STACK-DEPTH: store i64 [[TMP2]], ptr @__sancov_lowest_stack, align 8, !dbg [[DBG11:![0-9]+]], {{.*}}!nosanitize
+; CHECK-STACK-DEPTH: %t = load i32, ptr [[A]], align 4
+; CHECK-STACK-DEPTH-NEXT: call void @external_func()
+; CHECK-STACK-DEPTH-NEXT: ret i32 %t
+;
+entry:
+ %buf = alloca [64 x i8], align 1
+ %t = load i32, ptr %a, align 4
+ call void @external_func()
+ ret i32 %t
+}
+
+define i32 @without_dbg_stack_callback(ptr %a) {
+; CHECK-STACK-CALLBACK-LABEL: define i32 @without_dbg_stack_callback(
+; CHECK-STACK-CALLBACK-SAME: ptr [[A:%.*]]) {
+; CHECK-STACK-CALLBACK-NEXT: entry:
+; CHECK-STACK-CALLBACK-NEXT: [[BUF:%.*]] = alloca [64 x i8], align 1
+; CHECK-STACK-CALLBACK-NEXT: call void @__sanitizer_cov_stack_depth() #[[ATTR1]]
+; CHECK-STACK-CALLBACK-NEXT: %t = load i32, ptr [[A]], align 4
+; CHECK-STACK-CALLBACK-NEXT: call void @external_func()
+; CHECK-STACK-CALLBACK-NEXT: ret i32 %t
+;
+entry:
+ %buf = alloca [64 x i8], align 1
+ %t = load i32, ptr %a, align 4
+ call void @external_func()
+ ret i32 %t
+}
+
+define i32 @without_dbg_stack_depth(ptr %a) {
+; CHECK-STACK-DEPTH-LABEL: define i32 @without_dbg_stack_depth(
+; CHECK-STACK-DEPTH-SAME: ptr [[A:%.*]]) {
+; CHECK-STACK-DEPTH-NEXT: entry:
+; CHECK-STACK-DEPTH-NEXT: [[BUF:%.*]] = alloca [64 x i8], align 1
+; CHECK-STACK-DEPTH-NEXT: [[TMP1:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
+; CHECK-STACK-DEPTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-STACK-DEPTH-NEXT: [[TMP3:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8
+; CHECK-STACK-DEPTH-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP2]], [[TMP3]]
+; CHECK-STACK-DEPTH-NEXT: br i1 [[TMP4]], label {{%.*}}, label {{%.*}}
+; CHECK-STACK-DEPTH: store i64 [[TMP2]], ptr @__sancov_lowest_stack, align 8, {{.*}}!nosanitize
+; CHECK-STACK-DEPTH: %t = load i32, ptr [[A]], align 4
+; CHECK-STACK-DEPTH-NEXT: call void @external_func()
+; CHECK-STACK-DEPTH-NEXT: ret i32 %t
+;
+entry:
+ %buf = alloca [64 x i8], align 1
+ %t = load i32, ptr %a, align 4
+ call void @external_func()
+ ret i32 %t
+}
+
+declare void @external_func()
+
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2}
@@ -66,6 +148,10 @@ entry:
!5 = !{}
!6 = !DILocation(line: 192, scope: !3)
!7 = !DILocation(line: 0, scope: !3)
+!8 = distinct !DISubprogram(name: "with_dbg_stack_callback", scope: !1, file: !1, line: 200, type: !4, scopeLine: 200, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!9 = !DILocation(line: 200, scope: !8)
+!10 = distinct !DISubprogram(name: "with_dbg_stack_depth", scope: !1, file: !1, line: 210, type: !4, scopeLine: 210, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!11 = !DILocation(line: 210, scope: !10)
;.
; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C89, file: [[META1:![0-9]+]], isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
@@ -76,3 +162,9 @@ entry:
; CHECK: [[DBG6]] = !DILocation(line: 192, scope: [[DBG3]])
; CHECK: [[DBG7]] = !DILocation(line: 0, scope: [[DBG3]])
;.
+; CHECK-STACK-CALLBACK: [[DBG8]] = distinct !DISubprogram(name: "with_dbg_stack_callback", scope: {{.*}}, file: {{.*}}, line: 200
+; CHECK-STACK-CALLBACK: [[DBG9]] = !DILocation(line: 200, scope: [[DBG8]])
+;.
+; CHECK-STACK-DEPTH: [[DBG10]] = distinct !DISubprogram(name: "with_dbg_stack_depth", scope: {{.*}}, file: {{.*}}, line: 210
+; CHECK-STACK-DEPTH: [[DBG11]] = !DILocation(line: 210, scope: [[DBG10]])
+;.
diff --git a/llvm/test/Transforms/Coroutines/coro-transform-must-elide.ll b/llvm/test/Transforms/Coroutines/coro-elide-safe.ll
index 4eec7ed..722693d 100644
--- a/llvm/test/Transforms/Coroutines/coro-transform-must-elide.ll
+++ b/llvm/test/Transforms/Coroutines/coro-elide-safe.ll
@@ -1,4 +1,8 @@
-; Testing elide performed its job for calls to coroutines marked safe.
+; Coroutine calls marked with `coro_elide_safe` should be elided.
+; Inside `caller`, we expect the `callee` coroutine to be elided.
+; Inside `caller_conditional`, `callee` is only called on an unlikely
+; path, hence we expect the `callee` coroutine NOT to be elided.
+;
; RUN: opt < %s -S -passes='cgscc(coro-annotation-elide)' | FileCheck %s
%struct.Task = type { ptr }
@@ -57,7 +61,7 @@ define ptr @callee.noalloc(i8 %arg, ptr dereferenceable(32) align(8) %frame) {
; Function Attrs: presplitcoroutine
define ptr @caller() #0 {
entry:
- %task = call ptr @callee(i8 0) #1
+ %task = call ptr @callee(i8 0) coro_elide_safe
ret ptr %task
; CHECK: %[[TASK:.+]] = alloca %struct.Task, align 8
; CHECK-NEXT: %[[FRAME:.+]] = alloca [32 x i8], align 8
@@ -69,6 +73,25 @@ entry:
; CHECK-NEXT: ret ptr %[[TASK]]
}
+; CHECK-LABEL: define ptr @caller_conditional(i1 %cond)
+; Function Attrs: presplitcoroutine
+define ptr @caller_conditional(i1 %cond) #0 {
+entry:
+ br i1 %cond, label %call, label %ret
+
+call:
+ ; CHECK-NOT: alloca
+ ; CHECK-NOT: @llvm.coro.id({{.*}}, ptr @callee, {{.*}})
+ ; CHECK: %task = call ptr @callee(i8 0)
+ ; CHECK-NEXT: br label %ret
+ %task = call ptr @callee(i8 0) coro_elide_safe
+ br label %ret
+
+ret:
+ %retval = phi ptr [ %task, %call ], [ null, %entry ]
+ ret ptr %retval
+}
+
declare token @llvm.coro.id(i32, ptr, ptr, ptr)
declare ptr @llvm.coro.begin(token, ptr)
declare ptr @llvm.coro.frame()
@@ -76,4 +99,3 @@ declare ptr @llvm.coro.subfn.addr(ptr, i8)
declare i1 @llvm.coro.alloc(token)
attributes #0 = { presplitcoroutine }
-attributes #1 = { coro_elide_safe }
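
A minimal standalone sketch (hypothetical, not part of the patch — `@callee` is only declared here, whereas the test defines a full coroutine) of how a call site opts into elision with the inline `coro_elide_safe` attribute that the renamed test now uses instead of an attribute group:

```llvm
; Sketch only: a presplit coroutine caller marking its call to a
; coroutine as safe to elide. The attribute is written inline at the
; call site, matching the updated test above.
declare ptr @callee(i8)

define ptr @caller() presplitcoroutine {
entry:
  %task = call ptr @callee(i8 0) coro_elide_safe
  ret ptr %task
}
```
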
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll
index 4173c32..f45798b 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll
@@ -7,10 +7,10 @@
; state, and the block that determines the next state.
; < path of BBs that form a cycle > [ state, determinator ]
define i32 @test1(i32 %num) !prof !0{
-; CHECK: < case2 for.inc for.body > [ 1, for.inc ]
-; CHECK-NEXT: < for.inc for.body > [ 1, for.inc ]
-; CHECK-NEXT: < case1 for.inc for.body > [ 2, for.inc ]
-; CHECK-NEXT: < case2 sel.si.unfold.false for.inc for.body > [ 2, sel.si.unfold.false ]
+; CHECK: < case2, for.inc, for.body > [ 1, for.inc ]
+; CHECK-NEXT: < for.inc, for.body > [ 1, for.inc ]
+; CHECK-NEXT: < case1, for.inc, for.body > [ 2, for.inc ]
+; CHECK-NEXT: < case2, sel.si.unfold.false, for.inc, for.body > [ 2, sel.si.unfold.false ]
entry:
br label %for.body
@@ -47,12 +47,12 @@ for.end:
; complicated CFG. Here the FSM is represented as a nested loop, with
; fallthrough cases.
define i32 @test2(i32 %init) {
-; CHECK: < loop.1.backedge loop.1 loop.2 loop.3 > [ 1, loop.1 ]
-; CHECK-NEXT: < case4 loop.1.backedge state.1.be2.si.unfold.false loop.1 loop.2 loop.3 > [ 2, loop.1.backedge ]
-; CHECK-NEXT: < case2 loop.1.backedge state.1.be2.si.unfold.false loop.1 loop.2 loop.3 > [ 4, loop.1.backedge ]
-; CHECK-NEXT: < case4 loop.2.backedge loop.2 loop.3 > [ 3, loop.2.backedge ]
-; CHECK-NEXT: < case3 loop.2.backedge loop.2 loop.3 > [ 0, loop.2.backedge ]
-; CHECK-NEXT: < case2 loop.3 > [ 3, loop.3 ]
+; CHECK: < loop.1.backedge, loop.1, loop.2, loop.3 > [ 1, loop.1 ]
+; CHECK-NEXT: < case4, loop.1.backedge, state.1.be2.si.unfold.false, loop.1, loop.2, loop.3 > [ 2, loop.1.backedge ]
+; CHECK-NEXT: < case2, loop.1.backedge, state.1.be2.si.unfold.false, loop.1, loop.2, loop.3 > [ 4, loop.1.backedge ]
+; CHECK-NEXT: < case4, loop.2.backedge, loop.2, loop.3 > [ 3, loop.2.backedge ]
+; CHECK-NEXT: < case3, loop.2.backedge, loop.2, loop.3 > [ 0, loop.2.backedge ]
+; CHECK-NEXT: < case2, loop.3 > [ 3, loop.3 ]
entry:
%cmp = icmp eq i32 %init, 0
%sel = select i1 %cmp, i32 0, i32 2
@@ -187,12 +187,12 @@ bb66: ; preds = %bb59
; Value %init is not predictable but it's okay since it is the value initial to the switch.
define i32 @initial.value.positive1(i32 %init) !prof !0 {
-; CHECK: < loop.1.backedge loop.1 loop.2 loop.3 > [ 1, loop.1 ]
-; CHECK-NEXT: < case4 loop.1.backedge state.1.be2.si.unfold.false loop.1 loop.2 loop.3 > [ 2, loop.1.backedge ]
-; CHECK-NEXT: < case2 loop.1.backedge state.1.be2.si.unfold.false loop.1 loop.2 loop.3 > [ 4, loop.1.backedge ]
-; CHECK-NEXT: < case4 loop.2.backedge loop.2 loop.3 > [ 3, loop.2.backedge ]
-; CHECK-NEXT: < case3 loop.2.backedge loop.2 loop.3 > [ 0, loop.2.backedge ]
-; CHECK-NEXT: < case2 loop.3 > [ 3, loop.3 ]
+; CHECK: < loop.1.backedge, loop.1, loop.2, loop.3 > [ 1, loop.1 ]
+; CHECK-NEXT: < case4, loop.1.backedge, state.1.be2.si.unfold.false, loop.1, loop.2, loop.3 > [ 2, loop.1.backedge ]
+; CHECK-NEXT: < case2, loop.1.backedge, state.1.be2.si.unfold.false, loop.1, loop.2, loop.3 > [ 4, loop.1.backedge ]
+; CHECK-NEXT: < case4, loop.2.backedge, loop.2, loop.3 > [ 3, loop.2.backedge ]
+; CHECK-NEXT: < case3, loop.2.backedge, loop.2, loop.3 > [ 0, loop.2.backedge ]
+; CHECK-NEXT: < case2, loop.3 > [ 3, loop.3 ]
entry:
%cmp = icmp eq i32 %init, 0
br label %loop.1
diff --git a/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll b/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll
index 92747629..cb7c46e 100644
--- a/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/max-path-length.ll
@@ -9,9 +9,9 @@
; too long so that it is not jump-threaded.
define i32 @max_path_length(i32 %num) {
; CHECK-NOT: 3, case1
-; CHECK: < case2 for.inc for.body > [ 1, for.inc ]
-; CHECK-NEXT: < for.inc for.body > [ 1, for.inc ]
-; CHECK-NEXT: < case2 sel.si.unfold.false for.inc for.body > [ 2, sel.si.unfold.false ]
+; CHECK: < case2, for.inc, for.body > [ 1, for.inc ]
+; CHECK-NEXT: < for.inc, for.body > [ 1, for.inc ]
+; CHECK-NEXT: < case2, sel.si.unfold.false, for.inc, for.body > [ 2, sel.si.unfold.false ]
; CHECK-NEXT: DFA-JT: Renaming non-local uses of:
entry:
br label %for.body
diff --git a/llvm/test/Transforms/GVN/assume-equal.ll b/llvm/test/Transforms/GVN/assume-equal.ll
index 0c922da..bbbc5c5 100644
--- a/llvm/test/Transforms/GVN/assume-equal.ll
+++ b/llvm/test/Transforms/GVN/assume-equal.ll
@@ -221,21 +221,22 @@ define i32 @_Z1ii(i32 %p) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[P]], 42
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB2]]
-; CHECK: [[BB2]]:
-; CHECK-NEXT: br i1 true, label %[[BB2]], label %[[BB2]]
-; CHECK: [[BB0:.*:]]
+; CHECK-NEXT: br i1 true, label %[[COMMON:.*]], label %[[COMMON]]
+; CHECK: [[COMMON]]:
+; CHECK-NEXT: br i1 true, label %[[COMMON]], label %[[COMMON]]
+; CHECK: [[EXIT:.*:]]
; CHECK-NEXT: ret i32 42
;
entry:
%cmp = icmp eq i32 %p, 42
call void @llvm.assume(i1 %cmp)
- br i1 %cmp, label %bb2, label %bb2
-bb2:
+ br i1 %cmp, label %common, label %common
+common:
call void @llvm.assume(i1 true)
- br i1 %cmp, label %bb2, label %bb2
+ br i1 %cmp, label %common, label %common
+exit:
ret i32 %p
}
@@ -357,8 +358,8 @@ define i8 @assume_ptr_eq_different_prov_matters(ptr %p, ptr %p2) {
ret i8 %v
}
-define i1 @assume_ptr_eq_different_prov_does_not_matter(ptr %p, ptr %p2) {
-; CHECK-LABEL: define i1 @assume_ptr_eq_different_prov_does_not_matter(
+define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp(ptr %p, ptr %p2) {
+; CHECK-LABEL: define i1 @assume_ptr_eq_different_prov_does_not_matter_icmp(
; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) {
; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P2]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
@@ -371,6 +372,36 @@ define i1 @assume_ptr_eq_different_prov_does_not_matter(ptr %p, ptr %p2) {
ret i1 %c
}
+; This is not correct, as it may change the provenance exposed by ptrtoint.
+; We still allow it for now.
+define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint(ptr %p, ptr %p2) {
+; CHECK-LABEL: define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoint(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P2]]
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[INT:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: ret i64 [[INT]]
+;
+ %cmp = icmp eq ptr %p, %p2
+ call void @llvm.assume(i1 %cmp)
+ %int = ptrtoint ptr %p2 to i64
+ ret i64 %int
+}
+
+define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr(ptr %p, ptr %p2) {
+; CHECK-LABEL: define i64 @assume_ptr_eq_different_prov_does_not_matter_ptrtoaddr(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P2]]
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[INT:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT: ret i64 [[INT]]
+;
+ %cmp = icmp eq ptr %p, %p2
+ call void @llvm.assume(i1 %cmp)
+ %int = ptrtoaddr ptr %p2 to i64
+ ret i64 %int
+}
+
define i8 @assume_ptr_eq_same_prov(ptr %p, i64 %x) {
; CHECK-LABEL: define i8 @assume_ptr_eq_same_prov(
; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index 61b1331..5211fbd 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -1,6 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-target datalayout = "p1:64:64:64:32"
+
+; The ptrtoaddr folds are also valid for pointers that have external state.
+target datalayout = "pe1:64:64:64:32"
+
+@g = external global i8
+@g2 = external global i8
+
+@g.as1 = external addrspace(1) global i8
+@g2.as1 = external addrspace(1) global i8
define i32 @ptrtoaddr_inttoptr_arg(i32 %a) {
; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg(
@@ -24,14 +32,14 @@ define i32 @ptrtoaddr_inttoptr() {
define i32 @ptrtoaddr_inttoptr_diff_size1() {
; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() {
-; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32)
+; CHECK-NEXT: ret i32 -1
;
ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32)
}
define i32 @ptrtoaddr_inttoptr_diff_size2() {
; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() {
-; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32)
+; CHECK-NEXT: ret i32 65535
;
ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32)
}
@@ -52,14 +60,73 @@ define i64 @ptr2addr2_inttoptr_noas2() {
define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
-; CHECK-NEXT: ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64)
+; CHECK-NEXT: ret i64 4294967295
;
ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64)
}
define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
-; CHECK-NEXT: ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64)
+; CHECK-NEXT: ret i64 -1
;
ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64)
}
+
+define i64 @ptrtoaddr_gep_null() {
+; CHECK-LABEL: define i64 @ptrtoaddr_gep_null() {
+; CHECK-NEXT: ret i64 42
+;
+ ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 42) to i64)
+}
+
+define i32 @ptrtoaddr_gep_null_addrsize() {
+; CHECK-LABEL: define i32 @ptrtoaddr_gep_null_addrsize() {
+; CHECK-NEXT: ret i32 42
+;
+ ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i32 42) to i32)
+}
+
+define i64 @ptrtoaddr_gep_sub() {
+; CHECK-LABEL: define i64 @ptrtoaddr_gep_sub() {
+; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
+;
+ ret i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 sub (i64 0, i64 ptrtoaddr (ptr @g2 to i64))) to i64)
+}
+
+define i32 @ptrtoaddr_gep_sub_addrsize() {
+; CHECK-LABEL: define i32 @ptrtoaddr_gep_sub_addrsize() {
+; CHECK-NEXT: ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))
+;
+ ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 sub (i32 0, i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))) to i32)
+}
+
+; Don't fold inttoptr of ptrtoaddr away. inttoptr will pick a previously
+; exposed provenance, which is not necessarily that of @g (especially as
+; ptrtoaddr does not expose the provenance).
+define ptr @inttoptr_of_ptrtoaddr() {
+; CHECK-LABEL: define ptr @inttoptr_of_ptrtoaddr() {
+; CHECK-NEXT: ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr)
+;
+ ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr)
+}
+
+define i64 @ptrtoaddr_sub_consts_unrelated() {
+; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_unrelated() {
+; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
+;
+ ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64))
+}
+
+define i64 @ptrtoaddr_sub_consts_offset() {
+; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_offset() {
+; CHECK-NEXT: ret i64 42
+;
+ ret i64 sub (i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 42) to i64), i64 ptrtoaddr (ptr @g to i64))
+}
+
+define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
+; CHECK-LABEL: define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
+; CHECK-NEXT: ret i32 42
+;
+ ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 42) to i32), i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32))
+}
diff --git a/llvm/test/Transforms/InstSimplify/ptr_diff.ll b/llvm/test/Transforms/InstSimplify/ptr_diff.ll
index d18b462..fdd9e8e 100644
--- a/llvm/test/Transforms/InstSimplify/ptr_diff.ll
+++ b/llvm/test/Transforms/InstSimplify/ptr_diff.ll
@@ -1,11 +1,9 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-define i64 @ptrdiff1(ptr %ptr) {
-; CHECK-LABEL: @ptrdiff1(
-; CHECK: ret i64 42
+define i64 @ptrdiff(ptr %ptr) {
+; CHECK-LABEL: @ptrdiff(
+; CHECK-NEXT: ret i64 42
;
%last = getelementptr inbounds i8, ptr %ptr, i32 42
%first.int = ptrtoint ptr %ptr to i64
@@ -14,9 +12,24 @@ define i64 @ptrdiff1(ptr %ptr) {
ret i64 %diff
}
-define i64 @ptrdiff2(ptr %ptr) {
-; CHECK-LABEL: @ptrdiff2(
-; CHECK: ret i64 42
+define i64 @ptrdiff_no_inbounds(ptr %ptr) {
+; CHECK-LABEL: @ptrdiff_no_inbounds(
+; CHECK-NEXT: [[LAST:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 42
+; CHECK-NEXT: [[FIRST_INT:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[LAST_INT:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[LAST_INT]], [[FIRST_INT]]
+; CHECK-NEXT: ret i64 [[DIFF]]
+;
+ %last = getelementptr i8, ptr %ptr, i32 42
+ %first.int = ptrtoint ptr %ptr to i64
+ %last.int = ptrtoint ptr %last to i64
+ %diff = sub i64 %last.int, %first.int
+ ret i64 %diff
+}
+
+define i64 @ptrdiff_chain(ptr %ptr) {
+; CHECK-LABEL: @ptrdiff_chain(
+; CHECK-NEXT: ret i64 42
;
%first2 = getelementptr inbounds i8, ptr %ptr, i32 1
%first3 = getelementptr inbounds i8, ptr %first2, i32 2
@@ -31,26 +44,10 @@ define i64 @ptrdiff2(ptr %ptr) {
ret i64 %diff
}
-define i64 @ptrdiff3(ptr %ptr) {
-; Don't bother with non-inbounds GEPs.
-; CHECK-LABEL: @ptrdiff3(
-; CHECK: [[LAST:%.*]] = getelementptr i8, ptr %ptr, i32 42
-; CHECK-NEXT: [[FIRST_INT:%.*]] = ptrtoint ptr %ptr to i64
-; CHECK-NEXT: [[LAST_INT:%.*]] = ptrtoint ptr [[LAST]] to i64
-; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[LAST_INT]], [[FIRST_INT]]
-; CHECK-NEXT: ret i64 [[DIFF]]
-;
- %last = getelementptr i8, ptr %ptr, i32 42
- %first.int = ptrtoint ptr %ptr to i64
- %last.int = ptrtoint ptr %last to i64
- %diff = sub i64 %last.int, %first.int
- ret i64 %diff
-}
-
-define <4 x i32> @ptrdiff4(<4 x ptr> %arg) nounwind {
; Handle simple cases of vectors of pointers.
-; CHECK-LABEL: @ptrdiff4(
-; CHECK: ret <4 x i32> zeroinitializer
+define <4 x i32> @ptrdiff_vectors(<4 x ptr> %arg) nounwind {
+; CHECK-LABEL: @ptrdiff_vectors(
+; CHECK-NEXT: ret <4 x i32> zeroinitializer
;
%p1 = ptrtoint <4 x ptr> %arg to <4 x i32>
%bc = bitcast <4 x ptr> %arg to <4 x ptr>
@@ -63,9 +60,9 @@ define <4 x i32> @ptrdiff4(<4 x ptr> %arg) nounwind {
@global = internal global %struct.ham zeroinitializer, align 4
-define i32 @ptrdiff5() nounwind {
-; CHECK-LABEL: @ptrdiff5(
-; CHECK: bb:
+define i32 @ptrdiff_global() nounwind {
+; CHECK-LABEL: @ptrdiff_global(
+; CHECK-NEXT: bb:
; CHECK-NEXT: ret i32 0
;
bb:
diff --git a/llvm/test/Transforms/LICM/vector-intrinsics.ll b/llvm/test/Transforms/LICM/vector-intrinsics.ll
new file mode 100644
index 0000000..351773e
--- /dev/null
+++ b/llvm/test/Transforms/LICM/vector-intrinsics.ll
@@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='loop-mssa(licm)' -verify-memoryssa %s | FileCheck %s
+
+define i32 @reduce_umax(<2 x i32> %inv, i1 %c) {
+; CHECK-LABEL: define i32 @reduce_umax(
+; CHECK-SAME: <2 x i32> [[INV:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[REDUCE_UMAX:%.*]] = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> [[INV]])
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = icmp ult i32 [[IV]], [[REDUCE_UMAX]]
+; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C]], i1 [[BACKEDGE_COND]], i1 false
+; CHECK-NEXT: br i1 [[OR_COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[IV_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ]
+ %iv.next = add i32 %iv, 1
+ br i1 %c, label %cond.true, label %exit
+
+cond.true:
+ %reduce.umax = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %inv)
+ %backedge.cond = icmp ult i32 %iv, %reduce.umax
+ br i1 %backedge.cond, label %loop, label %exit
+
+exit:
+ ret i32 %iv
+}
+
+define i32 @vp_umax(<2 x i32> %inv.l, <2 x i32> %inv.r, i1 %c) {
+; CHECK-LABEL: define i32 @vp_umax(
+; CHECK-SAME: <2 x i32> [[INV_L:%.*]], <2 x i32> [[INV_R:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[VP_UMAX:%.*]] = call <2 x i32> @llvm.vp.umax.v2i32(<2 x i32> [[INV_L]], <2 x i32> [[INV_R]], <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x i32> [[VP_UMAX]], i32 0
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = icmp ult i32 [[IV]], [[EXTRACT]]
+; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C]], i1 [[BACKEDGE_COND]], i1 false
+; CHECK-NEXT: br i1 [[OR_COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[IV_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ]
+ %iv.next = add i32 %iv, 1
+ br i1 %c, label %cond.true, label %exit
+
+cond.true:
+ %vp.umax = call <2 x i32> @llvm.vp.umax.v2i32(<2 x i32> %inv.l, <2 x i32> %inv.r, <2 x i1> splat (i1 1), i32 2)
+ %extract = extractelement <2 x i32> %vp.umax, i32 0
+ %backedge.cond = icmp ult i32 %iv, %extract
+ br i1 %backedge.cond, label %loop, label %exit
+
+exit:
+ ret i32 %iv
+}
+
+define i32 @vp_udiv(<2 x i32> %inv.q, <2 x i32> %inv.d, i1 %c) {
+; CHECK-LABEL: define i32 @vp_udiv(
+; CHECK-SAME: <2 x i32> [[INV_Q:%.*]], <2 x i32> [[INV_D:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[COND_TRUE:.*]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: br i1 [[C]], label %[[COND_TRUE]], label %[[EXIT:.*]]
+; CHECK: [[COND_TRUE]]:
+; CHECK-NEXT: [[VP_UDIV:%.*]] = call <2 x i32> @llvm.vp.udiv.v2i32(<2 x i32> [[INV_Q]], <2 x i32> [[INV_D]], <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x i32> [[VP_UDIV]], i32 0
+; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV]], [[EXTRACT]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[COND_TRUE]] ], [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[IV_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ]
+ %iv.next = add i32 %iv, 1
+ br i1 %c, label %cond.true, label %exit
+
+cond.true:
+ %vp.udiv = call <2 x i32> @llvm.vp.udiv.v2i32(<2 x i32> %inv.q, <2 x i32> %inv.d, <2 x i1> splat (i1 1), i32 2)
+ %extract = extractelement <2 x i32> %vp.udiv, i32 0
+ %backedge.cond = icmp ult i32 %iv, %extract
+ br i1 %backedge.cond, label %loop, label %exit
+
+exit:
+ ret i32 %iv
+}
+
+define i32 @vp_load(ptr %inv, i1 %c) {
+; CHECK-LABEL: define i32 @vp_load(
+; CHECK-SAME: ptr [[INV:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[COND_TRUE:.*]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: br i1 [[C]], label %[[COND_TRUE]], label %[[EXIT:.*]]
+; CHECK: [[COND_TRUE]]:
+; CHECK-NEXT: [[VP_LOAD:%.*]] = call <2 x i32> @llvm.vp.load.v2i32.p0(ptr [[INV]], <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x i32> [[VP_LOAD]], i32 0
+; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV]], [[EXTRACT]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[COND_TRUE]] ], [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[IV_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ]
+ %iv.next = add i32 %iv, 1
+ br i1 %c, label %cond.true, label %exit
+
+cond.true:
+ %vp.load = call <2 x i32> @llvm.vp.load.v2i32(ptr %inv, <2 x i1> splat (i1 1), i32 2)
+ %extract = extractelement <2 x i32> %vp.load, i32 0
+ %backedge.cond = icmp ult i32 %iv, %extract
+ br i1 %backedge.cond, label %loop, label %exit
+
+exit:
+ ret i32 %iv
+}
+
+define i32 @vp_store(<2 x i32> %inv.v, ptr %inv.p, i1 %c) {
+; CHECK-LABEL: define i32 @vp_store(
+; CHECK-SAME: <2 x i32> [[INV_V:%.*]], ptr [[INV_P:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[COND_TRUE:.*]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: br i1 [[C]], label %[[COND_TRUE]], label %[[EXIT:.*]]
+; CHECK: [[COND_TRUE]]:
+; CHECK-NEXT: call void @llvm.vp.store.v2i32.p0(<2 x i32> [[INV_V]], ptr [[INV_P]], <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = icmp ult i32 [[IV]], 10
+; CHECK-NEXT: br i1 [[BACKEDGE_COND]], label %[[LOOP]], label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], %[[COND_TRUE]] ], [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[IV_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %cond.true ]
+ %iv.next = add i32 %iv, 1
+ br i1 %c, label %cond.true, label %exit
+
+cond.true:
+ call void @llvm.vp.store.v2i32(<2 x i32> %inv.v, ptr %inv.p, <2 x i1> splat (i1 1), i32 2)
+ %backedge.cond = icmp ult i32 %iv, 10
+ br i1 %backedge.cond, label %loop, label %exit
+
+exit:
+ ret i32 %iv
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/RISCV/veclib-function-calls.ll
index d73900d..83b494a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/veclib-function-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/veclib-function-calls.ll
@@ -2288,7 +2288,7 @@ define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
}
;.
; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR2]] = { "vector-function-abi-variant"="_ZGVrNxv_acos(Sleef_acosdx_u10rvvm2)" }
; CHECK: attributes #[[ATTR3]] = { "vector-function-abi-variant"="_ZGVrNxv_acosf(Sleef_acosfx_u10rvvm2)" }
; CHECK: attributes #[[ATTR4]] = { "vector-function-abi-variant"="_ZGVrNxv_acosh(Sleef_acoshdx_u10rvvm2)" }
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 3500c5c..4fd8d17 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -546,19 +546,50 @@ define i64 @loop_guards_needed_to_prove_deref_multiple(i32 %x, i1 %c, ptr derefe
; CHECK-NEXT: call void @llvm.assume(i1 [[PRE_2]])
; CHECK-NEXT: [[N:%.*]] = add i32 [[SEL]], -1
; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SEL]], -2
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: middle.split:
+; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_LATCH:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[IV_NEXT]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[LOOP_LATCH]] ], [ 0, [[PH]] ]
+; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]]
; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[PH]] ]
-; CHECK-NEXT: [[GEP_SRC_I:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], [[LOOP_LATCH1:%.*]] ], [ [[IV]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[GEP_SRC_I:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I]], align 1
; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], 0
-; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_LATCH]]
+; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_LOOPEXIT]], label [[LOOP_LATCH1]]
; CHECK: loop.latch:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]]
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]]
+; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N_EXT]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: exit.loopexit:
-; CHECK-NEXT: [[RES_PH:%.*]] = phi i64 [ [[IV]], [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ]
+; CHECK-NEXT: [[RES_PH:%.*]] = phi i64 [ [[IV1]], [[LOOP_HEADER1]] ], [ 0, [[LOOP_LATCH1]] ], [ 0, [[LOOP_LATCH]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: [[RES:%.*]] = phi i64 [ -1, [[ENTRY:%.*]] ], [ -2, [[THEN]] ], [ [[RES_PH]], [[EXIT_LOOPEXIT]] ]
@@ -609,4 +640,6 @@ exit:
; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll
index 9acc6d6..09f583f 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll
@@ -39,5 +39,4 @@ declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0
declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>) #0
; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; CHECK-NEXT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/tools/llvm-exegesis/AArch64/no-aliasing-ld-str.s b/llvm/test/tools/llvm-exegesis/AArch64/no-aliasing-ld-str.s
index da83c54..8348c97 100644
--- a/llvm/test/tools/llvm-exegesis/AArch64/no-aliasing-ld-str.s
+++ b/llvm/test/tools/llvm-exegesis/AArch64/no-aliasing-ld-str.s
@@ -1,10 +1,12 @@
REQUIRES: aarch64-registered-target
-// Flakey on SVE buildbots, disabled pending invesgitation.
+// This will sometimes fail with "Not all operands were initialized by the snippet generator for...".
UNSUPPORTED: target={{.*}}
RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%t.obj --opcode-name=FMOVWSr --benchmark-phase=assemble-measured-code 2>&1
RUN: llvm-objdump -d %t.obj > %t.s
RUN: FileCheck %s < %t.s
+// Start matching after the printed file path, as that may contain something that looks like a mnemonic.
+CHECK: Disassembly of section .text:
CHECK-NOT: ld{{[1-4]}}
CHECK-NOT: st{{[1-4]}}
diff --git a/llvm/utils/lit/tests/xunit-output-report-failures-only.py b/llvm/utils/lit/tests/xunit-output-report-failures-only.py
index e15fd6a..c331578 100644
--- a/llvm/utils/lit/tests/xunit-output-report-failures-only.py
+++ b/llvm/utils/lit/tests/xunit-output-report-failures-only.py
@@ -5,7 +5,7 @@
# CHECK: <?xml version="1.0" encoding="UTF-8"?>
# CHECK-NEXT: <testsuites time="{{[0-9.]+}}">
# CHECK-NEXT: <testsuite name="test-data" tests="1" failures="1" skipped="0" time="{{[0-9.]+}}">
-# CHECK-NEXT: <testcase classname="test-data.test-data" name="bad&amp;name.ini" time="{{[0-1]\.[0-9]+}}">
+# CHECK-NEXT: <testcase classname="test-data.test-data" name="bad&amp;name.ini" time="{{[0-9.]+}}">
# CHECK-NEXT: <failure><![CDATA[& < > ]]]]><![CDATA[> &"]]></failure>
# CHECK-NEXT: </testcase>
# CHECK-NEXT: </testsuite>
diff --git a/mlir/include/mlir-c/Pass.h b/mlir/include/mlir-c/Pass.h
index 0d2e19e..1f63c6d 100644
--- a/mlir/include/mlir-c/Pass.h
+++ b/mlir/include/mlir-c/Pass.h
@@ -92,6 +92,18 @@ mlirPassManagerEnableVerifier(MlirPassManager passManager, bool enable);
MLIR_CAPI_EXPORTED void
mlirPassManagerEnableTiming(MlirPassManager passManager);
+/// Enumerated type of pass display modes.
+/// Mainly used in mlirPassManagerEnableStatistics.
+typedef enum {
+ MLIR_PASS_DISPLAY_MODE_LIST,
+ MLIR_PASS_DISPLAY_MODE_PIPELINE,
+} MlirPassDisplayMode;
+
+/// Enable pass statistics, reported according to the given display mode.
+MLIR_CAPI_EXPORTED void
+mlirPassManagerEnableStatistics(MlirPassManager passManager,
+ MlirPassDisplayMode displayMode);
+
/// Nest an OpPassManager under the top-level PassManager, the nested
/// passmanager will only run on operations matching the provided name.
/// The returned OpPassManager will be destroyed when the parent is destroyed.
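
The new entry point composes with the existing context and pass-manager lifecycle functions from the C API. A minimal usage sketch (assuming only this header plus `mlir-c/IR.h`; the pass pipeline itself is elided):

```c
#include "mlir-c/IR.h"
#include "mlir-c/Pass.h"

int main(void) {
  MlirContext ctx = mlirContextCreate();
  MlirPassManager pm = mlirPassManagerCreate(ctx);

  /* Report pass statistics as a flat list after the pipeline runs;
     MLIR_PASS_DISPLAY_MODE_PIPELINE would instead group them by the
     nested pipeline structure. */
  mlirPassManagerEnableStatistics(pm, MLIR_PASS_DISPLAY_MODE_LIST);

  /* ... register dialects, parse a module, add passes, and run ... */

  mlirPassManagerDestroy(pm);
  mlirContextDestroy(ctx);
  return 0;
}
```
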
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index 5754db6..4c1db58 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -471,35 +471,33 @@ def EmitC_ExpressionOp
let summary = "Expression operation";
let description = [{
The `emitc.expression` operation returns a single SSA value which is yielded by
- its single-basic-block region. The operation doesn't take any arguments.
+ its single-basic-block region. The operation takes zero or more input operands
+ that are passed as block arguments to the region.
As the operation is to be emitted as a C expression, the operations within
its body must form a single Def-Use tree, or a DAG trivially expandable to
one, i.e. a DAG where each operation with side effects is only reachable
once from the expression root.
- Example:
+ Input operands can be of both value types (`EmitCType`) and lvalue types
+ (`EmitC_LValueType`).
+ Example:
```mlir
- %r = emitc.expression : i32 {
- %0 = emitc.add %a, %b : (i32, i32) -> i32
- %1 = emitc.call_opaque "foo"(%0) : (i32) -> i32
- %2 = emitc.add %c, %d : (i32, i32) -> i32
- %3 = emitc.mul %1, %2 : (i32, i32) -> i32
- emitc.yield %3 : i32
+ %r = emitc.expression %a, %b, %c : (i32, i32, i32) -> i32 {
+ %0 = emitc.call_opaque "foo"(%a) : (i32) -> i32
+ %1 = emitc.add %b, %c : (i32, i32) -> i32
+ %2 = emitc.mul %0, %1 : (i32, i32) -> i32
+ emitc.yield %2 : i32
}
```
- May be emitted as
-
+ May be emitted as:
```c++
- int32_t v7 = foo(v1 + v2) * (v3 + v4);
+ int32_t v4 = foo(v1) * (v2 + v3);
```
- The operations allowed within expression body are EmitC operations with the
- CExpressionInterface interface.
-
- When specified, the optional `do_not_inline` indicates that the expression is
+ When specified, the optional `noinline` indicates that the expression is
to be emitted as seen above, i.e. as the rhs of an EmitC SSA value
definition. Otherwise, the expression may be emitted inline, i.e. directly
at its use.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index f8e3167..e2a0331 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -22,6 +22,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td"
include "mlir/Interfaces/InferIntRangeInterface.td"
include "mlir/Dialect/LLVMIR/LLVMTypes.td"
+include "mlir/IR/CommonAttrConstraints.td"
def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
@@ -1654,18 +1655,6 @@ def NVVM_ConvertFloatToTF32Op : NVVM_Op<"convert.float.to.tf32"> {
}];
}
-def ConvertFP6E2M3 : I32EnumAttrCase<"E2M3", 0, "e2m3">;
-def ConvertFP6E3M2 : I32EnumAttrCase<"E3M2", 1, "e3m2">;
-
-def ConvertFP6Type : I32EnumAttr<"ConvertFP6Type", "NVVM ConvertFP6Type kind",
- [ConvertFP6E2M3, ConvertFP6E3M2]> {
- let genSpecializedAttr = 0;
- let cppNamespace = "::mlir::NVVM";
-}
-def ConvertFP6TypeAttr : EnumAttr<NVVM_Dialect, ConvertFP6Type, "convert_fp6_type"> {
- let assemblyFormat = "`<` $value `>`";
-}
-
def NVVM_ConvertF32x2ToF6x2Op : NVVM_Op<"convert.f32x2.to.f6x2"> {
let summary = "Convert a pair of float inputs to f6x2";
let description = [{
@@ -1686,19 +1675,20 @@ def NVVM_ConvertF32x2ToF6x2Op : NVVM_Op<"convert.f32x2.to.f6x2"> {
let results = (outs AnyTypeOf<[I16, VectorOfLengthAndType<[2], [I8]>]>:$dst);
let arguments = (ins
- ConvertFP6TypeAttr:$type,
F32:$a,
F32:$b,
- DefaultValuedAttr<BoolAttr, "false">:$relu);
- let assemblyFormat = "$type $a `,` $b attr-dict `:` type($dst)";
+ DefaultValuedAttr<BoolAttr, "false">:$relu,
+ TypeAttr:$dstTy);
+ let assemblyFormat = "$a `,` $b attr-dict `:` type($dst) `(` $dstTy `)`";
+ let hasVerifier = 1;
let extraClassDeclaration = [{
- static llvm::Intrinsic::ID getIntrinsicID(NVVM::ConvertFP6Type,
+ static llvm::Intrinsic::ID getIntrinsicID(mlir::Type dstTy,
bool hasRelu);
}];
string llvmBuilder = [{
- auto intId = NVVM::ConvertF32x2ToF6x2Op::getIntrinsicID($type, $relu);
+ auto intId = NVVM::ConvertF32x2ToF6x2Op::getIntrinsicID($dstTy, $relu);
llvm::Value *packedI16 = createIntrinsicCall(builder, intId, {$a, $b});
if(op.getDst().getType().isInteger(16))
$dst = packedI16;
@@ -1708,19 +1698,6 @@ def NVVM_ConvertF32x2ToF6x2Op : NVVM_Op<"convert.f32x2.to.f6x2"> {
}];
}
-def ConvertFP8E4M3 : I32EnumAttrCase<"E4M3", 0, "e4m3">;
-def ConvertFP8E5M2 : I32EnumAttrCase<"E5M2", 1, "e5m2">;
-def ConvertFP8UE8M0 : I32EnumAttrCase<"UE8M0", 2, "ue8m0">;
-
-def ConvertFP8Type : I32EnumAttr<"ConvertFP8Type", "NVVM ConvertFP8Type kind",
- [ConvertFP8E4M3, ConvertFP8E5M2, ConvertFP8UE8M0]> {
- let genSpecializedAttr = 0;
- let cppNamespace = "::mlir::NVVM";
-}
-def ConvertFP8TypeAttr : EnumAttr<NVVM_Dialect, ConvertFP8Type, "convert_fp8_type"> {
- let assemblyFormat = "`<` $value `>`";
-}
-
def NVVM_ConvertF32x2ToF8x2Op : NVVM_Op<"convert.f32x2.to.f8x2"> {
let summary = "Convert a pair of float inputs to f8x2";
let description = [{
@@ -1742,23 +1719,23 @@ def NVVM_ConvertF32x2ToF8x2Op : NVVM_Op<"convert.f32x2.to.f8x2"> {
let hasVerifier = 1;
let results = (outs AnyTypeOf<[I16, VectorOfLengthAndType<[2], [I8]>]>:$dst);
let arguments = (ins
- ConvertFP8TypeAttr:$type,
F32:$a,
F32:$b,
DefaultValuedAttr<FPRoundingModeAttr, "FPRoundingMode::NONE">:$rnd,
DefaultValuedAttr<SaturationModeAttr, "SaturationMode::NONE">:$sat,
- DefaultValuedAttr<BoolAttr, "false">:$relu);
- let assemblyFormat = "$type $a `,` $b attr-dict `:` type($dst)";
+ DefaultValuedAttr<BoolAttr, "false">:$relu,
+ TypeAttr:$dstTy);
+ let assemblyFormat = "$a `,` $b attr-dict `:` type($dst) `(` $dstTy `)`";
let extraClassDeclaration = [{
- static llvm::Intrinsic::ID getIntrinsicID(NVVM::ConvertFP8Type to,
+ static llvm::Intrinsic::ID getIntrinsicID(mlir::Type dstTy,
NVVM::FPRoundingMode rnd,
NVVM::SaturationMode sat,
bool hasRelu);
}];
string llvmBuilder = [{
- auto intId = NVVM::ConvertF32x2ToF8x2Op::getIntrinsicID($type, $rnd, $sat, $relu);
+ auto intId = NVVM::ConvertF32x2ToF8x2Op::getIntrinsicID($dstTy, $rnd, $sat, $relu);
llvm::Value *packedI16 = createIntrinsicCall(builder, intId, {$a, $b});
if(op.getDst().getType().isInteger(16))
$dst = packedI16;
@@ -1790,18 +1767,18 @@ def NVVM_ConvertF16x2ToF8x2Op : NVVM_Op<"convert.f16x2.to.f8x2"> {
let hasVerifier = 1;
let results = (outs AnyTypeOf<[I16, VectorOfLengthAndType<[2], [I8]>]>:$dst);
let arguments = (ins
- ConvertFP8TypeAttr:$type,
VectorOfLengthAndType<[2], [F16]>:$a,
- DefaultValuedAttr<BoolAttr, "false">:$relu);
- let assemblyFormat = "$type $a attr-dict `:` type($a) `->` type($dst)";
+ DefaultValuedAttr<BoolAttr, "false">:$relu,
+ TypeAttr:$dstTy);
+ let assemblyFormat = "$a attr-dict `:` type($a) `->` type($dst) `(` $dstTy `)`";
let extraClassDeclaration = [{
- static llvm::Intrinsic::ID getIntrinsicID(NVVM::ConvertFP8Type to,
+ static llvm::Intrinsic::ID getIntrinsicID(mlir::Type dstTy,
bool hasRelu);
}];
string llvmBuilder = [{
- auto intId = NVVM::ConvertF16x2ToF8x2Op::getIntrinsicID($type, $relu);
+ auto intId = NVVM::ConvertF16x2ToF8x2Op::getIntrinsicID($dstTy, $relu);
llvm::Value *packedI16 = createIntrinsicCall(builder, intId, {$a});
if(op.getDst().getType().isInteger(16))
$dst = packedI16;
@@ -1833,11 +1810,11 @@ def NVVM_ConvertBF16x2ToF8x2Op : NVVM_Op<"convert.bf16x2.to.f8x2"> {
let hasVerifier = 1;
let results = (outs AnyTypeOf<[I16, VectorOfLengthAndType<[2], [I8]>]>:$dst);
let arguments = (ins
- ConvertFP8TypeAttr:$type,
VectorOfLengthAndType<[2], [BF16]>:$a,
DefaultValuedAttr<FPRoundingModeAttr, "FPRoundingMode::NONE">:$rnd,
- DefaultValuedAttr<SaturationModeAttr, "SaturationMode::NONE">:$sat);
- let assemblyFormat = "$type $a attr-dict `:` type($a) `->` type($dst)";
+ DefaultValuedAttr<SaturationModeAttr, "SaturationMode::NONE">:$sat,
+ TypeAttr:$dstTy);
+ let assemblyFormat = "$a attr-dict `:` type($a) `->` type($dst) `(` $dstTy `)`";
let extraClassDeclaration = [{
static llvm::Intrinsic::ID getIntrinsicID(NVVM::FPRoundingMode rnd,
diff --git a/mlir/lib/Bindings/Python/Globals.h b/mlir/lib/Bindings/Python/Globals.h
index 71a051c..1e81f53 100644
--- a/mlir/lib/Bindings/Python/Globals.h
+++ b/mlir/lib/Bindings/Python/Globals.h
@@ -17,6 +17,7 @@
#include "NanobindUtils.h"
#include "mlir-c/IR.h"
+#include "mlir-c/Support.h"
#include "mlir/CAPI/Support.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringExtras.h"
@@ -151,6 +152,29 @@ public:
TracebackLoc &getTracebackLoc() { return tracebackLoc; }
+ class TypeIDAllocator {
+ public:
+ TypeIDAllocator() : allocator(mlirTypeIDAllocatorCreate()) {}
+ ~TypeIDAllocator() {
+ if (allocator.ptr)
+ mlirTypeIDAllocatorDestroy(allocator);
+ }
+ TypeIDAllocator(const TypeIDAllocator &) = delete;
+ TypeIDAllocator(TypeIDAllocator &&other) : allocator(other.allocator) {
+ other.allocator.ptr = nullptr;
+ }
+
+ MlirTypeIDAllocator get() { return allocator; }
+ MlirTypeID allocate() {
+ return mlirTypeIDAllocatorAllocateTypeID(allocator);
+ }
+
+ private:
+ MlirTypeIDAllocator allocator;
+ };
+
+ MlirTypeID allocateTypeID() { return typeIDAllocator.allocate(); }
+
private:
static PyGlobals *instance;
@@ -173,6 +197,7 @@ private:
llvm::StringSet<> loadedDialectModules;
TracebackLoc tracebackLoc;
+ TypeIDAllocator typeIDAllocator;
};
} // namespace python
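
The wrapper exists so the allocator is created once and destroyed together with PyGlobals, instead of being leaked per external pass as in the old bindings code (see the Pass.cpp hunk below). A one-line usage sketch of the accessor added above:

```c++
// Draws a fresh TypeID from the allocator owned by the PyGlobals singleton;
// the allocator itself is destroyed exactly once, with PyGlobals.
MlirTypeID passID = PyGlobals::get().allocateTypeID();
```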
diff --git a/mlir/lib/Bindings/Python/Pass.cpp b/mlir/lib/Bindings/Python/Pass.cpp
index e489585..572afa9 100644
--- a/mlir/lib/Bindings/Python/Pass.cpp
+++ b/mlir/lib/Bindings/Python/Pass.cpp
@@ -8,6 +8,7 @@
#include "Pass.h"
+#include "Globals.h"
#include "IRModule.h"
#include "mlir-c/Pass.h"
// clang-format off
@@ -57,6 +58,13 @@ private:
/// Create the `mlir.passmanager` here.
void mlir::python::populatePassManagerSubmodule(nb::module_ &m) {
//----------------------------------------------------------------------------
+ // Mapping of enumerated types
+ //----------------------------------------------------------------------------
+ nb::enum_<MlirPassDisplayMode>(m, "PassDisplayMode")
+ .value("LIST", MLIR_PASS_DISPLAY_MODE_LIST)
+ .value("PIPELINE", MLIR_PASS_DISPLAY_MODE_PIPELINE);
+
+ //----------------------------------------------------------------------------
// Mapping of MlirExternalPass
//----------------------------------------------------------------------------
nb::class_<MlirExternalPass>(m, "ExternalPass")
@@ -138,6 +146,14 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) {
mlirPassManagerEnableTiming(passManager.get());
},
"Enable pass timing.")
+ .def(
+ "enable_statistics",
+ [](PyPassManager &passManager, MlirPassDisplayMode displayMode) {
+ mlirPassManagerEnableStatistics(passManager.get(), displayMode);
+ },
+ "displayMode"_a =
+ MlirPassDisplayMode::MLIR_PASS_DISPLAY_MODE_PIPELINE,
+ "Enable pass statistics.")
.def_static(
"parse",
[](const std::string &pipeline, DefaultingPyMlirContext context) {
@@ -181,9 +197,7 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) {
name = nb::cast<std::string>(
nb::borrow<nb::str>(run.attr("__name__")));
}
- MlirTypeIDAllocator typeIDAllocator = mlirTypeIDAllocatorCreate();
- MlirTypeID passID =
- mlirTypeIDAllocatorAllocateTypeID(typeIDAllocator);
+ MlirTypeID passID = PyGlobals::get().allocateTypeID();
MlirExternalPassCallbacks callbacks;
callbacks.construct = [](void *obj) {
(void)nb::handle(static_cast<PyObject *>(obj)).inc_ref();
diff --git a/mlir/lib/CAPI/IR/Pass.cpp b/mlir/lib/CAPI/IR/Pass.cpp
index b0a6ec1..72bec11 100644
--- a/mlir/lib/CAPI/IR/Pass.cpp
+++ b/mlir/lib/CAPI/IR/Pass.cpp
@@ -13,6 +13,7 @@
#include "mlir/CAPI/Support.h"
#include "mlir/CAPI/Utils.h"
#include "mlir/Pass/PassManager.h"
+#include "llvm/Support/ErrorHandling.h"
#include <optional>
using namespace mlir;
@@ -79,6 +80,20 @@ void mlirPassManagerEnableTiming(MlirPassManager passManager) {
unwrap(passManager)->enableTiming();
}
+void mlirPassManagerEnableStatistics(MlirPassManager passManager,
+                                     MlirPassDisplayMode displayMode) {
+  switch (displayMode) {
+  case MLIR_PASS_DISPLAY_MODE_LIST:
+    unwrap(passManager)->enableStatistics(PassDisplayMode::List);
+    return;
+  case MLIR_PASS_DISPLAY_MODE_PIPELINE:
+    unwrap(passManager)->enableStatistics(PassDisplayMode::Pipeline);
+    return;
+  }
+  llvm_unreachable("unhandled MlirPassDisplayMode");
+}
+
MlirOpPassManager mlirPassManagerGetNestedUnder(MlirPassManager passManager,
MlirStringRef operationName) {
return wrap(&unwrap(passManager)->nest(unwrap(operationName)));
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index e8f8824..7f419a0 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -216,6 +216,18 @@ LogicalResult ConvertFloatToTF32Op::verify() {
return success();
}
+LogicalResult ConvertF32x2ToF6x2Op::verify() {
+ mlir::MLIRContext *ctx = getContext();
+
+ if (!llvm::isa<mlir::Float6E2M3FNType, mlir::Float6E3M2FNType>(getDstTy())) {
+ return emitOpError("Only ")
+ << mlir::Float6E2M3FNType::get(ctx) << " and "
+ << mlir::Float6E3M2FNType::get(ctx)
+ << " types are supported for conversions from f32x2 to f6x2.";
+ }
+ return success();
+}
+
LogicalResult ConvertF32x2ToF8x2Op::verify() {
using RndMode = NVVM::FPRoundingMode;
using SatMode = NVVM::SaturationMode;
@@ -227,41 +239,67 @@ LogicalResult ConvertF32x2ToF8x2Op::verify() {
bool hasRelu = getRelu();
- switch (getType()) {
- case ConvertFP8Type::E4M3:
- case ConvertFP8Type::E5M2:
- if (!isRoundingModeRN)
- return emitOpError("Only RN rounding mode is supported for conversions "
- "from f32x2 to .e4m3x2 or .e5m2x2 types");
- if (!isSatFinite)
- return emitOpError("Only SATFINITE saturation mode is supported for "
- "conversions from f32x2 to .e4m3x2 or .e5m2x2 types");
- break;
- case ConvertFP8Type::UE8M0:
- if (!(isRoundingModeRZ || isRoundingModeRP))
- return emitOpError("Only RZ or RP rounding modes are supported for "
- "conversions from f32x2 to .ue8m0x2 type");
- if (hasRelu)
- return emitOpError("relu not supported for conversions to .ue8m0x2 type");
- break;
- }
- return success();
+ mlir::MLIRContext *ctx = getContext();
+
+ return llvm::TypeSwitch<mlir::Type, LogicalResult>(getDstTy())
+ .Case<mlir::Float8E4M3FNType, mlir::Float8E5M2Type>(
+ [&](mlir::Type) -> LogicalResult {
+ if (!isRoundingModeRN) {
+ return emitOpError("Only RN rounding mode is supported for "
+ "conversions from f32x2 to ")
+ << mlir::Float8E4M3FNType::get(ctx) << " and "
+ << mlir::Float8E5M2Type::get(ctx) << " types";
+ }
+ if (!isSatFinite) {
+ return emitOpError("Only SATFINITE saturation mode is supported "
+                               "for conversions from f32x2 to ")
+ << mlir::Float8E4M3FNType::get(ctx) << " and "
+ << mlir::Float8E5M2Type::get(ctx) << " types";
+ }
+ return success();
+ })
+ .Case<mlir::Float8E8M0FNUType>([&](mlir::Type) -> LogicalResult {
+ if (!(isRoundingModeRZ || isRoundingModeRP)) {
+ return emitOpError("Only RZ and RP rounding modes are supported for "
+ "conversions from f32x2 to ")
+ << mlir::Float8E8M0FNUType::get(ctx) << " type";
+ }
+ if (hasRelu) {
+ return emitOpError("relu not supported for conversions to ")
+ << mlir::Float8E8M0FNUType::get(ctx) << " type";
+ }
+ return success();
+ })
+ .Default([&](mlir::Type) {
+ return emitOpError("Only ")
+ << mlir::Float8E4M3FNType::get(ctx) << ", "
+ << mlir::Float8E5M2Type::get(ctx) << ", and "
+ << mlir::Float8E8M0FNUType::get(ctx)
+               << " types are supported for conversions from f32x2 to f8x2";
+ });
}
LogicalResult ConvertF16x2ToF8x2Op::verify() {
- if (getType() == ConvertFP8Type::UE8M0)
- return emitOpError("Only .e4m3 or .e5m2 types are supported for "
- "conversions from f16x2 to f8x2.");
+ mlir::MLIRContext *ctx = getContext();
+ if (!llvm::isa<mlir::Float8E4M3FNType, mlir::Float8E5M2Type>(getDstTy())) {
+ return emitOpError("Only ")
+ << mlir::Float8E4M3FNType::get(ctx) << " and "
+ << mlir::Float8E5M2Type::get(ctx)
+ << " types are supported for conversions from f16x2 to f8x2.";
+ }
return success();
}
LogicalResult ConvertBF16x2ToF8x2Op::verify() {
using RndMode = NVVM::FPRoundingMode;
- if (getType() != ConvertFP8Type::UE8M0)
- return emitOpError(
- "Only .ue8m0 type is supported for conversions from bf16x2 to f8x2.");
+ if (!llvm::isa<mlir::Float8E8M0FNUType>(getDstTy()))
+ return emitOpError("Only ") << mlir::Float8E8M0FNUType::get(getContext())
+ << " type is supported for conversions from "
+ "bf16x2 to f8x2.";
auto rnd = getRnd();
if (!(rnd == RndMode::RZ || rnd == RndMode::RP))
@@ -1980,15 +2018,19 @@ ConvertFloatToTF32Op::getIntrinsicID(NVVM::FPRoundingMode rnd,
has_relu ? llvm::Intrinsic::nvvm_ff_to_##type##_rn_relu_satfinite \
: llvm::Intrinsic::nvvm_ff_to_##type##_rn_satfinite
-llvm::Intrinsic::ID
-ConvertF32x2ToF6x2Op::getIntrinsicID(NVVM::ConvertFP6Type type, bool hasRelu) {
- switch (type) {
- case NVVM::ConvertFP6Type::E2M3:
- return GET_F32x2_TO_F6x2_ID(e2m3x2, hasRelu);
- case NVVM::ConvertFP6Type::E3M2:
- return GET_F32x2_TO_F6x2_ID(e3m2x2, hasRelu);
- }
- llvm_unreachable("Invalid conversion in ConvertF32x2ToF6x2Op");
+llvm::Intrinsic::ID ConvertF32x2ToF6x2Op::getIntrinsicID(mlir::Type dstTy,
+ bool hasRelu) {
+ return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy)
+ .Case<mlir::Float6E2M3FNType>([&](mlir::Float6E2M3FNType) {
+ return GET_F32x2_TO_F6x2_ID(e2m3x2, hasRelu);
+ })
+ .Case<mlir::Float6E3M2FNType>([&](mlir::Float6E3M2FNType) {
+ return GET_F32x2_TO_F6x2_ID(e3m2x2, hasRelu);
+ })
+ .Default([](mlir::Type) {
+ llvm_unreachable("Invalid conversion in ConvertF32x2ToF6x2Op");
+ return llvm::Intrinsic::not_intrinsic;
+ });
}
#define GET_F32x2_TO_F8X2_US_ID(rnd, has_satf) \
@@ -2000,41 +2042,50 @@ ConvertF32x2ToF6x2Op::getIntrinsicID(NVVM::ConvertFP6Type type, bool hasRelu) {
: llvm::Intrinsic::nvvm_ff_to_##type##_rn
llvm::Intrinsic::ID
-ConvertF32x2ToF8x2Op::getIntrinsicID(NVVM::ConvertFP8Type type,
- NVVM::FPRoundingMode rnd,
+ConvertF32x2ToF8x2Op::getIntrinsicID(mlir::Type dstTy, NVVM::FPRoundingMode rnd,
NVVM::SaturationMode sat, bool hasRelu) {
bool hasSatFinite = (sat == NVVM::SaturationMode::SATFINITE);
bool hasRoundingModeRZ = (rnd == NVVM::FPRoundingMode::RZ);
bool hasRoundingModeRP = (rnd == NVVM::FPRoundingMode::RP);
- switch (type) {
- case NVVM::ConvertFP8Type::E4M3:
- return GET_F32x2_TO_F8X2_S_ID(e4m3x2, hasRelu);
- case NVVM::ConvertFP8Type::E5M2:
- return GET_F32x2_TO_F8X2_S_ID(e5m2x2, hasRelu);
- case NVVM::ConvertFP8Type::UE8M0:
- if (hasRoundingModeRZ)
- return GET_F32x2_TO_F8X2_US_ID(rz, hasSatFinite);
- else if (hasRoundingModeRP)
- return GET_F32x2_TO_F8X2_US_ID(rp, hasSatFinite);
- }
- llvm_unreachable("Invalid conversion in CvtFloatToF8x2Op");
+ return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy)
+ .Case<mlir::Float8E4M3FNType>([&](mlir::Float8E4M3FNType) {
+ return GET_F32x2_TO_F8X2_S_ID(e4m3x2, hasRelu);
+ })
+ .Case<mlir::Float8E5M2Type>([&](mlir::Float8E5M2Type) {
+ return GET_F32x2_TO_F8X2_S_ID(e5m2x2, hasRelu);
+ })
+ .Case<mlir::Float8E8M0FNUType>([&](mlir::Float8E8M0FNUType) {
+ if (hasRoundingModeRZ)
+ return GET_F32x2_TO_F8X2_US_ID(rz, hasSatFinite);
+ else if (hasRoundingModeRP)
+ return GET_F32x2_TO_F8X2_US_ID(rp, hasSatFinite);
+
+ llvm_unreachable("Invalid conversion in ConvertF32x2ToF8x2Op");
+ })
+ .Default([](mlir::Type) {
+ llvm_unreachable("Invalid conversion in ConvertF32x2ToF8x2Op");
+ return llvm::Intrinsic::not_intrinsic;
+ });
}
#define GET_F16x2_TO_F8X2_ID(type, has_relu) \
has_relu ? llvm::Intrinsic::nvvm_f16x2_to_##type##_rn_relu \
: llvm::Intrinsic::nvvm_f16x2_to_##type##_rn
-llvm::Intrinsic::ID
-ConvertF16x2ToF8x2Op::getIntrinsicID(NVVM::ConvertFP8Type type, bool hasRelu) {
- switch (type) {
- case NVVM::ConvertFP8Type::E4M3:
- return GET_F16x2_TO_F8X2_ID(e4m3x2, hasRelu);
- case NVVM::ConvertFP8Type::E5M2:
- return GET_F16x2_TO_F8X2_ID(e5m2x2, hasRelu);
- default:
- llvm_unreachable("Invalid ConvertFP8Type for CvtF16x2ToF8x2Op");
- }
+llvm::Intrinsic::ID ConvertF16x2ToF8x2Op::getIntrinsicID(mlir::Type dstTy,
+ bool hasRelu) {
+ return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy)
+ .Case<mlir::Float8E4M3FNType>([&](mlir::Float8E4M3FNType) {
+ return GET_F16x2_TO_F8X2_ID(e4m3x2, hasRelu);
+ })
+ .Case<mlir::Float8E5M2Type>([&](mlir::Float8E5M2Type) {
+ return GET_F16x2_TO_F8X2_ID(e5m2x2, hasRelu);
+ })
+ .Default([](mlir::Type) {
+ llvm_unreachable("Invalid conversion in ConvertF16x2ToF8x2Op");
+ return llvm::Intrinsic::not_intrinsic;
+ });
}
#define GET_BF16X2_TO_F8X2_ID(rnd, has_satf) \
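
The verifier and getIntrinsicID rewrites in this file all lean on the same llvm::TypeSwitch idiom; here is a self-contained sketch of that idiom, with placeholder result strings rather than real NVVM intrinsic names:

```c++
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TypeSwitch.h"
#include "mlir/IR/BuiltinTypes.h"

// Sketch only: dispatch on the concrete MLIR float type, with Default
// catching anything unsupported. The returned names are placeholders.
static llvm::StringRef pickVariant(mlir::Type dstTy) {
  return llvm::TypeSwitch<mlir::Type, llvm::StringRef>(dstTy)
      .Case<mlir::Float8E4M3FNType>(
          [](mlir::Float8E4M3FNType) { return "e4m3x2-variant"; })
      .Case<mlir::Float8E5M2Type>(
          [](mlir::Float8E5M2Type) { return "e5m2x2-variant"; })
      .Default([](mlir::Type) { return "unsupported"; });
}
```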
diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir
index 04163b5..9928992 100644
--- a/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/convert_fp6x2.mlir
@@ -3,9 +3,9 @@
// CHECK-LABEL: @convert_f32x2_to_fp6x2_packed
llvm.func @convert_f32x2_to_fp6x2_packed(%srcA : f32, %srcB : f32) {
//CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float %{{.*}}, float %{{.*}})
- %res1 = nvvm.convert.f32x2.to.f6x2 <e2m3> %srcA, %srcB : i16
+ %res1 = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : i16 (f6E2M3FN)
//CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float %{{.*}}, float %{{.*}})
- %res2 = nvvm.convert.f32x2.to.f6x2 <e3m2> %srcA, %srcB : i16
+ %res2 = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : i16 (f6E3M2FN)
llvm.return
}
@@ -13,9 +13,9 @@ llvm.func @convert_f32x2_to_fp6x2_packed(%srcA : f32, %srcB : f32) {
llvm.func @convert_f32x2_to_fp6x2_vector(%srcA : f32, %srcB : f32) {
//CHECK: %[[res0:.*]] = call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float %{{.*}}, float %{{.*}})
//CHECK-NEXT: %{{.*}} = bitcast i16 %[[res0]] to <2 x i8>
- %res1 = nvvm.convert.f32x2.to.f6x2 <e2m3> %srcA, %srcB : vector<2xi8>
+ %res1 = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : vector<2xi8> (f6E2M3FN)
//CHECK: %[[res1:.*]] = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float %{{.*}}, float %{{.*}})
//CHECK-NEXT: %{{.*}} = bitcast i16 %[[res1]] to <2 x i8>
- %res2 = nvvm.convert.f32x2.to.f6x2 <e3m2> %srcA, %srcB : vector<2xi8>
+ %res2 = nvvm.convert.f32x2.to.f6x2 %srcA, %srcB : vector<2xi8> (f6E3M2FN)
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir
index 4a15efb..de21826 100644
--- a/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/convert_fp8x2.mlir
@@ -5,31 +5,31 @@
// CHECK-LABEL: @convert_f32x2_to_f8x2_e4m3
llvm.func @convert_f32x2_to_f8x2_e4m3(%srcA : f32, %srcB : f32) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float %{{.*}}, float %{{.*}})
- %res1 = nvvm.convert.f32x2.to.f8x2 <e4m3> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res1 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float %{{.*}}, float %{{.*}})
- %res2 = nvvm.convert.f32x2.to.f8x2 <e4m3> %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res2 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN)
llvm.return
}
// CHECK-LABEL: @convert_f32x2_to_f8x2_e5m2
llvm.func @convert_f32x2_to_f8x2_e5m2(%srcA : f32, %srcB : f32) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float %{{.*}}, float %{{.*}})
- %res1 = nvvm.convert.f32x2.to.f8x2 <e5m2> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res1 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E5M2)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float %{{.*}}, float %{{.*}})
- %res2 = nvvm.convert.f32x2.to.f8x2 <e5m2> %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res2 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E5M2)
llvm.return
}
// CHECK-LABEL: @convert_f32x2_to_f8x2_ue8m0
llvm.func @convert_f32x2_to_f8x2_ue8m0(%srcA : f32, %srcB : f32) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float %{{.*}}, float %{{.*}})
- %res1 = nvvm.convert.f32x2.to.f8x2 <ue8m0> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>} : i16
+ %res1 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>} : i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float %{{.*}}, float %{{.*}})
- %res2 = nvvm.convert.f32x2.to.f8x2 <ue8m0> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rp>} : i16
+ %res2 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rp>} : i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float %{{.*}}, float %{{.*}})
- %res3 = nvvm.convert.f32x2.to.f8x2 <ue8m0> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res3 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float %{{.*}}, float %{{.*}})
- %res4 = nvvm.convert.f32x2.to.f8x2 <ue8m0> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : i16
+ %res4 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E8M0FNU)
llvm.return
}
@@ -37,10 +37,10 @@ llvm.func @convert_f32x2_to_f8x2_ue8m0(%srcA : f32, %srcB : f32) {
llvm.func @convert_f32x2_to_f8x2_vector_return(%srcA : f32, %srcB : f32) {
// CHECK: %[[res1:.*]] = call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float %{{.*}}, float %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res1]] to <2 x i8>
- %res1 = nvvm.convert.f32x2.to.f8x2 <e4m3> %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : vector<2xi8>
+ %res1 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : vector<2xi8> (f8E4M3FN)
// CHECK: %[[res2:.*]] = call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float %{{.*}}, float %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res2]] to <2 x i8>
- %res2 = nvvm.convert.f32x2.to.f8x2 <e4m3> %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : vector<2xi8>
+ %res2 = nvvm.convert.f32x2.to.f8x2 %srcA, %srcB {relu = true, rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<satfinite>} : vector<2xi8> (f8E4M3FN)
llvm.return
}
@@ -49,18 +49,18 @@ llvm.func @convert_f32x2_to_f8x2_vector_return(%srcA : f32, %srcB : f32) {
// CHECK-LABEL: @convert_f16x2_to_f8x2_e4m3
llvm.func @convert_f16x2_to_f8x2_e4m3(%src : vector<2xf16>) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> %{{.*}})
- %res1 = nvvm.convert.f16x2.to.f8x2 <e4m3> %src : vector<2xf16> -> i16
+ %res1 = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> i16 (f8E4M3FN)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> %{{.*}})
- %res2 = nvvm.convert.f16x2.to.f8x2 <e4m3> %src {relu = true} : vector<2xf16> -> i16
+ %res2 = nvvm.convert.f16x2.to.f8x2 %src {relu = true} : vector<2xf16> -> i16 (f8E4M3FN)
llvm.return
}
// CHECK-LABEL: @convert_f16x2_to_f8x2_e5m2
llvm.func @convert_f16x2_to_f8x2_e5m2(%src : vector<2xf16>) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> %{{.*}})
- %res1 = nvvm.convert.f16x2.to.f8x2 <e5m2> %src : vector<2xf16> -> i16
+ %res1 = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> i16 (f8E5M2)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> %{{.*}})
- %res2 = nvvm.convert.f16x2.to.f8x2 <e5m2> %src {relu = true} : vector<2xf16> -> i16
+ %res2 = nvvm.convert.f16x2.to.f8x2 %src {relu = true} : vector<2xf16> -> i16 (f8E5M2)
llvm.return
}
@@ -68,10 +68,10 @@ llvm.func @convert_f16x2_to_f8x2_e5m2(%src : vector<2xf16>) {
llvm.func @convert_f16x2_to_f8x2_vector_return(%src : vector<2xf16>) {
// CHECK: %[[res1:.*]] = call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res1]] to <2 x i8>
- %res1 = nvvm.convert.f16x2.to.f8x2 <e4m3> %src : vector<2xf16> -> vector<2xi8>
+ %res1 = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> vector<2xi8> (f8E4M3FN)
// CHECK: %[[res2:.*]] = call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res2]] to <2 x i8>
- %res2 = nvvm.convert.f16x2.to.f8x2 <e5m2> %src : vector<2xf16> -> vector<2xi8>
+ %res2 = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> vector<2xi8> (f8E5M2)
llvm.return
}
@@ -80,13 +80,13 @@ llvm.func @convert_f16x2_to_f8x2_vector_return(%src : vector<2xf16>) {
// CHECK-LABEL: @convert_bf16x2_to_f8x2_ue8m0
llvm.func @convert_bf16x2_to_f8x2_ue8m0(%src : vector<2xbf16>) {
// CHECK: %{{.*}} = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> %{{.*}})
- %res1 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> i16
+ %res1 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> %{{.*}})
- %res2 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rp>} : vector<2xbf16> -> i16
+ %res2 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rp>} : vector<2xbf16> -> i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> %{{.*}})
- %res3 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> i16
+ %res3 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> i16 (f8E8M0FNU)
// CHECK: %{{.*}} = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> %{{.*}})
- %res4 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> i16
+ %res4 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> i16 (f8E8M0FNU)
llvm.return
}
@@ -94,9 +94,9 @@ llvm.func @convert_bf16x2_to_f8x2_ue8m0(%src : vector<2xbf16>) {
llvm.func @convert_bf16x2_to_f8x2_vector_return(%src : vector<2xbf16>) {
// CHECK: %[[res1:.*]] = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res1]] to <2 x i8>
- %res1 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> vector<2xi8>
+ %res1 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> vector<2xi8> (f8E8M0FNU)
// CHECK: %[[res2:.*]] = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> %{{.*}})
// CHECK-NEXT: %{{.*}} = bitcast i16 %[[res2]] to <2 x i8>
- %res2 = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> vector<2xi8>
+ %res2 = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> -> vector<2xi8> (f8E8M0FNU)
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 383f482..0b36154 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -175,64 +175,64 @@ llvm.func @nvvm_match_sync_any(%val32: i32, %thread_mask: i32) {
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_rounding_e4m3(%a : f32, %b : f32) {
- // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to .e4m3x2 or .e5m2x2 types}}
- %res = nvvm.convert.f32x2.to.f8x2 <e4m3> %a, %b {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : i16
+ // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_rounding_e5m2(%a : f32, %b : f32) {
- // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to .e4m3x2 or .e5m2x2 types}}
- %res = nvvm.convert.f32x2.to.f8x2 <e5m2> %a, %b {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : i16
+ // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E5M2)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_rounding_ue8m0(%a : f32, %b : f32) {
- // expected-error @below {{Only RZ or RP rounding modes are supported for conversions from f32x2 to .ue8m0x2 type}}
- %res = nvvm.convert.f32x2.to.f8x2 <ue8m0> %a, %b {rnd = #nvvm.fp_rnd_mode<rn>} : i16
+ // expected-error @below {{Only RZ and RP rounding modes are supported for conversions from f32x2 to 'f8E8M0FNU' type}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rn>} : i16 (f8E8M0FNU)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_saturation_e4m3(%a : f32, %b : f32) {
- // expected-error @below {{Only SATFINITE saturation mode is supported for conversions from f32x2 to .e4m3x2 or .e5m2x2 types}}
- %res = nvvm.convert.f32x2.to.f8x2 <e4m3> %a, %b {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<none>} : i16
+ // expected-error @below {{Only SATFINITE saturation mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<none>} : i16 (f8E4M3FN)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_invalid_saturation_e5m2(%a : f32, %b : f32) {
- // expected-error @below {{Only SATFINITE saturation mode is supported for conversions from f32x2 to .e4m3x2 or .e5m2x2 types}}
- %res = nvvm.convert.f32x2.to.f8x2 <e5m2> %a, %b {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<none>} : i16
+ // expected-error @below {{Only SATFINITE saturation mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<none>} : i16 (f8E5M2)
llvm.return
}
// -----
llvm.func @nvvm_cvt_float_to_f8x2_relu_not_supported_ue8m0(%a : f32, %b : f32) {
- // expected-error @below {{relu not supported for conversions to .ue8m0x2 type}}
- %res = nvvm.convert.f32x2.to.f8x2 <ue8m0> %a, %b {rnd = #nvvm.fp_rnd_mode<rp>, relu = true} : i16
+ // expected-error @below {{relu not supported for conversions to 'f8E8M0FNU' type}}
+ %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rp>, relu = true} : i16 (f8E8M0FNU)
llvm.return
}
// -----
llvm.func @nvvm_cvt_f16x2_to_f8x2_invalid_type(%src : vector<2xf16>) {
- // expected-error @below {{Only .e4m3 or .e5m2 types are supported for conversions from f16x2 to f8x2.}}
- %res = nvvm.convert.f16x2.to.f8x2 <ue8m0> %src : vector<2xf16> -> i16
+ // expected-error @below {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f16x2 to f8x2.}}
+ %res = nvvm.convert.f16x2.to.f8x2 %src : vector<2xf16> -> i16 (f8E8M0FNU)
llvm.return
}
// -----
llvm.func @nvvm_cvt_bf16x2_to_f8x2_invalid_type(%src : vector<2xbf16>) {
- // expected-error @below {{Only .ue8m0 type is supported for conversions from bf16x2 to f8x2.}}
- %res = nvvm.convert.bf16x2.to.f8x2 <e4m3> %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> i16
+ // expected-error @below {{Only 'f8E8M0FNU' type is supported for conversions from bf16x2 to f8x2.}}
+ %res = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> -> i16 (f8E4M3FN)
llvm.return
}
@@ -240,7 +240,15 @@ llvm.func @nvvm_cvt_bf16x2_to_f8x2_invalid_type(%src : vector<2xbf16>) {
llvm.func @nvvm_cvt_bf16x2_to_f8x2_invalid_rounding(%src : vector<2xbf16>) {
// expected-error @below {{Only RZ and RP rounding modes are supported for conversions from bf16x2 to f8x2.}}
- %res = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xbf16> -> i16
+ %res = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xbf16> -> i16 (f8E8M0FNU)
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_cvt_f32x2_to_f6x2_invalid_type(%a : f32, %b : f32) {
+ // expected-error @below {{Only 'f6E2M3FN' and 'f6E3M2FN' types are supported for conversions from f32x2 to f6x2.}}
+ %res = nvvm.convert.f32x2.to.f6x2 %a, %b : i16 (f8E8M0FNU)
llvm.return
}
diff --git a/mlir/test/python/pass_manager.py b/mlir/test/python/pass_manager.py
index 5f92f5b..8e6208e 100644
--- a/mlir/test/python/pass_manager.py
+++ b/mlir/test/python/pass_manager.py
@@ -435,3 +435,23 @@ def testPrintIrTree():
print_file_tree(temp_dir)
log("// Tree printing end")
+
+
+# CHECK-LABEL: TEST: testEnableStatistics
+@run
+def testEnableStatistics():
+ with Context() as ctx:
+ module = ModuleOp.parse(
+ """
+ module {
+ func.func @main() {
+ %0 = arith.constant 10
+ return
+ }
+ }
+ """
+ )
+ pm = PassManager.parse("builtin.module(canonicalize)")
+ pm.enable_statistics()
+ # CHECK: Pass statistics report
+ pm.run(module)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 8d2f975..a7723b8 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2696,22 +2696,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Size / PatternSize);
}
- /// Initialize the async info for interoperability purposes.
+ /// Initialize the async info
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
// TODO: Implement this function.
return Plugin::success();
}
- /// Initialize the device info for interoperability purposes.
- Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
- DeviceInfo->Context = nullptr;
-
- if (!DeviceInfo->Device)
- DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
-
- return Plugin::success();
- }
-
interop_spec_t selectInteropPreference(int32_t InteropType,
int32_t NumPrefers,
interop_spec_t *Prefers) override {
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 5620437..8c530bb 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -951,14 +951,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
- /// Initialize a __tgt_async_info structure. Related to interop features.
+ /// Initialize a __tgt_async_info structure.
Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
- /// Initialize a __tgt_device_info structure. Related to interop features.
- Error initDeviceInfo(__tgt_device_info *DeviceInfo);
- virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
-
/// Enqueue a host call to AsyncInfo
Error enqueueHostCall(void (*Callback)(void *), void *UserData,
__tgt_async_info *AsyncInfo);
@@ -1490,10 +1486,6 @@ public:
/// Creates an asynchronous queue for the given plugin.
int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr);
- /// Creates device information to be used for diagnostics.
- int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo,
- const char **ErrStr);
-
/// Sets the offset into the devices for use by OMPT.
int32_t set_device_identifier(int32_t UserId, int32_t DeviceId);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 15b6b98..db43cbe 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1524,12 +1524,6 @@ Error GenericDeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData,
return Err;
}
-Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
- assert(DeviceInfo && "Invalid device info");
-
- return initDeviceInfoImpl(DeviceInfo);
-}
-
Error GenericDeviceTy::printInfo() {
auto Info = obtainInfoImpl();
@@ -2128,21 +2122,6 @@ int32_t GenericPluginTy::init_async_info(int32_t DeviceId,
return OFFLOAD_SUCCESS;
}
-int32_t GenericPluginTy::init_device_info(int32_t DeviceId,
- __tgt_device_info *DeviceInfo,
- const char **ErrStr) {
- *ErrStr = "";
-
- auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo);
- if (Err) {
- REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
- DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
- return OFFLOAD_FAIL;
- }
-
- return OFFLOAD_SUCCESS;
-}
-
int32_t GenericPluginTy::set_device_identifier(int32_t UserId,
int32_t DeviceId) {
UserDeviceIds[DeviceId] = UserId;
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index e5c4a1b..db94f7f 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -900,23 +900,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}
- /// Initialize the device info for interoperability purposes.
- Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
- assert(Context && "Context is null");
- assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");
-
- if (auto Err = setContext())
- return Err;
-
- if (!DeviceInfo->Context)
- DeviceInfo->Context = Context;
-
- if (!DeviceInfo->Device)
- DeviceInfo->Device = reinterpret_cast<void *>(Device);
-
- return Plugin::success();
- }
-
interop_spec_t selectInteropPreference(int32_t InteropType,
int32_t NumPrefers,
interop_spec_t *Prefers) override {
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 0845032..eb4ecac 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -344,12 +344,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
"initAsyncInfoImpl not supported");
}
- /// This plugin does not support interoperability
- Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
- return Plugin::error(ErrorCode::UNSUPPORTED,
- "initDeviceInfoImpl not supported");
- }
-
Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
AsyncInfoWrapperTy &AsyncInfo) override {
Callback(UserData);
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index e6a6efd..ea269da 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2305,6 +2305,7 @@ libc_support_library(
":__support_fputil_cast",
":__support_fputil_fenv_impl",
":__support_fputil_fp_bits",
+ ":__support_fputil_manipulation_functions",
":__support_fputil_sqrt",
":__support_macros_optimization",
":hdr_errno_macros",