237 files changed, 10425 insertions, 3653 deletions
diff --git a/.github/workflows/containers/github-action-ci-windows/Dockerfile b/.github/workflows/containers/github-action-ci-windows/Dockerfile
index 9ddf501..f1e4f15 100644
--- a/.github/workflows/containers/github-action-ci-windows/Dockerfile
+++ b/.github/workflows/containers/github-action-ci-windows/Dockerfile
@@ -98,3 +98,45 @@ RUN powershell -Command \
     Add-Type -AssemblyName System.IO.Compression.FileSystem ; \
     [System.IO.Compression.ZipFile]::ExtractToDirectory('actions-runner-win.zip', $PWD) ;\
     rm actions-runner-win.zip
+
+# Set the LLVM_VERSION environment variable
+ENV LLVM_VERSION=21.1.2
+
+# Download and extract Clang compiler.
+# Create directories, download, extract, and clean up all in one layer
+RUN powershell -Command \
+    # --- Setup directories --- \
+    Write-Host "Creating directories..."; \
+    New-Item -Path "C:\temp-download" -ItemType "Directory" -Force ; \
+    New-Item -Path "C:\xz-utils" -ItemType "Directory" -Force ; \
+    New-Item -Path "C:\clang" -ItemType "Directory" -Force ; \
+    # --- 1. Download and extract xz --- \
+    Set-Location C:\temp-download ; \
+    Invoke-WebRequest -Uri "http://github.com/tukaani-project/xz/releases/download/v5.8.1/xz-5.8.1-windows.zip" -OutFile "xz.zip"; \
+    (Get-FileHash -Path "C:\temp-download\xz.zip" -Algorithm MD5).Hash -eq 'c3c69fdce3e825cc0b76123b36b0bcc2' ; \
+    Add-Type -AssemblyName "System.IO.Compression.FileSystem"; \
+    [System.IO.Compression.ZipFile]::ExtractToDirectory('C:\temp-download\xz.zip', 'C:\xz-utils'); \ 
+    # --- 2. Download and decompress Clang --- \
+    Invoke-WebRequest -Uri "http://github.com/llvm/llvm-project/releases/download/llvmorg-21.1.2/clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar.xz" -OutFile "clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar.xz" ; \
+    (Get-FileHash -Path "C:\temp-download\clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar.xz" -Algorithm MD5).Hash -eq '0ae1d3effd9ab9d323f7fa595777f0a2' ; \
+    C:\xz-utils\bin_x86-64\xz.exe -d -qq clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar.xz ; \
+    # --- 3. Extract clang --- \
+    C:\Windows\System32\tar.exe -xf clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar -C C:\clang ; \
+    # --- 4. Clean up --- \
+    Set-Location C:\ ; \
+    Remove-Item C:\temp-download -Recurse -Force; \
+    Remove-Item C:\xz-utils -Recurse -Force; \
+    # -- 5. Shorten path to clang files & remove unnecessary files -- \
+    Set-Location C:\clang ; \
+    Rename-Item -Path "C:\clang\clang+llvm-21.1.2-x86_64-pc-windows-msvc" -NewName "C:\clang\clang-msvc" ; \
+    Set-Location C:\clang\clang-msvc ; \
+    Remove-Item -Path C:\clang\clang-msvc\libexec -Recurse -Force ; \
+    Remove-Item -Path C:\clang\clang-msvc\share -Recurse -Force ; \
+    Rename-Item -Path "C:\clang\clang-msvc\bin" -NewName "C:\clang\clang-msvc\bin-full" ; \
+    New-Item -Path "C:\clang\clang-msvc\bin" -ItemType Directory -Force ; \
+    Set-Location C:\clang\clang-msvc\bin ; \
+    Copy-Item -Path C:\clang\clang-msvc\bin-full\*.dll -Destination C:\clang\clang-msvc\bin\. ; \
+    Copy-Item -Path C:\clang\clang-msvc\bin-full\clang-cl.exe -Destination C:\clang\clang-msvc\bin\. ; \
+    Copy-Item -Path C:\clang\clang-msvc\bin-full\lld-link.exe -Destination C:\clang\clang-msvc\bin\. ; \
+    Set-Location C:\clang\clang-msvc ; \
+    Remove-Item -Path C:\clang\clang-msvc\bin-full -Recurse -Force ;
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 32c8f62..687cd46 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1648,6 +1648,9 @@ ASTContext::findPointerAuthContent(QualType T) const {
   if (!RD)
     return PointerAuthContent::None;
 
+  if (RD->isInvalidDecl())
+    return PointerAuthContent::None;
+
   if (auto Existing = RecordContainsAddressDiscriminatedPointerAuth.find(RD);
       Existing != RecordContainsAddressDiscriminatedPointerAuth.end())
     return Existing->second;
@@ -3517,7 +3520,6 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx,
 uint16_t ASTContext::getPointerAuthTypeDiscriminator(QualType T) {
   assert(!T->isDependentType() &&
          "cannot compute type discriminator of a dependent type");
-
   SmallString<256> Str;
   llvm::raw_svector_ostream Out(Str);
 
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 836d22f..f4ddbf4 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -3273,34 +3273,43 @@ bool Compiler<Emitter>::VisitCXXConstructExpr(const CXXConstructExpr *E) {
   }
 
   if (T->isArrayType()) {
-    const ConstantArrayType *CAT =
-        Ctx.getASTContext().getAsConstantArrayType(E->getType());
-    if (!CAT)
-      return false;
-
-    size_t NumElems = CAT->getZExtSize();
     const Function *Func = getFunction(E->getConstructor());
     if (!Func)
       return false;
 
-    // FIXME(perf): We're calling the constructor once per array element here,
-    //   in the old intepreter we had a special-case for trivial constructors.
-    for (size_t I = 0; I != NumElems; ++I) {
-      if (!this->emitConstUint64(I, E))
-        return false;
-      if (!this->emitArrayElemPtrUint64(E))
-        return false;
+    if (!this->emitDupPtr(E))
+      return false;
 
-      // Constructor arguments.
-      for (const auto *Arg : E->arguments()) {
-        if (!this->visit(Arg))
-          return false;
+    std::function<bool(QualType)> initArrayDimension;
+    initArrayDimension = [&](QualType T) -> bool {
+      if (!T->isArrayType()) {
+        // Constructor arguments.
+        for (const auto *Arg : E->arguments()) {
+          if (!this->visit(Arg))
+            return false;
+        }
+
+        return this->emitCall(Func, 0, E);
       }
 
-      if (!this->emitCall(Func, 0, E))
+      const ConstantArrayType *CAT =
+          Ctx.getASTContext().getAsConstantArrayType(T);
+      if (!CAT)
         return false;
-    }
-    return true;
+      QualType ElemTy = CAT->getElementType();
+      unsigned NumElems = CAT->getZExtSize();
+      for (size_t I = 0; I != NumElems; ++I) {
+        if (!this->emitConstUint64(I, E))
+          return false;
+        if (!this->emitArrayElemPtrUint64(E))
+          return false;
+        if (!initArrayDimension(ElemTy))
+          return false;
+      }
+      return this->emitPopPtr(E);
+    };
+
+    return initArrayDimension(E->getType());
   }
 
   return false;
@@ -3599,8 +3608,6 @@ bool Compiler<Emitter>::VisitCXXNewExpr(const CXXNewExpr *E) {
     if (PlacementDest) {
       if (!this->visit(PlacementDest))
         return false;
-      if (!this->emitStartLifetime(E))
-        return false;
       if (!this->emitGetLocal(SizeT, ArrayLen, E))
         return false;
       if (!this->emitCheckNewTypeMismatchArray(SizeT, E, E))
@@ -3740,10 +3747,9 @@ bool Compiler<Emitter>::VisitCXXNewExpr(const CXXNewExpr *E) {
     if (PlacementDest) {
       if (!this->visit(PlacementDest))
         return false;
-      if (!this->emitStartLifetime(E))
-        return false;
       if (!this->emitCheckNewTypeMismatch(E, E))
         return false;
+
     } else {
       // Allocate just one element.
       if (!this->emitAlloc(Desc, E))
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index a72282c..169a9a2 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1903,12 +1903,19 @@ bool CheckNewTypeMismatch(InterpState &S, CodePtr OpPC, const Expr *E,
   if (Ptr.inUnion() && Ptr.getBase().getRecord()->isUnion())
     Ptr.activate();
 
+  if (Ptr.isZero()) {
+    S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_access_null)
+        << AK_Construct;
+    return false;
+  }
+
   if (!Ptr.isBlockPointer())
     return false;
 
+  startLifetimeRecurse(Ptr);
+
   // Similar to CheckStore(), but with the additional CheckTemporary() call and
   // the AccessKinds are different.
-
   if (!Ptr.block()->isAccessible()) {
     if (!CheckExtern(S, OpPC, Ptr))
       return false;
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 406feb5..1c17ad9e 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -866,19 +866,13 @@ def Free : Opcode {
   let Args = [ArgBool, ArgBool];
 }
 
-def CheckNewTypeMismatch : Opcode {
-  let Args = [ArgExpr];
-}
-
-def InvalidNewDeleteExpr : Opcode {
-  let Args = [ArgExpr];
-}
-
+def CheckNewTypeMismatch : Opcode { let Args = [ArgExpr]; }
 def CheckNewTypeMismatchArray : Opcode {
   let Types = [IntegerTypeClass];
   let Args = [ArgExpr];
   let HasGroup = 1;
 }
+def InvalidNewDeleteExpr : Opcode { let Args = [ArgExpr]; }
 
 def IsConstantContext: Opcode;
 def CheckAllocations : Opcode;
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 18641a9..146f058 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -398,6 +398,12 @@ void AArch64TargetInfo::getTargetDefinesARMV96A(const LangOptions &Opts,
   getTargetDefinesARMV95A(Opts, Builder);
 }
 
+void AArch64TargetInfo::getTargetDefinesARMV97A(const LangOptions &Opts,
+                                                MacroBuilder &Builder) const {
+  // Armv9.7-A does not have a v8.* equivalent, but is a superset of v9.6-A.
+  getTargetDefinesARMV96A(Opts, Builder);
+}
+
 void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
                                          MacroBuilder &Builder) const {
   // Target identification.
@@ -714,6 +720,8 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
     getTargetDefinesARMV95A(Opts, Builder);
   else if (*ArchInfo == llvm::AArch64::ARMV9_6A)
     getTargetDefinesARMV96A(Opts, Builder);
+  else if (*ArchInfo == llvm::AArch64::ARMV9_7A)
+    getTargetDefinesARMV97A(Opts, Builder);
 
   // All of the __sync_(bool|val)_compare_and_swap_(1|2|4|8|16) builtins work.
   Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
@@ -1152,6 +1160,9 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
     if (Feature == "+v9.6a" &&
         ArchInfo->Version < llvm::AArch64::ARMV9_6A.Version)
       ArchInfo = &llvm::AArch64::ARMV9_6A;
+    if (Feature == "+v9.7a" &&
+        ArchInfo->Version < llvm::AArch64::ARMV9_7A.Version)
+      ArchInfo = &llvm::AArch64::ARMV9_7A;
     if (Feature == "+v8r")
       ArchInfo = &llvm::AArch64::ARMV8R;
     if (Feature == "+fullfp16") {
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index dfd89be..3952e7b 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -190,6 +190,8 @@ public:
                                MacroBuilder &Builder) const;
   void getTargetDefinesARMV96A(const LangOptions &Opts,
                                MacroBuilder &Builder) const;
+  void getTargetDefinesARMV97A(const LangOptions &Opts,
+                               MacroBuilder &Builder) const;
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override;
 
diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp
index d00a3a4..394b50b 100644
--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@@ -231,6 +231,8 @@ StringRef ARMTargetInfo::getCPUAttr() const {
     return "9_5A";
   case llvm::ARM::ArchKind::ARMV9_6A:
     return "9_6A";
+  case llvm::ARM::ArchKind::ARMV9_7A:
+    return "9_7A";
   case llvm::ARM::ArchKind::ARMV8MBaseline:
     return "8M_BASE";
   case llvm::ARM::ArchKind::ARMV8MMainline:
@@ -904,6 +906,7 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
   case llvm::ARM::ArchKind::ARMV9_4A:
   case llvm::ARM::ArchKind::ARMV9_5A:
   case llvm::ARM::ArchKind::ARMV9_6A:
+  case llvm::ARM::ArchKind::ARMV9_7A:
     // Filter __arm_cdp, __arm_ldcl, __arm_stcl in arm_acle.h
     FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B3;
     break;
@@ -1074,6 +1077,7 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
   case llvm::ARM::ArchKind::ARMV9_4A:
   case llvm::ARM::ArchKind::ARMV9_5A:
   case llvm::ARM::ArchKind::ARMV9_6A:
+  case llvm::ARM::ArchKind::ARMV9_7A:
     getTargetDefinesARMV83A(Opts, Builder);
     break;
   }
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
index 033eb8c..f60d193 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
@@ -50,7 +50,9 @@ public:
       llvm::DenseSet<const DeclRefExpr *> DeclRefExprsToIgnore;
       llvm::DenseSet<const LambdaExpr *> LambdasToIgnore;
       llvm::DenseSet<const ValueDecl *> ProtectedThisDecls;
+      llvm::DenseSet<const CallExpr *> CallToIgnore;
       llvm::DenseSet<const CXXConstructExpr *> ConstructToIgnore;
+      llvm::DenseMap<const VarDecl *, const LambdaExpr *> LambdaOwnerMap;
 
       QualType ClsType;
 
@@ -101,10 +103,60 @@ public:
         auto *Init = VD->getInit();
         if (!Init)
           return true;
-        auto *L = dyn_cast_or_null<LambdaExpr>(Init->IgnoreParenCasts());
-        if (!L)
+        if (auto *L = dyn_cast_or_null<LambdaExpr>(Init->IgnoreParenCasts())) {
+          LambdasToIgnore.insert(L); // Evaluate lambdas in VisitDeclRefExpr.
+          return true;
+        }
+        if (!VD->hasLocalStorage())
           return true;
-        LambdasToIgnore.insert(L); // Evaluate lambdas in VisitDeclRefExpr.
+        if (auto *E = dyn_cast<ExprWithCleanups>(Init))
+          Init = E->getSubExpr();
+        if (auto *E = dyn_cast<CXXBindTemporaryExpr>(Init))
+          Init = E->getSubExpr();
+        if (auto *CE = dyn_cast<CallExpr>(Init)) {
+          if (auto *Callee = CE->getDirectCallee()) {
+            auto FnName = safeGetName(Callee);
+            unsigned ArgCnt = CE->getNumArgs();
+            if (FnName == "makeScopeExit" && ArgCnt == 1) {
+              auto *Arg = CE->getArg(0);
+              if (auto *E = dyn_cast<MaterializeTemporaryExpr>(Arg))
+                Arg = E->getSubExpr();
+              if (auto *L = dyn_cast<LambdaExpr>(Arg)) {
+                LambdaOwnerMap.insert(std::make_pair(VD, L));
+                CallToIgnore.insert(CE);
+                LambdasToIgnore.insert(L);
+              }
+            } else if (FnName == "makeVisitor") {
+              for (unsigned ArgIndex = 0; ArgIndex < ArgCnt; ++ArgIndex) {
+                auto *Arg = CE->getArg(ArgIndex);
+                if (auto *E = dyn_cast<MaterializeTemporaryExpr>(Arg))
+                  Arg = E->getSubExpr();
+                if (auto *L = dyn_cast<LambdaExpr>(Arg)) {
+                  LambdaOwnerMap.insert(std::make_pair(VD, L));
+                  CallToIgnore.insert(CE);
+                  LambdasToIgnore.insert(L);
+                }
+              }
+            }
+          }
+        } else if (auto *CE = dyn_cast<CXXConstructExpr>(Init)) {
+          if (auto *Ctor = CE->getConstructor()) {
+            if (auto *Cls = Ctor->getParent()) {
+              auto FnName = safeGetName(Cls);
+              unsigned ArgCnt = CE->getNumArgs();
+              if (FnName == "ScopeExit" && ArgCnt == 1) {
+                auto *Arg = CE->getArg(0);
+                if (auto *E = dyn_cast<MaterializeTemporaryExpr>(Arg))
+                  Arg = E->getSubExpr();
+                if (auto *L = dyn_cast<LambdaExpr>(Arg)) {
+                  LambdaOwnerMap.insert(std::make_pair(VD, L));
+                  ConstructToIgnore.insert(CE);
+                  LambdasToIgnore.insert(L);
+                }
+              }
+            }
+          }
+        }
         return true;
       }
 
@@ -114,6 +166,12 @@ public:
         auto *VD = dyn_cast_or_null<VarDecl>(DRE->getDecl());
         if (!VD)
           return true;
+        if (auto It = LambdaOwnerMap.find(VD); It != LambdaOwnerMap.end()) {
+          auto *L = It->second;
+          Checker->visitLambdaExpr(L, shouldCheckThis() && !hasProtectedThis(L),
+                                   ClsType);
+          return true;
+        }
         auto *Init = VD->getInit();
         if (!Init)
           return true;
@@ -167,10 +225,14 @@ public:
       }
 
       bool VisitCallExpr(CallExpr *CE) override {
+        if (CallToIgnore.contains(CE))
+          return true;
         checkCalleeLambda(CE);
-        if (auto *Callee = CE->getDirectCallee())
+        if (auto *Callee = CE->getDirectCallee()) {
+          if (isVisitFunction(CE, Callee))
+            return true;
           checkParameters(CE, Callee);
-        else if (auto *CalleeE = CE->getCallee()) {
+        } else if (auto *CalleeE = CE->getCallee()) {
           if (auto *DRE = dyn_cast<DeclRefExpr>(CalleeE->IgnoreParenCasts())) {
             if (auto *Callee = dyn_cast_or_null<FunctionDecl>(DRE->getDecl()))
               checkParameters(CE, Callee);
@@ -179,6 +241,34 @@ public:
         return true;
       }
 
+      bool isVisitFunction(CallExpr *CallExpr, FunctionDecl *FnDecl) {
+        bool IsVisitFn = safeGetName(FnDecl) == "visit";
+        if (!IsVisitFn)
+          return false;
+        bool ArgCnt = CallExpr->getNumArgs();
+        if (!ArgCnt)
+          return false;
+        auto *Ns = FnDecl->getParent();
+        if (!Ns)
+          return false;
+        auto NsName = safeGetName(Ns);
+        if (NsName != "WTF" && NsName != "std")
+          return false;
+        auto *Arg = CallExpr->getArg(0);
+        if (!Arg)
+          return false;
+        auto *DRE = dyn_cast<DeclRefExpr>(Arg->IgnoreParenCasts());
+        if (!DRE)
+          return false;
+        auto *VD = dyn_cast<VarDecl>(DRE->getDecl());
+        if (!VD)
+          return false;
+        if (!LambdaOwnerMap.contains(VD))
+          return false;
+        DeclRefExprsToIgnore.insert(DRE);
+        return true;
+      }
+
       void checkParameters(CallExpr *CE, FunctionDecl *Callee) {
         unsigned ArgIndex = isa<CXXOperatorCallExpr>(CE);
         bool TreatAllArgsAsNoEscape = shouldTreatAllArgAsNoEscape(Callee);
@@ -280,7 +370,7 @@ public:
         LambdasToIgnore.insert(L);
       }
 
-      bool hasProtectedThis(LambdaExpr *L) {
+      bool hasProtectedThis(const LambdaExpr *L) {
         for (const LambdaCapture &OtherCapture : L->captures()) {
           if (!OtherCapture.capturesVariable())
             continue;
@@ -378,7 +468,8 @@ public:
     visitor.TraverseDecl(const_cast<TranslationUnitDecl *>(TUD));
   }
 
-  void visitLambdaExpr(LambdaExpr *L, bool shouldCheckThis, const QualType T,
+  void visitLambdaExpr(const LambdaExpr *L, bool shouldCheckThis,
+                       const QualType T,
                        bool ignoreParamVarDecl = false) const {
     if (TFA.isTrivial(L->getBody()))
       return;
@@ -410,7 +501,7 @@ public:
   }
 
   void reportBug(const LambdaCapture &Capture, ValueDecl *CapturedVar,
-                 const QualType T, LambdaExpr *L) const {
+                 const QualType T, const LambdaExpr *L) const {
     assert(CapturedVar);
 
     auto Location = Capture.getLocation();
diff --git a/clang/test/AST/ByteCode/arrays.cpp b/clang/test/AST/ByteCode/arrays.cpp
index 22a4b41..eaf9559 100644
--- a/clang/test/AST/ByteCode/arrays.cpp
+++ b/clang/test/AST/ByteCode/arrays.cpp
@@ -820,3 +820,14 @@ namespace FAM {
     return 1;
   }
 }
+
+namespace MultiDimConstructExpr {
+  struct a {
+    a *p = this;
+  };
+  struct b {
+    a m[3][3];
+  };
+  constexpr b d;
+  static_assert(d.m[2][1].p == &d.m[2][1]);
+}
diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp
index b587cd6..1c015da 100644
--- a/clang/test/AST/ByteCode/placement-new.cpp
+++ b/clang/test/AST/ByteCode/placement-new.cpp
@@ -494,3 +494,32 @@ constexpr int modify_const_variable() {
 }
 static_assert(modify_const_variable()); // both-error {{not an integral constant expression}} \
                                         // both-note {{in call to}}
+
+constexpr int nullDest() {
+  new (nullptr) int{12}; // both-note {{construction of dereferenced null pointer}}
+  return 0;
+}
+static_assert(nullDest() == 0); // both-error {{not an integral constant expression}} \
+                                // both-note {{in call to}}
+
+constexpr int nullArrayDest() {
+  new (nullptr) int{12}; // both-note {{construction of dereferenced null pointer}}
+  return 0;
+}
+static_assert(nullArrayDest() == 0); // both-error {{not an integral constant expression}} \
+                                     // both-note {{in call to}}
+
+constexpr int intDest() {
+  new ((void*)2) int{3}; // both-note {{cast that performs the conversions of a reinterpret_cast}}
+  return 0;
+}
+static_assert(intDest() == 0); // both-error {{not an integral constant expression}} \
+                               // both-note {{in call to}}
+
+constexpr int intDestArray() {
+  new ((void*)2) int[4]; // both-note {{cast that performs the conversions of a reinterpret_cast}}
+  return 0;
+}
+static_assert(intDestArray() == 0); // both-error {{not an integral constant expression}} \
+                                    // both-note {{in call to}}
+
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp
index a4ad741..fd1eecd 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp
@@ -107,6 +107,79 @@ private:
   ValueType* m_table { nullptr };
 };
 
+class ScopeExit final {
+public:
+  template<typename ExitFunctionParameter>
+  explicit ScopeExit(ExitFunctionParameter&& exitFunction)
+    : m_exitFunction(std::move(exitFunction)) {
+  }
+
+  ScopeExit(ScopeExit&& other)
+    : m_exitFunction(std::move(other.m_exitFunction))
+    , m_execute(std::move(other.m_execute)) {
+  }
+
+  ~ScopeExit() {
+    if (m_execute)
+      m_exitFunction();
+  }
+
+  WTF::Function<void()> take() {
+    m_execute = false;
+    return std::move(m_exitFunction);
+  }
+
+  void release() { m_execute = false; }
+
+  ScopeExit(const ScopeExit&) = delete;
+  ScopeExit& operator=(const ScopeExit&) = delete;
+  ScopeExit& operator=(ScopeExit&&) = delete;
+
+private:
+  WTF::Function<void()> m_exitFunction;
+  bool m_execute { true };
+};
+
+template<typename ExitFunction> ScopeExit makeScopeExit(ExitFunction&&);
+template<typename ExitFunction>
+ScopeExit makeScopeExit(ExitFunction&& exitFunction)
+{
+    return ScopeExit(std::move(exitFunction));
+}
+
+// Visitor adapted from http://stackoverflow.com/questions/25338795/is-there-a-name-for-this-tuple-creation-idiom
+
+template<class A, class... B> struct Visitor : Visitor<A>, Visitor<B...> {
+    Visitor(A a, B... b)
+        : Visitor<A>(a)
+        , Visitor<B...>(b...)
+    {
+    }
+
+    using Visitor<A>::operator ();
+    using Visitor<B...>::operator ();
+};
+  
+template<class A> struct Visitor<A> : A {
+    Visitor(A a)
+        : A(a)
+    {
+    }
+
+    using A::operator();
+};
+ 
+template<class... F> Visitor<F...> makeVisitor(F... f)
+{
+    return Visitor<F...>(f...);
+}
+
+void opaqueFunction();
+template <typename Visitor, typename... Variants> void visit(Visitor&& v, Variants&&... values)
+{
+  opaqueFunction();
+}
+
 } // namespace WTF
 
 struct A {
@@ -501,3 +574,81 @@ void RefCountedObj::call() const
     };
     callLambda(lambda);
 }
+
+void scope_exit(RefCountable* obj) {
+  auto scope = WTF::makeScopeExit([&] {
+    obj->method();
+  });
+  someFunction();
+  WTF::ScopeExit scope2([&] {
+    obj->method();
+  });
+  someFunction();
+}
+
+void doWhateverWith(WTF::ScopeExit& obj);
+
+void scope_exit_with_side_effect(RefCountable* obj) {
+  auto scope = WTF::makeScopeExit([&] {
+    obj->method();
+    // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
+  });
+  doWhateverWith(scope);
+}
+
+void scope_exit_static(RefCountable* obj) {
+  static auto scope = WTF::makeScopeExit([&] {
+    obj->method();
+    // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
+  });
+}
+
+WTF::Function<void()> scope_exit_take_lambda(RefCountable* obj) {
+  auto scope = WTF::makeScopeExit([&] {
+    obj->method();
+    // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
+  });
+  return scope.take();
+}
+
+// FIXME: Ideally, we treat release() as a trivial function.
+void scope_exit_release(RefCountable* obj) {
+  auto scope = WTF::makeScopeExit([&] {
+    obj->method();
+    // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
+  });
+  scope.release();
+}
+
+void make_visitor(RefCountable* obj) {
+  auto visitor = WTF::makeVisitor([&] {
+    obj->method();
+  });
+}
+
+void use_visitor(RefCountable* obj) {
+  auto visitor = WTF::makeVisitor([&] {
+    obj->method();
+  });
+  WTF::visit(visitor, obj);
+}
+
+template <typename Visitor, typename ObjectType>
+void bad_visit(Visitor&, ObjectType*) {
+  someFunction();
+}
+
+void static_visitor(RefCountable* obj) {
+  static auto visitor = WTF::makeVisitor([&] {
+    obj->method();
+    // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
+  });
+}
+
+void bad_use_visitor(RefCountable* obj) {
+  auto visitor = WTF::makeVisitor([&] {
+    obj->method();
+    // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
+  });
+  bad_visit(visitor, obj);
+}
diff --git a/clang/test/CodeGen/arm-acle-coproc.c b/clang/test/CodeGen/arm-acle-coproc.c
index 93b713b..5acb9f6 100644
--- a/clang/test/CodeGen/arm-acle-coproc.c
+++ b/clang/test/CodeGen/arm-acle-coproc.c
@@ -26,6 +26,7 @@
 // RUN: %clang_cc1 -triple armv9.4a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
 // RUN: %clang_cc1 -triple armv9.5a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
 // RUN: %clang_cc1 -triple armv9.6a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
+// RUN: %clang_cc1 -triple armv9.7a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
 // RUN: %clang_cc1 -triple thumbv4 %s -E -dD -o - | FileCheck --check-prefix=CHECK-V4-THUMB %s
 // RUN: %clang_cc1 -triple thumbv4t %s -E -dD -o - | FileCheck --check-prefix=CHECK-V4-THUMB %s
 // RUN: %clang_cc1 -triple thumbv5 %s -E -dD -o - | FileCheck --check-prefix=CHECK-V5-THUMB %s
@@ -56,6 +57,7 @@
 // RUN: %clang_cc1 -triple thumbv9.4a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
 // RUN: %clang_cc1 -triple thumbv9.5a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
 // RUN: %clang_cc1 -triple thumbv9.6a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
+// RUN: %clang_cc1 -triple thumbv9.7a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
 // RUN: %clang_cc1 -triple thumbv8m.base %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8-BASE %s
 // RUN: %clang_cc1 -triple thumbv8m.main %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8-MAIN %s
 // RUN: %clang_cc1 -triple thumbv8.1m.main %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8-MAIN %s
diff --git a/clang/test/Driver/aarch64-v96a.c b/clang/test/Driver/aarch64-v96a.c
index de78901..e0081bb 100644
--- a/clang/test/Driver/aarch64-v96a.c
+++ b/clang/test/Driver/aarch64-v96a.c
@@ -6,7 +6,7 @@
 // RUN: %clang -target aarch64 -mlittle-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s
 // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s
 // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s
-// GENERICV96A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p2"
+// GENERICV96A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}}
 
 // RUN: %clang -target aarch64_be -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
 // RUN: %clang -target aarch64_be -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
@@ -14,7 +14,7 @@
 // RUN: %clang -target aarch64 -mbig-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
 // RUN: %clang -target aarch64_be -mbig-endian -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
 // RUN: %clang -target aarch64_be -mbig-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
-// GENERICV96A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p2"
+// GENERICV96A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}}
 
 // ===== Features supported on aarch64 =====
 
diff --git a/clang/test/Driver/aarch64-v97a.c b/clang/test/Driver/aarch64-v97a.c
new file mode 100644
index 0000000..1e54e6b
--- /dev/null
+++ b/clang/test/Driver/aarch64-v97a.c
@@ -0,0 +1,59 @@
+// ===== Base v9.7a architecture =====
+
+// RUN: %clang -target aarch64 -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s
+// RUN: %clang -target aarch64 -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s
+// RUN: %clang -target aarch64 -mlittle-endian -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s
+// RUN: %clang -target aarch64 -mlittle-endian -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s
+// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s
+// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s
+// GENERICV97A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p3"{{.*}}
+
+// RUN: %clang -target aarch64_be -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s
+// RUN: %clang -target aarch64_be -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s
+// GENERICV97A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p3"{{.*}}
+
+// ===== Features supported on aarch64 =====
+
+// RUN: %clang -target aarch64 -march=armv9.7a+sme2p3 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SME2p3 %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+sme2p3 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SME2p3 %s
+// V97A-SME2p3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+sme2p3"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+sve2p3 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SVE2p3 %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+sve2p3 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SVE2p3 %s
+// V97A-SVE2p3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+sve2p3"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+sve-b16mm -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SVE-B16MM %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+sve-b16mm -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SVE-B16MM %s
+// V97A-SVE-B16MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+sve-b16mm"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+f16mm -### -c %s 2>&1 | FileCheck -check-prefix=V97A-F16MM %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+f16mm -### -c %s 2>&1 | FileCheck -check-prefix=V97A-F16MM %s
+// V97A-F16MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+f16mm"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+cmh -### -c %s 2>&1 | FileCheck -check-prefix=V97A-CMH %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+cmh -### -c %s 2>&1 | FileCheck -check-prefix=V97A-CMH %s
+// V97A-CMH: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+cmh"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+lscp -### -c %s 2>&1 | FileCheck -check-prefix=V97A-LSCP %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+lscp -### -c %s 2>&1 | FileCheck -check-prefix=V97A-LSCP %s
+// V97A-LSCP: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+lscp"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+tlbid -### -c %s 2>&1 | FileCheck -check-prefix=V97A-TLBID %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+tlbid -### -c %s 2>&1 | FileCheck -check-prefix=V97A-TLBID %s
+// V97A-TLBID: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+tlbid"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+mpamv2 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-MPAMv2 %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+mpamv2 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-MPAMv2 %s
+// V97A-MPAMv2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+mpamv2"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+mtetc -### -c %s 2>&1 | FileCheck -check-prefix=V97A-MTETC %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+mtetc -### -c %s 2>&1 | FileCheck -check-prefix=V97A-MTETC %s
+// V97A-MTETC: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+mtetc"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+gcie -### -c %s 2>&1 | FileCheck -check-prefix=V97A-GCIE %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+gcie -### -c %s 2>&1 | FileCheck -check-prefix=V97A-GCIE %s
+// V97A-GCIE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+gcie"
diff --git a/clang/test/Driver/arm-cortex-cpus-1.c b/clang/test/Driver/arm-cortex-cpus-1.c
index 5d3169f..08cb63b1 100644
--- a/clang/test/Driver/arm-cortex-cpus-1.c
+++ b/clang/test/Driver/arm-cortex-cpus-1.c
@@ -512,3 +512,20 @@
 // RUN: %clang -target arm -march=armebv9.6a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V96A %s
 // RUN: %clang -target arm -march=armebv9.6-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V96A %s
 // CHECK-BE-V96A: "-cc1"{{.*}} "-triple" "armebv9.6{{.*}}" "-target-cpu" "generic"
+//
+// RUN: %clang -target armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s
+// RUN: %clang -target arm -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s
+// RUN: %clang -target arm -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s
+// RUN: %clang -target arm -march=armv9.7a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s
+// RUN: %clang -target armv9.7a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s
+// RUN: %clang -target arm -march=armv9.7a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s
+// RUN: %clang -target arm -mlittle-endian -march=armv9.7-a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s
+// CHECK-V97A: "-cc1"{{.*}} "-triple" "armv9.7{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armebv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s
+// RUN: %clang -target armv9.7a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s
+// RUN: %clang -target armeb -march=armebv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s
+// RUN: %clang -target armeb -march=armebv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s
+// RUN: %clang -target arm -march=armebv9.7a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s
+// RUN: %clang -target arm -march=armebv9.7-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s
+// CHECK-BE-V97A: "-cc1"{{.*}} "-triple" "armebv9.7{{.*}}" "-target-cpu" "generic"
diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c
index 5c840a6..7294c33 100644
--- a/clang/test/Driver/print-supported-extensions-aarch64.c
+++ b/clang/test/Driver/print-supported-extensions-aarch64.c
@@ -8,6 +8,7 @@
 // CHECK-NEXT:     bf16                FEAT_BF16                                              Enable BFloat16 Extension
 // CHECK-NEXT:     brbe                FEAT_BRBE                                              Enable Branch Record Buffer Extension
 // CHECK-NEXT:     bti                 FEAT_BTI                                               Enable Branch Target Identification
+// CHECK-NEXT:     cmh                 FEAT_CMH                                               Enable Armv9.7-A Contention Management Hints
 // CHECK-NEXT:     cmpbr               FEAT_CMPBR                                             Enable Armv9.6-A base compare and branch instructions
 // CHECK-NEXT:     fcma                FEAT_FCMA                                              Enable Armv8.3-A Floating-point complex number support
 // CHECK-NEXT:     cpa                 FEAT_CPA                                               Enable Armv9.5-A Checked Pointer Arithmetic
@@ -17,6 +18,9 @@
 // CHECK-NEXT:     d128                FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128 Enable Armv9.4-A 128-bit Page Table Descriptors, System Registers and instructions
 // CHECK-NEXT:     dit                 FEAT_DIT                                               Enable Armv8.4-A Data Independent Timing instructions
 // CHECK-NEXT:     dotprod             FEAT_DotProd                                           Enable dot product support
+// CHECK-NEXT:     f16f32dot           FEAT_F16F32DOT                                         Enable Armv9.7-A Advanced SIMD half-precision dot product accumulate to single-precision
+// CHECK-NEXT:     f16f32mm            FEAT_F16F32MM                                          Enable Armv9.7-A Advanced SIMD half-precision matrix multiply-accumulate to single-precision
+// CHECK-NEXT:     f16mm               FEAT_F16MM                                             Enable Armv9.7-A non-widening half-precision matrix multiply-accumulate
 // CHECK-NEXT:     f32mm               FEAT_F32MM                                             Enable Matrix Multiply FP32 Extension
 // CHECK-NEXT:     f64mm               FEAT_F64MM                                             Enable Matrix Multiply FP64 Extension
 // CHECK-NEXT:     f8f16mm             FEAT_F8F16MM                                           Enable Armv9.6-A FP8 to Half-Precision Matrix Multiplication
@@ -31,19 +35,23 @@
 // CHECK-NEXT:     fp8fma              FEAT_FP8FMA                                            Enable Armv9.5-A FP8 multiply-add instructions
 // CHECK-NEXT:     fprcvt              FEAT_FPRCVT                                            Enable Armv9.6-A base convert instructions for SIMD&FP scalar register operands of different input and output sizes
 // CHECK-NEXT:     fp16                FEAT_FP16                                              Enable half-precision floating-point data processing
+// CHECK-NEXT:     gcie                FEAT_GCIE                                              Enable GICv5 (Generic Interrupt Controller) CPU Interface Extension
 // CHECK-NEXT:     gcs                 FEAT_GCS                                               Enable Armv9.4-A Guarded Call Stack Extension
 // CHECK-NEXT:     hbc                 FEAT_HBC                                               Enable Armv8.8-A Hinted Conditional Branches Extension
 // CHECK-NEXT:     i8mm                FEAT_I8MM                                              Enable Matrix Multiply Int8 Extension
 // CHECK-NEXT:     ite                 FEAT_ITE                                               Enable Armv9.4-A Instrumentation Extension
 // CHECK-NEXT:     jscvt               FEAT_JSCVT                                             Enable Armv8.3-A JavaScript FP conversion instructions
 // CHECK-NEXT:     ls64                FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA              Enable Armv8.7-A LD64B/ST64B Accelerator Extension
+// CHECK-NEXT:     lscp                FEAT_LSCP                                              Enable Armv9.7-A Load-acquire and store-release pair extension
 // CHECK-NEXT:     lse                 FEAT_LSE                                               Enable Armv8.1-A Large System Extension (LSE) atomic instructions
 // CHECK-NEXT:     lse128              FEAT_LSE128                                            Enable Armv9.4-A 128-bit Atomic instructions
 // CHECK-NEXT:     lsfe                FEAT_LSFE                                              Enable Armv9.6-A base Atomic floating-point in-memory instructions
 // CHECK-NEXT:     lsui                FEAT_LSUI                                              Enable Armv9.6-A unprivileged load/store instructions
 // CHECK-NEXT:     lut                 FEAT_LUT                                               Enable Lookup Table instructions
 // CHECK-NEXT:     mops                FEAT_MOPS                                              Enable Armv8.8-A memcpy and memset acceleration instructions
+// CHECK-NEXT:     mpamv2              FEAT_MPAMv2                                            Enable Armv9.7-A MPAMv2 Lookaside Buffer Invalidate instructions
 // CHECK-NEXT:     memtag              FEAT_MTE, FEAT_MTE2                                    Enable Memory Tagging Extension
+// CHECK-NEXT:     mtetc               FEAT_MTETC                                             Enable Virtual Memory Tagging Extension
 // CHECK-NEXT:     simd                FEAT_AdvSIMD                                           Enable Advanced SIMD instructions
 // CHECK-NEXT:     occmo               FEAT_OCCMO                                             Enable Armv9.6-A Outer cacheable cache maintenance operations
 // CHECK-NEXT:     pauth               FEAT_PAuth                                             Enable Armv8.3-A Pointer Authentication extension
@@ -76,6 +84,7 @@
 // CHECK-NEXT:     sme2                FEAT_SME2                                              Enable Scalable Matrix Extension 2 (SME2) instructions
 // CHECK-NEXT:     sme2p1              FEAT_SME2p1                                            Enable Scalable Matrix Extension 2.1 instructions
 // CHECK-NEXT:     sme2p2              FEAT_SME2p2                                            Enable Armv9.6-A Scalable Matrix Extension 2.2 instructions
+// CHECK-NEXT:     sme2p3              FEAT_SME2p3                                            Enable Armv9.7-A Scalable Matrix Extension 2.3 instructions
 // CHECK-NEXT:     profile             FEAT_SPE                                               Enable Statistical Profiling extension
 // CHECK-NEXT:     predres2            FEAT_SPECRES2                                          Enable Speculation Restriction Instruction
 // CHECK-NEXT:     ssbs                FEAT_SSBS, FEAT_SSBS2                                  Enable Speculative Store Bypass Safe bit
@@ -89,6 +98,7 @@
 // CHECK-NEXT:     sve-aes             FEAT_SVE_AES, FEAT_SVE_PMULL128                        Enable SVE AES and quadword SVE polynomial multiply instructions
 // CHECK-NEXT:     sve-aes2            FEAT_SVE_AES2                                          Enable Armv9.6-A SVE multi-vector AES and multi-vector quadword polynomial multiply instructions
 // CHECK-NEXT:     sve-b16b16          FEAT_SVE_B16B16                                        Enable SVE2 non-widening and SME2 Z-targeting non-widening BFloat16 instructions
+// CHECK-NEXT:     sve-b16mm           FEAT_SVE_B16MM                                         Enable Armv9.7-A SVE non-widening BFloat16 matrix multiply-accumulate
 // CHECK-NEXT:     sve-bfscale         FEAT_SVE_BFSCALE                                       Enable Armv9.6-A SVE BFloat16 scaling instructions
 // CHECK-NEXT:     sve-bitperm         FEAT_SVE_BitPerm                                       Enable bit permutation SVE2 instructions
 // CHECK-NEXT:     sve-f16f32mm        FEAT_SVE_F16F32MM                                      Enable Armv9.6-A FP16 to FP32 Matrix Multiply
@@ -101,7 +111,9 @@
 // CHECK-NEXT:     sve2-sm4                                                                   Shorthand for +sve2+sve-sm4
 // CHECK-NEXT:     sve2p1              FEAT_SVE2p1                                            Enable Scalable Vector Extension 2.1 instructions
 // CHECK-NEXT:     sve2p2              FEAT_SVE2p2                                            Enable Armv9.6-A Scalable Vector Extension 2.2 instructions
+// CHECK-NEXT:     sve2p3              FEAT_SVE2p3                                            Enable Armv9.7-A Scalable Vector Extension 2.3 instructions
 // CHECK-NEXT:     the                 FEAT_THE                                               Enable Armv8.9-A Translation Hardening Extension
+// CHECK-NEXT:     tlbid               FEAT_TLBID                                             Enable Armv9.7-A TLBI Domains extension
 // CHECK-NEXT:     tlbiw               FEAT_TLBIW                                             Enable Armv9.5-A TLBI VMALL for Dirty State
 // CHECK-NEXT:     tme                 FEAT_TME                                               Enable Transactional Memory Extension
 // CHECK-NEXT:     wfxt                FEAT_WFxT                                              Enable Armv8.7-A WFET and WFIT instruction
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index d720120..4dd243e 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -215,6 +215,7 @@
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9.4-a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9.5-a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9.6-a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s
+// RUN: %clang -target aarch64-none-linux-gnu -march=armv9.7-a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s
 // CHECK-SVE2: __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
 // CHECK-SVE2: __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
@@ -691,6 +692,7 @@
 // RUN: %clang -target aarch64-none-elf -march=armv9.4-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s
 // RUN: %clang -target aarch64-none-elf -march=armv9.5-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s
 // RUN: %clang -target aarch64-none-elf -march=armv9.6-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s
+// RUN: %clang -target aarch64-none-elf -march=armv9.7-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s
 // CHECK-V81-OR-LATER: __ARM_FEATURE_ATOMICS 1
 // CHECK-V85-OR-LATER: __ARM_FEATURE_BTI 1
 // CHECK-V83-OR-LATER: __ARM_FEATURE_COMPLEX 1
diff --git a/clang/test/Preprocessor/arm-target-features.c b/clang/test/Preprocessor/arm-target-features.c
index fc37beb..689cd81 100644
--- a/clang/test/Preprocessor/arm-target-features.c
+++ b/clang/test/Preprocessor/arm-target-features.c
@@ -934,6 +934,11 @@
 // CHECK-V96A: #define __ARM_ARCH_9_6A__ 1
 // CHECK-V96A: #define __ARM_ARCH_PROFILE 'A'
 
+// RUN: %clang -target armv9.7a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V97A %s
+// CHECK-V97A: #define __ARM_ARCH 9
+// CHECK-V97A: #define __ARM_ARCH_9_7A__ 1
+// CHECK-V97A: #define __ARM_ARCH_PROFILE 'A'
+
 // RUN: %clang -target arm-none-none-eabi -march=armv7-m -mfpu=softvfp -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SOFTVFP %s
 // CHECK-SOFTVFP-NOT: #define __ARM_FP 0x
 
diff --git a/clang/test/SemaCXX/ptrauth-nested-incomplete-types.cpp b/clang/test/SemaCXX/ptrauth-nested-incomplete-types.cpp
new file mode 100644
index 0000000..8fad2fb
--- /dev/null
+++ b/clang/test/SemaCXX/ptrauth-nested-incomplete-types.cpp
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -fptrauth-intrinsics -fsyntax-only -ferror-limit 1 -verify -std=c++26 %s
+// RUN: %clang_cc1 -fptrauth-intrinsics -fsyntax-only -ferror-limit 1 -verify -std=c++03 %s
+// RUN: %clang_cc1                      -fsyntax-only -ferror-limit 1 -verify -std=c++03 %s
+
+/// Force two errors so we hit the error limit leading to skip of template instantiation
+# "" // expected-error {{invalid preprocessing directive}}
+# ""
+// expected-error@* {{too many errors emitted}}
+
+template <typename>
+struct a {};
+struct test_polymorphic {
+  virtual ~test_polymorphic();
+  a<int> field;
+};
+static_assert(__is_trivially_relocatable(test_polymorphic));
+
+struct test_struct {
+  test_struct(int) {}
+  void test_instantiate() {
+    test_struct d(0);
+  }
+  void test_type_trait_query() {
+    __is_trivially_relocatable(test_struct);
+  }
+  a<int> e;
+};
+
+struct test_struct2 {
+  test_struct member;
+  void test() {
+    test_struct2 t{.member = {0}};
+  }
+};
+
+struct test_subclass : test_struct {
+   test_subclass() : test_struct(0) {
+   }
+
+   void test_subclass_instantiation() {
+    test_subclass subclass{};
+   }
+  void test_subclass_type_trait_query() {
+    __is_trivially_relocatable(test_subclass);
+  }
+};
diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp
index 0f613f0..8643271 100644
--- a/compiler-rt/lib/asan/asan_interceptors.cpp
+++ b/compiler-rt/lib/asan/asan_interceptors.cpp
@@ -58,7 +58,7 @@ namespace __asan {
 
 static inline uptr MaybeRealStrnlen(const char *s, uptr maxlen) {
 #if SANITIZER_INTERCEPT_STRNLEN
-  if (REAL(strnlen))
+  if (static_cast<bool>(REAL(strnlen)))
     return REAL(strnlen)(s, maxlen);
 #  endif
   return internal_strnlen(s, maxlen);
@@ -66,7 +66,7 @@ static inline uptr MaybeRealStrnlen(const char *s, uptr maxlen) {
 
 static inline uptr MaybeRealWcsnlen(const wchar_t* s, uptr maxlen) {
 #  if SANITIZER_INTERCEPT_WCSNLEN
-  if (REAL(wcsnlen))
+  if (static_cast<bool>(REAL(wcsnlen)))
     return REAL(wcsnlen)(s, maxlen);
 #  endif
   return internal_wcsnlen(s, maxlen);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6b02fef..39bac81 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -3106,7 +3106,9 @@ IntrinsicLibrary::genAtomicCas(mlir::Type resultType,
           .getResult(0);
   auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create(
       builder, loc, address, arg1, arg2, successOrdering, failureOrdering);
-  return mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1);
+  mlir::Value boolResult =
+      mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1);
+  return builder.createConvert(loc, resultType, boolResult);
 }
 
 mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType,
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 7d6caf5..5c4c3c6 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -479,3 +479,16 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_bulk_s2g
 ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3>
+
+attributes(device) subroutine testAtomicCasLoop(aa, n)
+  integer :: a
+  do while (atomiccas(a, 0, 1) == 1)
+  end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtestatomiccasloop
+! CHECK: %[[CMP_XCHG:.*]] = llvm.cmpxchg %15, %c0_i32, %c1_i32 acq_rel monotonic : !llvm.ptr, i32
+! CHECK: %[[CMP_XCHG_EV:.*]] = llvm.extractvalue %[[CMP_XCHG]][1] : !llvm.struct<(i32, i1)> 
+! CHECK: %[[CASTED_CMP_XCHG_EV:.*]] = fir.convert %[[CMP_XCHG_EV]] : (i1) -> i32
+! CHECK: %{{.*}} = arith.constant 1 : i32
+! CHECK: %19 = arith.cmpi eq, %[[CASTED_CMP_XCHG_EV]], %{{.*}} : i32
diff --git a/lldb/test/Shell/lldb-server/TestGdbserverErrorMessages.test b/lldb/test/Shell/lldb-server/TestErrorMessages.test
index b9689fb..b9689fb 100644
--- a/lldb/test/Shell/lldb-server/TestGdbserverErrorMessages.test
+++ b/lldb/test/Shell/lldb-server/TestErrorMessages.test
diff --git a/lldb/test/Shell/lldb-server/TestPlatformErrorMessages.test b/lldb/test/Shell/lldb-server/TestPlatformErrorMessages.test
deleted file mode 100644
index 7d3b37a..0000000
--- a/lldb/test/Shell/lldb-server/TestPlatformErrorMessages.test
+++ /dev/null
@@ -1,25 +0,0 @@
-RUN: %platformserver 2>&1 | FileCheck --check-prefixes=NO_LISTEN,ALL %s
-NO_LISTEN: error: either --listen or --child-platform-fd is required
-
-RUN: %lldb-server platform --listen 2>&1 | FileCheck --check-prefixes=LISTEN_MISSING,ALL %s
-LISTEN_MISSING: error: --listen: missing argument
-
-RUN: %lldb-server p --bogus 2>&1 | FileCheck --check-prefixes=BOGUS,ALL %s
-BOGUS: error: unknown argument '--bogus'
-
-RUN: %platformserver --gdbserver-port 2>&1 | FileCheck --check-prefixes=GDBPORT_MISSING,ALL %s
-GDBPORT_MISSING: error: --gdbserver-port: missing argument
-
-RUN: %platformserver --gdbserver-port notanumber --listen :1234 2>&1 | FileCheck --check-prefixes=GDBPORT_INVALID %s
-GDBPORT_INVALID: error: invalid --gdbserver-port value
-
-RUN: %platformserver --socket-file 2>&1 | FileCheck --check-prefixes=SOCKETFILE_MISSING,ALL %s
-SOCKETFILE_MISSING: error: --socket-file: missing argument
-
-RUN: %platformserver --log-file 2>&1 | FileCheck --check-prefixes=LOGFILE_MISSING,ALL %s
-LOGFILE_MISSING: error: --log-file: missing argument
-
-RUN: %platformserver --log-channels 2>&1 | FileCheck --check-prefixes=LOGCHANNELS_MISSING,ALL %s
-LOGCHANNELS_MISSING: error: --log-channels: missing argument
-
-ALL: Use 'lldb-server{{(\.exe)?}} {{p|platform}} --help' for a complete list of options.
diff --git a/lldb/test/Shell/lldb-server/TestPlatformHelp.test b/lldb/test/Shell/lldb-server/TestPlatformHelp.test
deleted file mode 100644
index c5ced8a..0000000
--- a/lldb/test/Shell/lldb-server/TestPlatformHelp.test
+++ /dev/null
@@ -1,40 +0,0 @@
-RUN: %platformserver --help 2>&1 | FileCheck %s
-RUN: %platformserver -h 2>&1 | FileCheck %s
-RUN: %lldb-server p --help 2>&1 | FileCheck %s
-RUN: %lldb-server p -h 2>&1 | FileCheck %s
-RUN: %lldb-server platform --help 2>&1 | FileCheck %s
-RUN: %lldb-server platform -h 2>&1 | FileCheck %s
-
-CHECK: OVERVIEW: lldb-server{{(\.exe)?}} platform
-
-CHECK: USAGE: lldb-server{{(\.exe)?}} {{p|platform}} [options] --listen <[host]:port> {{\[}}[--] program args...]
-
-CHECK: CONNECTION OPTIONS:
-CHECK: --gdbserver-port <port>
-CHECK-SAME: Short form: -P
-CHECK: --listen <[host]:port>
-CHECK-SAME: Short form: -L
-CHECK: --socket-file <path>
-CHECK-SAME: Short form: -f
-
-CHECK: GENERAL OPTIONS:
-CHECK: --help
-CHECK: --log-channels <channel1 categories...:channel2 categories...>
-CHECK: Short form: -c
-CHECK: --log-file <file>
-CHECK-SAME: Short form: -l
-CHECK: --server
-
-CHECK: OPTIONS:
-CHECK: -- program args
-
-CHECK: DESCRIPTION
-CHECK: Acts as a platform server for remote debugging
-
-CHECK: EXAMPLES
-CHECK: # Listen on port 1234, exit after first connection
-CHECK: lldb-server{{(\.exe)?}} platform --listen tcp://0.0.0.0:1234
-CHECK: # Listen on port 5555, accept multiple connections
-CHECK: lldb-server{{(\.exe)?}} platform --server --listen tcp://localhost:5555
-CHECK: # Listen on Unix domain socket
-CHECK: lldb-server{{(\.exe)?}} platform --listen unix:///tmp/lldb-server.sock
diff --git a/lldb/test/Shell/lldb-server/TestPlatformSuccessfulStartup.test b/lldb/test/Shell/lldb-server/TestPlatformSuccessfulStartup.test
deleted file mode 100644
index 88a2384..0000000
--- a/lldb/test/Shell/lldb-server/TestPlatformSuccessfulStartup.test
+++ /dev/null
@@ -1,35 +0,0 @@
-# Test successful startup with valid TCP listen address
-# The socket file is created immediately when the server is ready to accept connections,
-# so we can verify successful startup without arbitrary sleep delays.
-RUN: rm -f %t.socket1
-RUN: timeout 0.2s %platformserver --listen tcp://127.0.0.1:0 --socket-file %t.socket1 > %t.out1 2>&1 || true
-RUN: test -f %t.socket1
-RUN: FileCheck --allow-empty --check-prefix=NO-ERROR %s < %t.out1
-
-# Test successful startup with valid gdbserver-port
-RUN: rm -f %t.socket3
-RUN: timeout 0.2s %platformserver --listen tcp://127.0.0.1:0 --gdbserver-port 0 --socket-file %t.socket3 > %t.out3 2>&1 || true
-RUN: test -f %t.socket3
-RUN: FileCheck --allow-empty --check-prefix=NO-ERROR %s < %t.out3
-
-# Test successful startup with specific valid gdbserver-port number
-RUN: rm -f %t.socket4
-RUN: timeout 0.2s %platformserver --listen tcp://127.0.0.1:0 --gdbserver-port 12345 --socket-file %t.socket4 > %t.out4 2>&1 || true
-RUN: test -f %t.socket4
-RUN: FileCheck --allow-empty --check-prefix=NO-ERROR %s < %t.out4
-
-# Test successful startup with server mode (accepting multiple connections)
-RUN: rm -f %t.socket5
-RUN: timeout 0.2s %platformserver --server --listen tcp://127.0.0.1:0 --socket-file %t.socket5 > %t.out5 2>&1 || true
-RUN: test -f %t.socket5
-RUN: FileCheck --allow-empty --check-prefix=NO-ERROR %s < %t.out5
-
-# Test successful startup with abbreviated 'p' command
-RUN: rm -f %t.socket6
-RUN: timeout 0.2s %lldb-server p --listen tcp://127.0.0.1:0 --socket-file %t.socket6 > %t.out6 2>&1 || true
-RUN: test -f %t.socket6
-RUN: FileCheck --allow-empty --check-prefix=NO-ERROR %s < %t.out6
-
-# Verify no error or warning messages appear in successful startup
-NO-ERROR-NOT: error:
-NO-ERROR-NOT: warning:
diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/sme_thread_status.h b/lldb/tools/debugserver/source/MacOSX/arm64/sme_thread_status.h
index f33b320..f258c59 100644
--- a/lldb/tools/debugserver/source/MacOSX/arm64/sme_thread_status.h
+++ b/lldb/tools/debugserver/source/MacOSX/arm64/sme_thread_status.h
@@ -46,7 +46,7 @@ __attribute__((aligned(alignof(unsigned int))));
 #define ARM_SME_ZA_STATE9 40
 #define ARM_SME_ZA_STATE10 41
 #define ARM_SME_ZA_STATE11 42
-#define ARM_SME_ZA_STATE12 42
+#define ARM_SME_ZA_STATE12 43
 #define ARM_SME_ZA_STATE13 44
 #define ARM_SME_ZA_STATE14 45
 #define ARM_SME_ZA_STATE15 46
diff --git a/lldb/tools/lldb-server/CMakeLists.txt b/lldb/tools/lldb-server/CMakeLists.txt
index fb55c64..1d8dc72 100644
--- a/lldb/tools/lldb-server/CMakeLists.txt
+++ b/lldb/tools/lldb-server/CMakeLists.txt
@@ -2,10 +2,6 @@ set(LLVM_TARGET_DEFINITIONS LLGSOptions.td)
 tablegen(LLVM LLGSOptions.inc -gen-opt-parser-defs)
 add_public_tablegen_target(LLGSOptionsTableGen)
 
-set(LLVM_TARGET_DEFINITIONS PlatformOptions.td)
-tablegen(LLVM PlatformOptions.inc -gen-opt-parser-defs)
-add_public_tablegen_target(PlatformOptionsTableGen)
-
 set(LLDB_PLUGINS)
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android")
@@ -71,7 +67,6 @@ add_lldb_tool(lldb-server
 
 add_dependencies(lldb-server
   LLGSOptionsTableGen
-  PlatformOptionsTableGen
   ${tablegen_deps}
 )
 target_include_directories(lldb-server PRIVATE "${LLDB_SOURCE_DIR}/source")
diff --git a/lldb/tools/lldb-server/PlatformOptions.td b/lldb/tools/lldb-server/PlatformOptions.td
deleted file mode 100644
index eedd1d8..0000000
--- a/lldb/tools/lldb-server/PlatformOptions.td
+++ /dev/null
@@ -1,75 +0,0 @@
-include "llvm/Option/OptParser.td"
-
-class F<string name>: Flag<["--", "-"], name>;
-class R<list<string> prefixes, string name>
-  : Option<prefixes, name, KIND_REMAINING_ARGS>;
-
-multiclass SJ<string name, string help> {
-  def NAME: Separate<["--", "-"], name>,
-    HelpText<help>;
-  def NAME # _eq: Joined<["--", "-"], name # "=">,
-    Alias<!cast<Separate>(NAME)>;
-}
-
-def grp_connect : OptionGroup<"connection">, HelpText<"CONNECTION OPTIONS">;
-
-defm listen: SJ<"listen", "Host and port to listen on. Format: [host]:port or protocol://[host]:port (e.g., tcp://localhost:1234, unix:///path/to/socket). Short form: -L">,
-  MetaVarName<"<[host]:port>">,
-  Group<grp_connect>;
-def: Separate<["-"], "L">, Alias<listen>,
-  Group<grp_connect>;
-
-defm socket_file: SJ<"socket-file", "Write listening socket information (port number for TCP or path for Unix domain sockets) to the specified file. Short form: -f">,
-  MetaVarName<"<path>">,
-  Group<grp_connect>;
-def: Separate<["-"], "f">, Alias<socket_file>,
-  Group<grp_connect>;
-
-defm gdbserver_port: SJ<"gdbserver-port", "Port to use for spawned gdbserver instances. If 0 or unspecified, a port will be chosen automatically. Short form: -P">,
-  MetaVarName<"<port>">,
-  Group<grp_connect>;
-def: Separate<["-"], "P">, Alias<gdbserver_port>,
-  Group<grp_connect>;
-
-defm child_platform_fd: SJ<"child-platform-fd", "File descriptor for communication with parent platform process (internal use only).">,
-  MetaVarName<"<fd>">,
-  Group<grp_connect>,
-  Flags<[HelpHidden]>;
-
-def grp_general : OptionGroup<"general options">, HelpText<"GENERAL OPTIONS">;
-
-def server: F<"server">,
-  HelpText<"Run in server mode, accepting multiple client connections sequentially. Without this flag, the server exits after handling the first connection.">,
-  Group<grp_general>;
-
-defm log_channels: SJ<"log-channels", "Channels to log. A colon-separated list of entries. Each entry starts with a channel followed by a space-separated list of categories. Common channels: lldb, gdb-remote, platform, process. Short form: -c">,
-  MetaVarName<"<channel1 categories...:channel2 categories...>">,
-  Group<grp_general>;
-def: Separate<["-"], "c">, Alias<log_channels>,
-  Group<grp_general>;
-
-defm log_file: SJ<"log-file", "Destination file to log to. If empty, log to stderr. Short form: -l">,
-  MetaVarName<"<file>">,
-  Group<grp_general>;
-def: Separate<["-"], "l">, Alias<log_file>,
-  Group<grp_general>;
-
-def debug: F<"debug">,
-  HelpText<"(Unused, kept for backward compatibility)">,
-  Group<grp_general>,
-  Flags<[HelpHidden]>;
-
-def verbose: F<"verbose">,
-  HelpText<"(Unused, kept for backward compatibility)">,
-  Group<grp_general>,
-  Flags<[HelpHidden]>;
-
-def help: F<"help">, 
-  HelpText<"Display this help message and exit.">,
-  Group<grp_general>;
-def: Flag<["-"], "h">, Alias<help>,
-  Group<grp_general>;
-
-def REM : R<["--"], "">, 
-  HelpText<"Arguments to pass to launched gdbserver instances.">,
-  MetaVarName<"program args">;
diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp
index 59b1eb4..0bd9285 100644
--- a/lldb/tools/lldb-server/lldb-platform.cpp
+++ b/lldb/tools/lldb-server/lldb-platform.cpp
@@ -21,9 +21,6 @@
 #include <fstream>
 #include <optional>
 
-#include "llvm/Option/ArgList.h"
-#include "llvm/Option/OptTable.h"
-#include "llvm/Option/Option.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/WithColor.h"
@@ -59,69 +56,22 @@ using namespace llvm;
 // of target CPUs. For now, let's just use 100.
 static const int backlog = 100;
 static const int socket_error = -1;
-
-namespace {
-using namespace llvm::opt;
-
-enum ID {
-  OPT_INVALID = 0, // This is not an option ID.
-#define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__),
-#include "PlatformOptions.inc"
-#undef OPTION
-};
-
-#define OPTTABLE_STR_TABLE_CODE
-#include "PlatformOptions.inc"
-#undef OPTTABLE_STR_TABLE_CODE
-
-#define OPTTABLE_PREFIXES_TABLE_CODE
-#include "PlatformOptions.inc"
-#undef OPTTABLE_PREFIXES_TABLE_CODE
-
-static constexpr opt::OptTable::Info InfoTable[] = {
-#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__),
-#include "PlatformOptions.inc"
-#undef OPTION
-};
-
-class PlatformOptTable : public opt::GenericOptTable {
-public:
-  PlatformOptTable()
-      : opt::GenericOptTable(OptionStrTable, OptionPrefixesTable, InfoTable) {}
-
-  void PrintHelp(llvm::StringRef Name) {
-    std::string Usage =
-        (Name + " [options] --listen <[host]:port> [[--] program args...]")
-            .str();
-
-    std::string Title = "lldb-server platform";
-
-    OptTable::printHelp(llvm::outs(), Usage.c_str(), Title.c_str());
-
-    llvm::outs() << R"(
-DESCRIPTION
-  Acts as a platform server for remote debugging. When LLDB clients connect,
-  the platform server handles platform operations (file transfers, process
-  launching) and spawns debug server instances (lldb-server gdbserver) to
-  handle actual debugging sessions.
-
-  By default, the server exits after handling one connection. Use --server
-  to keep running and accept multiple connections sequentially.
-
-EXAMPLES
-  # Listen on port 1234, exit after first connection
-  lldb-server platform --listen tcp://0.0.0.0:1234
-
-  # Listen on port 5555, accept multiple connections
-  lldb-server platform --server --listen tcp://localhost:5555
-
-  # Listen on Unix domain socket
-  lldb-server platform --listen unix:///tmp/lldb-server.sock
-
-)";
-  }
-};
-} // namespace
+static int g_debug = 0;
+static int g_verbose = 0;
+static int g_server = 0;
+
+// option descriptors for getopt_long_only()
+static struct option g_long_options[] = {
+    {"debug", no_argument, &g_debug, 1},
+    {"verbose", no_argument, &g_verbose, 1},
+    {"log-file", required_argument, nullptr, 'l'},
+    {"log-channels", required_argument, nullptr, 'c'},
+    {"listen", required_argument, nullptr, 'L'},
+    {"gdbserver-port", required_argument, nullptr, 'P'},
+    {"socket-file", required_argument, nullptr, 'f'},
+    {"server", no_argument, &g_server, 1},
+    {"child-platform-fd", required_argument, nullptr, 2},
+    {nullptr, 0, nullptr, 0}};
 
 #if defined(__APPLE__)
 #define LOW_PORT (IPPORT_RESERVED)
@@ -147,11 +97,12 @@ static void signal_handler(int signo) {
 }
 #endif
 
-static void display_usage(PlatformOptTable &Opts, const char *progname,
-                          const char *subcommand) {
-  std::string Name =
-      (llvm::sys::path::filename(progname) + " " + subcommand).str();
-  Opts.PrintHelp(Name);
+static void display_usage(const char *progname, const char *subcommand) {
+  fprintf(stderr, "Usage:\n  %s %s [--log-file log-file-name] [--log-channels "
+                  "log-channel-list] [--port-file port-file-path] --server "
+                  "--listen port\n",
+          progname, subcommand);
+  exit(0);
 }
 
 static Status parse_listen_host_port(Socket::SocketProtocol &protocol,
@@ -310,8 +261,7 @@ static Status spawn_process(const char *progname, const FileSpec &prog,
                             const Socket *conn_socket, uint16_t gdb_port,
                             const lldb_private::Args &args,
                             const std::string &log_file,
-                            const StringRef log_channels, MainLoop &main_loop,
-                            bool multi_client) {
+                            const StringRef log_channels, MainLoop &main_loop) {
   Status error;
   SharedSocket shared_socket(conn_socket, error);
   if (error.Fail())
@@ -347,12 +297,9 @@ static Status spawn_process(const char *progname, const FileSpec &prog,
 
   launch_info.SetLaunchInSeparateProcessGroup(false);
 
-  // Set up process monitor callback based on whether we're in server mode.
-  if (multi_client)
-    // In server mode: empty callback (don't terminate when child exits).
+  if (g_server)
     launch_info.SetMonitorProcessCallback([](lldb::pid_t, int, int) {});
   else
-    // In single-client mode: terminate main loop when child exits.
     launch_info.SetMonitorProcessCallback([&main_loop](lldb::pid_t, int, int) {
       main_loop.AddPendingCallback(
           [](MainLoopBase &loop) { loop.RequestTermination(); });
@@ -424,101 +371,107 @@ int main_platform(int argc, char *argv[]) {
   signal(SIGPIPE, SIG_IGN);
   signal(SIGHUP, signal_handler);
 #endif
+  int long_option_index = 0;
+  Status error;
+  std::string listen_host_port;
+  int ch;
 
-  // Special handling for 'help' as first argument.
-  if (argc > 0 && strcmp(argv[0], "help") == 0) {
-    PlatformOptTable Opts;
-    display_usage(Opts, progname, subcommand);
-    return EXIT_SUCCESS;
-  }
+  std::string log_file;
+  StringRef
+      log_channels; // e.g. "lldb process threads:gdb-remote default:linux all"
 
-  Status error;
   shared_fd_t fd = SharedSocket::kInvalidFD;
+
   uint16_t gdbserver_port = 0;
+
   FileSpec socket_file;
+  bool show_usage = false;
+  int option_error = 0;
 
-  PlatformOptTable Opts;
-  BumpPtrAllocator Alloc;
-  StringSaver Saver(Alloc);
-  bool HasError = false;
+  std::string short_options(OptionParser::GetShortOptionString(g_long_options));
 
-  opt::InputArgList Args =
-      Opts.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](llvm::StringRef Msg) {
-        WithColor::error() << Msg << "\n";
-        HasError = true;
-      });
+#if __GLIBC__
+  optind = 0;
+#else
+  optreset = 1;
+  optind = 1;
+#endif
 
-  std::string Name =
-      (llvm::sys::path::filename(progname) + " " + subcommand).str();
-  std::string HelpText =
-      "Use '" + Name + " --help' for a complete list of options.\n";
+  while ((ch = getopt_long_only(argc, argv, short_options.c_str(),
+                                g_long_options, &long_option_index)) != -1) {
+    switch (ch) {
+    case 0: // Any optional that auto set themselves will return 0
+      break;
 
-  if (HasError) {
-    llvm::errs() << HelpText;
-    return EXIT_FAILURE;
-  }
+    case 'L':
+      listen_host_port.append(optarg);
+      break;
 
-  if (Args.hasArg(OPT_help)) {
-    display_usage(Opts, progname, subcommand);
-    return EXIT_SUCCESS;
-  }
+    case 'l': // Set Log File
+      if (optarg && optarg[0])
+        log_file.assign(optarg);
+      break;
 
-  // Parse arguments.
-  std::string listen_host_port = Args.getLastArgValue(OPT_listen).str();
-  std::string log_file = Args.getLastArgValue(OPT_log_file).str();
-  StringRef log_channels = Args.getLastArgValue(OPT_log_channels);
-  bool multi_client = Args.hasArg(OPT_server);
-  [[maybe_unused]] bool debug = Args.hasArg(OPT_debug);
-  [[maybe_unused]] bool verbose = Args.hasArg(OPT_verbose);
-
-  if (Args.hasArg(OPT_socket_file)) {
-    socket_file.SetFile(Args.getLastArgValue(OPT_socket_file),
-                        FileSpec::Style::native);
-  }
+    case 'c': // Log Channels
+      if (optarg && optarg[0])
+        log_channels = StringRef(optarg);
+      break;
 
-  if (Args.hasArg(OPT_gdbserver_port)) {
-    if (!llvm::to_integer(Args.getLastArgValue(OPT_gdbserver_port),
-                          gdbserver_port)) {
-      WithColor::error() << "invalid --gdbserver-port value\n";
-      return EXIT_FAILURE;
-    }
-  }
+    case 'f': // Socket file
+      if (optarg && optarg[0])
+        socket_file.SetFile(optarg, FileSpec::Style::native);
+      break;
 
-  if (Args.hasArg(OPT_child_platform_fd)) {
-    uint64_t _fd;
-    if (!llvm::to_integer(Args.getLastArgValue(OPT_child_platform_fd), _fd)) {
-      WithColor::error() << "invalid --child-platform-fd value\n";
-      return EXIT_FAILURE;
+    case 'P':
+    case 'm':
+    case 'M': {
+      uint16_t portnum;
+      if (!llvm::to_integer(optarg, portnum)) {
+        WithColor::error() << "invalid port number string " << optarg << "\n";
+        option_error = 2;
+        break;
+      }
+      // Note the condition gdbserver_port > HIGH_PORT is valid in case of using
+      // --child-platform-fd. Check gdbserver_port later.
+      if (ch == 'P')
+        gdbserver_port = portnum;
+      else if (gdbserver_port == 0)
+        gdbserver_port = portnum;
+    } break;
+
+    case 2: {
+      uint64_t _fd;
+      if (!llvm::to_integer(optarg, _fd)) {
+        WithColor::error() << "invalid fd " << optarg << "\n";
+        option_error = 6;
+      } else
+        fd = (shared_fd_t)_fd;
+    } break;
+
+    case 'h': /* fall-through is intentional */
+    case '?':
+      show_usage = true;
+      break;
     }
-    fd = (shared_fd_t)_fd;
   }
 
   if (!LLDBServerUtilities::SetupLogging(log_file, log_channels, 0))
     return -1;
 
   // Print usage and exit if no listening port is specified.
-  if (listen_host_port.empty() && fd == SharedSocket::kInvalidFD) {
-    WithColor::error() << "either --listen or --child-platform-fd is required\n"
-                       << HelpText;
-    return EXIT_FAILURE;
-  }
+  if (listen_host_port.empty() && fd == SharedSocket::kInvalidFD)
+    show_usage = true;
 
-  // Get remaining arguments for inferior.
-  std::vector<llvm::StringRef> Inputs;
-  for (opt::Arg *Arg : Args.filtered(OPT_INPUT))
-    Inputs.push_back(Arg->getValue());
-  if (opt::Arg *Arg = Args.getLastArg(OPT_REM)) {
-    for (const char *Val : Arg->getValues())
-      Inputs.push_back(Val);
+  if (show_usage || option_error) {
+    display_usage(progname, subcommand);
+    exit(option_error);
   }
 
+  // Skip any options we consumed with getopt_long_only.
+  argc -= optind;
+  argv += optind;
   lldb_private::Args inferior_arguments;
-  if (!Inputs.empty()) {
-    std::vector<const char *> args_ptrs;
-    for (const auto &Input : Inputs)
-      args_ptrs.push_back(Input.data());
-    inferior_arguments.SetArguments(args_ptrs.size(), args_ptrs.data());
-  }
+  inferior_arguments.SetArguments(argc, const_cast<const char **>(argv));
 
   FileSpec debugserver_path = GetDebugserverPath();
   if (!debugserver_path) {
@@ -561,7 +514,7 @@ int main_platform(int argc, char *argv[]) {
     platform.SetConnection(
         std::make_unique<ConnectionFileDescriptor>(std::move(socket)));
     client_handle(platform, inferior_arguments);
-    return EXIT_SUCCESS;
+    return 0;
   }
 
   if (gdbserver_port != 0 &&
@@ -569,7 +522,7 @@ int main_platform(int argc, char *argv[]) {
     WithColor::error() << llvm::formatv("Port number {0} is not in the "
                                         "valid user port range of {1} - {2}\n",
                                         gdbserver_port, LOW_PORT, HIGH_PORT);
-    return EXIT_FAILURE;
+    return 1;
   }
 
   Socket::SocketProtocol protocol = Socket::ProtocolUnixDomain;
@@ -606,7 +559,7 @@ int main_platform(int argc, char *argv[]) {
     if (error.Fail()) {
       fprintf(stderr, "failed to write socket id to %s: %s\n",
               socket_file.GetPath().c_str(), error.AsCString());
-      return EXIT_FAILURE;
+      return 1;
     }
   }
 
@@ -624,22 +577,22 @@ int main_platform(int argc, char *argv[]) {
     llvm::Expected<std::vector<MainLoopBase::ReadHandleUP>> platform_handles =
         platform_sock->Accept(
             main_loop, [progname, gdbserver_port, &inferior_arguments, log_file,
-                        log_channels, &main_loop, multi_client,
+                        log_channels, &main_loop,
                         &platform_handles](std::unique_ptr<Socket> sock_up) {
               printf("Connection established.\n");
               Status error = spawn_process(
                   progname, HostInfo::GetProgramFileSpec(), sock_up.get(),
                   gdbserver_port, inferior_arguments, log_file, log_channels,
-                  main_loop, multi_client);
+                  main_loop);
               if (error.Fail()) {
                 Log *log = GetLog(LLDBLog::Platform);
                 LLDB_LOGF(log, "spawn_process failed: %s", error.AsCString());
                 WithColor::error()
                     << "spawn_process failed: " << error.AsCString() << "\n";
-                if (!multi_client)
+                if (!g_server)
                   main_loop.RequestTermination();
               }
-              if (!multi_client)
+              if (!g_server)
                 platform_handles->clear();
             });
     if (!platform_handles) {
@@ -663,5 +616,5 @@ int main_platform(int argc, char *argv[]) {
 
   fprintf(stderr, "lldb-server exiting...\n");
 
-  return EXIT_SUCCESS;
+  return 0;
 }
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 6bd3278..36383b1 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -94,6 +94,9 @@ Changes to Vectorizers
 Changes to the AArch64 Backend
 ------------------------------
 
+* Assembler/disassembler support has been added for Armv9.7-A (2025)
+  architecture extensions.
+
 Changes to the AMDGPU Backend
 -----------------------------
 
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index a837cdd..3a4dc5a 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -462,6 +462,19 @@ private:
     ModuleMapType ModuleMap;
     // The bitcode modules to compile, if specified by the LTO Config.
     std::optional<ModuleMapType> ModulesToCompile;
+
+    void setPrevailingModuleForGUID(GlobalValue::GUID GUID, StringRef Module) {
+      PrevailingModuleForGUID[GUID] = Module;
+    }
+    bool isPrevailingModuleForGUID(GlobalValue::GUID GUID,
+                                   StringRef Module) const {
+      auto It = PrevailingModuleForGUID.find(GUID);
+      return It != PrevailingModuleForGUID.end() && It->second == Module;
+    }
+
+  private:
+    // Make this private so all accesses must go through above accessor methods
+    // to avoid inadvertently creating new entries on lookups.
     DenseMap<GlobalValue::GUID, StringRef> PrevailingModuleForGUID;
   } ThinLTO;
 
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 8e83b046..7e68ad2 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -115,9 +115,9 @@ struct ArchInfo {
   // Defines the following partial order, indicating when an architecture is
   // a superset of another:
   //
-  // v9.6a > v9.5a > v9.4a > v9.3a > v9.2a > v9.1a > v9a;
-  //                   v       v       v       v       v
-  //                 v8.9a > v8.8a > v8.7a > v8.6a > v8.5a > v8.4a > ... > v8a;
+  // v9.7a > v9.6a > v9.5a > v9.4a > v9.3a > v9.2a > v9.1a > v9a;
+  //                           v       v       v       v       v
+  //                         v8.9a > v8.8a > v8.7a > v8.6a > v8.5a > ... > v8a;
   //
   // v8r has no relation to anything. This is used to determine which
   // features to enable for a given architecture. See
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.def b/llvm/include/llvm/TargetParser/ARMTargetParser.def
index ff53aa1..0ada2e7 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParser.def
+++ b/llvm/include/llvm/TargetParser/ARMTargetParser.def
@@ -187,6 +187,11 @@ ARM_ARCH("armv9.6-a", ARMV9_6A, "9.6-A", "+v9.6a", ARMBuildAttrs::CPUArch::v9_A,
          (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
           ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
           ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
+ARM_ARCH("armv9.7-a", ARMV9_7A, "9.7-A", "+v9.7a", ARMBuildAttrs::CPUArch::v9_A,
+         FK_NEON_FP_ARMV8,
+         (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+          ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
+          ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
 ARM_ARCH("armv8-r", ARMV8R, "8-R", "+v8r", ARMBuildAttrs::CPUArch::v8_R,
          FK_FPV5_SP_D16,
          (ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 5e43444..0e82dd2 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -115,6 +115,7 @@ public:
   enum SubArchType {
     NoSubArch,
 
+    ARMSubArch_v9_7a,
     ARMSubArch_v9_6a,
     ARMSubArch_v9_5a,
     ARMSubArch_v9_4a,
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index 92a5b6f..b09f4ed 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -241,9 +241,13 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
       ColdBytes += TotalSize;
       // If we have the max cold context size from summary information and have
       // requested identification of contexts above a percentage of the max, see
-      // if this context qualifies.
-      if (MaxColdSize > 0 && MinPercentMaxColdSize < 100 &&
-          TotalSize * 100 >= MaxColdSize * MinPercentMaxColdSize)
+      // if this context qualifies. We should assume this is large if we rebuilt
+      // the trie from existing metadata (i.e. to update after inlining), in
+      // which case we don't have a MaxSize from the profile - we assume any
+      // context size info in existence on the metadata should be propagated.
+      if (BuiltFromExistingMetadata ||
+          (MaxColdSize > 0 && MinPercentMaxColdSize < 100 &&
+           TotalSize * 100 >= MaxColdSize * MinPercentMaxColdSize))
         LargeColdContext = true;
     }
     // Only add the context size info as metadata if we need it in the thin
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index bfa566a..dee0909 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1162,6 +1162,43 @@ SDValue SelectionDAGBuilder::getMemoryRoot() {
   return updateRoot(PendingLoads);
 }
 
+SDValue SelectionDAGBuilder::getFPOperationRoot(fp::ExceptionBehavior EB) {
+  // If the new exception behavior differs from that of the pending
+  // ones, chain up them and update the root.
+  switch (EB) {
+  case fp::ExceptionBehavior::ebMayTrap:
+  case fp::ExceptionBehavior::ebIgnore:
+    // Floating-point exceptions produced by such operations are not intended
+    // to be observed, so the sequence of these operations does not need to be
+    // preserved.
+    //
+    // They however must not be mixed with the instructions that have strict
+    // exception behavior. Placing an operation with 'ebIgnore' behavior between
+    // 'ebStrict' operations could distort the observed exception behavior.
+    if (!PendingConstrainedFPStrict.empty()) {
+      assert(PendingConstrainedFP.empty());
+      updateRoot(PendingConstrainedFPStrict);
+    }
+    break;
+  case fp::ExceptionBehavior::ebStrict:
+    // Floating-point exception produced by these operations may be observed, so
+    // they must be correctly chained. If trapping on FP exceptions is
+    // disabled, the exceptions can be observed only by functions that read
+    // exception flags, like 'llvm.get_fpenv' or 'fetestexcept'. It means that
+    // the order of operations is not significant between barriers.
+    //
+    // If trapping is enabled, each operation becomes an implicit observation
+    // point, so the operations must be sequenced according their original
+    // source order.
+    if (!PendingConstrainedFP.empty()) {
+      assert(PendingConstrainedFPStrict.empty());
+      updateRoot(PendingConstrainedFP);
+    }
+    // TODO: Add support for trapping-enabled scenarios.
+  }
+  return DAG.getRoot();
+}
+
 SDValue SelectionDAGBuilder::getRoot() {
   // Chain up all pending constrained intrinsics together with all
   // pending loads, by simply appending them to PendingLoads and
@@ -8298,6 +8335,30 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   }
 }
 
+void SelectionDAGBuilder::pushFPOpOutChain(SDValue Result,
+                                           fp::ExceptionBehavior EB) {
+  assert(Result.getNode()->getNumValues() == 2);
+  SDValue OutChain = Result.getValue(1);
+  assert(OutChain.getValueType() == MVT::Other);
+
+  // Instead of updating the root immediately, push the produced chain to the
+  // appropriate list, deferring the update until the root is requested. In this
+  // case, the nodes from the lists are chained using TokenFactor, indicating
+  // that the operations are independent.
+  //
+  // In particular, the root is updated before any call that might access the
+  // floating-point environment, except for constrained intrinsics.
+  switch (EB) {
+  case fp::ExceptionBehavior::ebMayTrap:
+  case fp::ExceptionBehavior::ebIgnore:
+    PendingConstrainedFP.push_back(OutChain);
+    break;
+  case fp::ExceptionBehavior::ebStrict:
+    PendingConstrainedFPStrict.push_back(OutChain);
+    break;
+  }
+}
+
 void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
     const ConstrainedFPIntrinsic &FPI) {
   SDLoc sdl = getCurSDLoc();
@@ -8305,42 +8366,16 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
   // We do not need to serialize constrained FP intrinsics against
   // each other or against (nonvolatile) loads, so they can be
   // chained like loads.
-  SDValue Chain = DAG.getRoot();
+  fp::ExceptionBehavior EB = *FPI.getExceptionBehavior();
+  SDValue Chain = getFPOperationRoot(EB);
   SmallVector<SDValue, 4> Opers;
   Opers.push_back(Chain);
   for (unsigned I = 0, E = FPI.getNonMetadataArgCount(); I != E; ++I)
     Opers.push_back(getValue(FPI.getArgOperand(I)));
 
-  auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) {
-    assert(Result.getNode()->getNumValues() == 2);
-
-    // Push node to the appropriate list so that future instructions can be
-    // chained up correctly.
-    SDValue OutChain = Result.getValue(1);
-    switch (EB) {
-    case fp::ExceptionBehavior::ebIgnore:
-      // The only reason why ebIgnore nodes still need to be chained is that
-      // they might depend on the current rounding mode, and therefore must
-      // not be moved across instruction that may change that mode.
-      [[fallthrough]];
-    case fp::ExceptionBehavior::ebMayTrap:
-      // These must not be moved across calls or instructions that may change
-      // floating-point exception masks.
-      PendingConstrainedFP.push_back(OutChain);
-      break;
-    case fp::ExceptionBehavior::ebStrict:
-      // These must not be moved across calls or instructions that may change
-      // floating-point exception masks or read floating-point exception flags.
-      // In addition, they cannot be optimized out even if unused.
-      PendingConstrainedFPStrict.push_back(OutChain);
-      break;
-    }
-  };
-
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   EVT VT = TLI.getValueType(DAG.getDataLayout(), FPI.getType());
   SDVTList VTs = DAG.getVTList(VT, MVT::Other);
-  fp::ExceptionBehavior EB = *FPI.getExceptionBehavior();
 
   SDNodeFlags Flags;
   if (EB == fp::ExceptionBehavior::ebIgnore)
@@ -8364,7 +8399,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
         !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
       Opers.pop_back();
       SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers, Flags);
-      pushOutChain(Mul, EB);
+      pushFPOpOutChain(Mul, EB);
       Opcode = ISD::STRICT_FADD;
       Opers.clear();
       Opers.push_back(Mul.getValue(1));
@@ -8395,7 +8430,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
   }
 
   SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers, Flags);
-  pushOutChain(Result, EB);
+  pushFPOpOutChain(Result, EB);
 
   SDValue FPResult = Result.getValue(0);
   setValue(&FPI, FPResult);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index c7577fa..47e19f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -195,6 +195,11 @@ private:
   /// Update root to include all chains from the Pending list.
   SDValue updateRoot(SmallVectorImpl<SDValue> &Pending);
 
+  /// Given a node representing a floating-point operation and its specified
+  /// exception behavior, this either updates the root or stores the node in
+  /// a list to be added to chains latter.
+  void pushFPOpOutChain(SDValue Result, fp::ExceptionBehavior EB);
+
   /// A unique monotonically increasing number used to order the SDNodes we
   /// create.
   unsigned SDNodeOrder;
@@ -300,6 +305,13 @@ public:
   /// memory node that may need to be ordered after any prior load instructions.
   SDValue getMemoryRoot();
 
+  /// Return the current virtual root of the Selection DAG, flushing
+  /// PendingConstrainedFP or PendingConstrainedFPStrict items if the new
+  /// exception behavior (specified by \p EB) differs from that of the pending
+  /// instructions. This must be done before emitting constrained FP operation
+  /// call.
+  SDValue getFPOperationRoot(fp::ExceptionBehavior EB);
+
   /// Similar to getMemoryRoot, but also flushes PendingConstrainedFP(Strict)
   /// items. This must be done before emitting any call other any other node
   /// that may need to be ordered after FP instructions due to other side
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 30b5e48..e19336e 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -403,9 +403,14 @@ void Module::setModuleFlag(ModFlagBehavior Behavior, StringRef Key,
                            Metadata *Val) {
   NamedMDNode *ModFlags = getOrInsertModuleFlagsMetadata();
   // Replace the flag if it already exists.
-  for (MDNode *Flag : ModFlags->operands()) {
+  for (unsigned i = 0; i < ModFlags->getNumOperands(); ++i) {
+    MDNode *Flag = ModFlags->getOperand(i);
     if (cast<MDString>(Flag->getOperand(1))->getString() == Key) {
-      Flag->replaceOperandWith(2, Val);
+      Type *Int32Ty = Type::getInt32Ty(Context);
+      Metadata *Ops[3] = {
+          ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Behavior)),
+          MDString::get(Context, Key), Val};
+      ModFlags->setOperand(i, MDNode::get(Context, Ops));
       return;
     }
   }
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 9d0fa11..4bc2a18 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -471,16 +471,14 @@ static void thinLTOInternalizeAndPromoteGUID(
     ValueInfo VI, function_ref<bool(StringRef, ValueInfo)> isExported,
     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
         isPrevailing) {
-  auto ExternallyVisibleCopies =
-      llvm::count_if(VI.getSummaryList(),
-                     [](const std::unique_ptr<GlobalValueSummary> &Summary) {
-                       return !GlobalValue::isLocalLinkage(Summary->linkage());
-                     });
-
   // Before performing index-based internalization and promotion for this GUID,
   // the local flag should be consistent with the summary list linkage types.
   VI.verifyLocal();
 
+  const bool SingleExternallyVisibleCopy =
+      VI.getSummaryList().size() == 1 &&
+      !GlobalValue::isLocalLinkage(VI.getSummaryList().front()->linkage());
+
   for (auto &S : VI.getSummaryList()) {
     // First see if we need to promote an internal value because it is not
     // exported.
@@ -543,7 +541,9 @@ static void thinLTOInternalizeAndPromoteGUID(
         GlobalValue::isExternalWeakLinkage(S->linkage()))
       continue;
 
-    if (isPrevailing(VI.getGUID(), S.get()) && ExternallyVisibleCopies == 1)
+    // We may have a single summary copy that is externally visible but not
+    // prevailing if the prevailing copy is in a native object.
+    if (SingleExternallyVisibleCopy && isPrevailing(VI.getGUID(), S.get()))
       S->setLinkage(GlobalValue::InternalLinkage);
   }
 }
@@ -1086,15 +1086,15 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
           GlobalValue::getGlobalIdentifier(Sym.getIRName(),
                                            GlobalValue::ExternalLinkage, ""));
       if (R.Prevailing)
-        ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+        ThinLTO.setPrevailingModuleForGUID(GUID, BM.getModuleIdentifier());
     }
   }
 
   if (Error Err =
           BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),
                          [&](GlobalValue::GUID GUID) {
-                           return ThinLTO.PrevailingModuleForGUID[GUID] ==
-                                  BM.getModuleIdentifier();
+                           return ThinLTO.isPrevailingModuleForGUID(
+                               GUID, BM.getModuleIdentifier());
                          }))
     return Err;
   LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n");
@@ -1108,8 +1108,8 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
           GlobalValue::getGlobalIdentifier(Sym.getIRName(),
                                            GlobalValue::ExternalLinkage, ""));
       if (R.Prevailing) {
-        assert(ThinLTO.PrevailingModuleForGUID[GUID] ==
-               BM.getModuleIdentifier());
+        assert(
+            ThinLTO.isPrevailingModuleForGUID(GUID, BM.getModuleIdentifier()));
 
         // For linker redefined symbols (via --wrap or --defsym) we want to
         // switch the linkage to `weak` to prevent IPOs from happening.
@@ -1988,7 +1988,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
                                LocalWPDTargetsMap);
 
   auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) {
-    return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
+    return ThinLTO.isPrevailingModuleForGUID(GUID, S->modulePath());
   };
   if (EnableMemProfContextDisambiguation) {
     MemProfContextDisambiguation ContextDisambiguation;
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 86f9548..a4529a5 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -73,9 +73,16 @@ def SVEUnsupported : AArch64Unsupported {
                       SVE2Unsupported.F);
 }
 
-let F = [HasSME2p2, HasSVE2p2_or_SME2p2, HasNonStreamingSVE_or_SME2p2,
-         HasNonStreamingSVE2p2_or_SME2p2] in
-def SME2p2Unsupported : AArch64Unsupported;
+def SME2p3Unsupported : AArch64Unsupported {
+  let F = [HasSVE2p3_or_SME2p3, HasSVE_B16MM];
+}
+
+def SME2p2Unsupported : AArch64Unsupported {
+  let F = !listconcat([HasSME2p2, HasSVE2p2_or_SME2p2,
+           HasNonStreamingSVE_or_SME2p2,
+           HasNonStreamingSVE2p2_or_SME2p2],
+           SME2p3Unsupported.F);
+}
 
 def SME2p1Unsupported : AArch64Unsupported {
   let F = !listconcat([HasSME2p1, HasSVE2p1_or_SME2p1,
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 46f5f0c..0e94b78 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -585,6 +585,47 @@ def FeatureSME_TMOP: ExtensionWithMArch<"sme-tmop", "SME_TMOP", "FEAT_SME_TMOP",
 def FeatureSSVE_FEXPA : ExtensionWithMArch<"ssve-fexpa", "SSVE_FEXPA", "FEAT_SSVE_FEXPA",
   "Enable SVE FEXPA instruction in Streaming SVE mode", [FeatureSME2]>;
 
+//===----------------------------------------------------------------------===//
+//  Armv9.7 Architecture Extensions
+//===----------------------------------------------------------------------===//
+
+def FeatureCMH : ExtensionWithMArch<"cmh", "CMH", "FEAT_CMH",
+  "Enable Armv9.7-A Contention Management Hints">;
+
+def FeatureLSCP : ExtensionWithMArch<"lscp", "LSCP", "FEAT_LSCP",
+  "Enable Armv9.7-A Load-acquire and store-release pair extension">;
+
+def FeatureTLBID: ExtensionWithMArch<"tlbid", "TLBID", "FEAT_TLBID",
+  "Enable Armv9.7-A TLBI Domains extension">;
+
+def FeatureMPAMv2: ExtensionWithMArch<"mpamv2", "MPAMv2", "FEAT_MPAMv2",
+  "Enable Armv9.7-A MPAMv2 Lookaside Buffer Invalidate instructions">;
+
+def FeatureMTETC: ExtensionWithMArch<"mtetc", "MTETC", "FEAT_MTETC",
+  "Enable Virtual Memory Tagging Extension">;
+
+def FeatureGCIE: ExtensionWithMArch<"gcie", "GCIE", "FEAT_GCIE",
+  "Enable GICv5 (Generic Interrupt Controller) CPU Interface Extension">;
+
+def FeatureSVE2p3 : ExtensionWithMArch<"sve2p3", "SVE2p3", "FEAT_SVE2p3",
+  "Enable Armv9.7-A Scalable Vector Extension 2.3 instructions", [FeatureSVE2p2]>;
+
+def FeatureSME2p3 : ExtensionWithMArch<"sme2p3", "SME2p3", "FEAT_SME2p3",
+  "Enable Armv9.7-A Scalable Matrix Extension 2.3 instructions", [FeatureSME2p2]>;
+
+def FeatureSVE_B16MM : ExtensionWithMArch<"sve-b16mm", "SVE_B16MM", "FEAT_SVE_B16MM",
+  "Enable Armv9.7-A SVE non-widening BFloat16 matrix multiply-accumulate", [FeatureSVE]>;
+
+def FeatureF16MM : ExtensionWithMArch<"f16mm", "F16MM", "FEAT_F16MM",
+  "Enable Armv9.7-A non-widening half-precision matrix multiply-accumulate", [FeatureFullFP16]>;
+
+def FeatureF16F32DOT : ExtensionWithMArch<"f16f32dot", "F16F32DOT", "FEAT_F16F32DOT",
+  "Enable Armv9.7-A Advanced SIMD half-precision dot product accumulate to single-precision", [FeatureNEON, FeatureFullFP16]>;
+
+def FeatureF16F32MM : ExtensionWithMArch<"f16f32mm", "F16F32MM", "FEAT_F16F32MM",
+  "Enable Armv9.7-A Advanced SIMD half-precision matrix multiply-accumulate to single-precision", [FeatureNEON, FeatureFullFP16]>;
+
+//===----------------------------------------------------------------------===//
 //  Other Features
 //===----------------------------------------------------------------------===//
 
@@ -939,9 +980,12 @@ def HasV9_5aOps : Architecture64<9, 5, "a", "v9.5a",
   [HasV9_4aOps, FeatureCPA],
   !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA,  FeatureLUT, FeatureFAMINMAX])>;
 def HasV9_6aOps : Architecture64<9, 6, "a", "v9.6a",
-  [HasV9_5aOps, FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2, FeatureLSUI, FeatureOCCMO],
-  !listconcat(HasV9_5aOps.DefaultExts, [FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2,
+  [HasV9_5aOps, FeatureCMPBR, FeatureLSUI, FeatureOCCMO],
+  !listconcat(HasV9_5aOps.DefaultExts, [FeatureCMPBR,
     FeatureLSUI, FeatureOCCMO])>;
+def HasV9_7aOps : Architecture64<9, 7, "a", "v9.7a",
+  [HasV9_6aOps, FeatureSVE2p3, FeatureFPRCVT],
+  !listconcat(HasV9_6aOps.DefaultExts, [FeatureSVE2p3, FeatureFPRCVT])>;
 def HasV8_0rOps : Architecture64<8, 0, "r", "v8r",
   [ //v8.1
     FeatureCRC, FeaturePAN, FeatureLSE, FeatureCONTEXTIDREL2,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 09ce713..eab1627 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1894,6 +1894,21 @@ def btihint_op : Operand<i32> {
   }];
 }
 
+def CMHPriorityHintOperand : AsmOperandClass {
+  let Name = "CMHPriorityHint";
+  let ParserMethod = "tryParseCMHPriorityHint";
+}
+
+def CMHPriorityHint_op : Operand<i32> {
+  let ParserMatchClass = CMHPriorityHintOperand;
+  let PrintMethod = "printCMHPriorityHintOp";
+  let MCOperandPredicate = [{
+    if (!MCOp.isImm())
+      return false;
+    return AArch64CMHPriorityHint::lookupCMHPriorityHintByEncoding(MCOp.getImm()) != nullptr;
+  }];
+}
+
 class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
                        "mrs", "\t$Rt, $systemreg"> {
   bits<16> systemreg;
@@ -4636,6 +4651,48 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterOperand regtype,
                                                   GPR64sp:$Rn, 0)>;
 }
 
+class BaseLoadStoreAcquirePairOffset<bits<4> opc, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, #0]", "", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  let Inst{31-23} = 0b110110010;
+  let Inst{22}    = L;
+  let Inst{21}    = 0b0;
+  let Inst{20-16} = Rt2;
+  let Inst{15-12} = opc;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass LoadAcquirePairOffset<bits<4> opc, string asm> {
+  let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+  def i : BaseLoadStoreAcquirePairOffset<opc, 0b1,
+                                  (outs GPR64:$Rt, GPR64:$Rt2),
+                                  (ins GPR64sp:$Rn), asm>,
+          Sched<[WriteAtomic, WriteLDHi]>;
+
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") GPR64:$Rt, GPR64:$Rt2,
+                                                  GPR64sp:$Rn)>;
+}
+
+
+multiclass StoreAcquirePairOffset<bits<4> opc, string asm> {
+  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+  def i : BaseLoadStoreAcquirePairOffset<opc, 0b0, (outs),
+                                  (ins GPR64:$Rt, GPR64:$Rt2,
+                                       GPR64sp:$Rn),
+                                  asm>,
+          Sched<[WriteSTP]>;
+
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") GPR64:$Rt, GPR64:$Rt2,
+                                                  GPR64sp:$Rn)>;
+}
+
 // (pre-indexed)
 class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
                               string asm>
@@ -6481,8 +6538,7 @@ multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm,
 }
 
 multiclass SIMDThreeSameVectorMLA<bit Q, string asm, SDPatternOperator op> {
-
-  def v8f16 : BaseSIMDThreeSameVectorDot<Q, 0b0, 0b11, 0b1111, asm, ".8h", ".16b",
+  def v16i8_v8f16 : BaseSIMDThreeSameVectorDot<Q, 0b0, 0b11, 0b1111, asm, ".8h", ".16b",
                                          V128, v8f16, v16i8, op>;
 }
 
@@ -6491,6 +6547,23 @@ multiclass SIMDThreeSameVectorMLAL<bit Q, bits<2> sz, string asm, SDPatternOpera
                                          V128, v4f32, v16i8, op>;
 }
 
+multiclass SIMDThreeSameVectorFMLA<string asm> {
+  def v8f16_v8f16 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b11, 0b1101, asm, ".8h", ".8h",
+                                          V128, v8f16, v8f16, null_frag>;
+}
+
+multiclass SIMDThreeSameVectorFMLAWiden<string asm> {
+  def v8f16_v4f32 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b01, 0b1101, asm, ".4s", ".8h",
+                                          V128, v4f32, v8f16, null_frag>;
+}
+
+multiclass SIMDThreeSameVectorFDot<string asm, SDPatternOperator OpNode = null_frag> {
+  def v4f16_v2f32 : BaseSIMDThreeSameVectorDot<0, 0, 0b10, 0b1111, asm, ".2s", ".4h", V64,
+                                         v2f32, v4f16, OpNode>;
+  def v8f16_v4f32 : BaseSIMDThreeSameVectorDot<1, 0, 0b10, 0b1111, asm, ".4s", ".8h", V128,
+                                         v4f32, v8f16, OpNode>;
+}
+
 // FP8 assembly/disassembly classes
 
 //----------------------------------------------------------------------------
@@ -9112,6 +9185,13 @@ multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
                                               V128, V128_lo, v4f32, v8f16, VectorIndexH, OpNode>;
 }
 
+multiclass SIMDThreeSameVectorFDOTIndex<string asm> {
+  def v4f16_v2f32 : BaseSIMDThreeSameVectorIndexS<0b0, 0b0, 0b01, 0b1001, asm, ".2s", ".4h", ".2h",
+                                           V64, v2f32, v4f16, VectorIndexS, null_frag>;
+  def v8f16_v4f32 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b01, 0b1001, asm, ".4s", ".8h",".2h",
+                                            V128, v4f32, v8f16, VectorIndexS, null_frag>;
+}
+
 //----------------------------------------------------------------------------
 // FP8 Advanced SIMD vector x indexed element
 multiclass SIMD_FP8_Dot2_Index<string asm, SDPatternOperator op> {
@@ -13227,3 +13307,34 @@ multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{
       let Predicates = [HasNEON, HasF8F32MM];
     }
 }
+
+//----------------------------------------------------------------------------
+// Contention Management Hints - FEAT_CMH
+//----------------------------------------------------------------------------
+
+class SHUHInst<string asm> : I<
+    (outs),
+    (ins CMHPriorityHint_op:$priority),
+    asm, "\t$priority", "", []>, Sched<[]> {
+  bits<1> priority;
+  let Inst{31-12} = 0b11010101000000110010;
+  let Inst{11-8}  = 0b0110;
+  let Inst{7-6}   = 0b01;
+  let Inst{5}     = priority;
+  let Inst{4-0}   = 0b11111;
+}
+
+multiclass SHUH<string asm> {
+  def NAME : SHUHInst<asm>;
+  def      : InstAlias<asm, (!cast<Instruction>(NAME) 0), 1>;
+}
+
+class STCPHInst<string asm> : I<
+    (outs),
+    (ins),
+    asm, "", "", []>, Sched<[]> {
+    let Inst{31-12} = 0b11010101000000110010;
+    let Inst{11-8}  = 0b0110;
+    let Inst{7-5}   = 0b100;
+    let Inst{4-0}   = 0b11111;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 92f260f..b74ca79 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -50,63 +50,44 @@ def HasV9_4a         : Predicate<"Subtarget->hasV9_4aOps()">,
                                  AssemblerPredicateWithAll<(all_of HasV9_4aOps), "armv9.4a">;
 def HasV8_0r         : Predicate<"Subtarget->hasV8_0rOps()">,
                                  AssemblerPredicateWithAll<(all_of HasV8_0rOps), "armv8-r">;
-
 def HasEL2VMSA       : Predicate<"Subtarget->hasEL2VMSA()">,
-                       AssemblerPredicateWithAll<(all_of FeatureEL2VMSA), "el2vmsa">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureEL2VMSA), "el2vmsa">;
 def HasEL3           : Predicate<"Subtarget->hasEL3()">,
-                       AssemblerPredicateWithAll<(all_of FeatureEL3), "el3">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureEL3), "el3">;
 def HasVH            : Predicate<"Subtarget->hasVH()">,
-                       AssemblerPredicateWithAll<(all_of FeatureVH), "vh">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureVH), "vh">;
 def HasLOR           : Predicate<"Subtarget->hasLOR()">,
-                       AssemblerPredicateWithAll<(all_of FeatureLOR), "lor">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureLOR), "lor">;
 def HasPAuth         : Predicate<"Subtarget->hasPAuth()">,
-                       AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">;
-
+                                 AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">;
 def HasPAuthLR       : Predicate<"Subtarget->hasPAuthLR()">,
-                       AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">;
-
+                                 AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">;
 def HasJS            : Predicate<"Subtarget->hasJS()">,
-                       AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">;
 def HasCCIDX         : Predicate<"Subtarget->hasCCIDX()">,
-                       AssemblerPredicateWithAll<(all_of FeatureCCIDX), "ccidx">;
-
-def HasComplxNum      : Predicate<"Subtarget->hasComplxNum()">,
-                       AssemblerPredicateWithAll<(all_of FeatureComplxNum), "complxnum">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureCCIDX), "ccidx">;
+def HasComplxNum     : Predicate<"Subtarget->hasComplxNum()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureComplxNum), "complxnum">;
 def HasNV            : Predicate<"Subtarget->hasNV()">,
-                       AssemblerPredicateWithAll<(all_of FeatureNV), "nv">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureNV), "nv">;
 def HasMPAM          : Predicate<"Subtarget->hasMPAM()">,
-                       AssemblerPredicateWithAll<(all_of FeatureMPAM), "mpam">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureMPAM), "mpam">;
 def HasDIT           : Predicate<"Subtarget->hasDIT()">,
-                       AssemblerPredicateWithAll<(all_of FeatureDIT), "dit">;
-
-def HasTRACEV8_4         : Predicate<"Subtarget->hasTRACEV8_4()">,
-                       AssemblerPredicateWithAll<(all_of FeatureTRACEV8_4), "tracev8.4">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureDIT), "dit">;
+def HasTRACEV8_4     : Predicate<"Subtarget->hasTRACEV8_4()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureTRACEV8_4), "tracev8.4">;
 def HasAM            : Predicate<"Subtarget->hasAM()">,
-                       AssemblerPredicateWithAll<(all_of FeatureAM), "am">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureAM), "am">;
 def HasSEL2          : Predicate<"Subtarget->hasSEL2()">,
-                       AssemblerPredicateWithAll<(all_of FeatureSEL2), "sel2">;
-
-def HasTLB_RMI          : Predicate<"Subtarget->hasTLB_RMI()">,
-                       AssemblerPredicateWithAll<(all_of FeatureTLB_RMI), "tlb-rmi">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureSEL2), "sel2">;
+def HasTLB_RMI       : Predicate<"Subtarget->hasTLB_RMI()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureTLB_RMI), "tlb-rmi">;
 def HasFlagM         : Predicate<"Subtarget->hasFlagM()">,
-                       AssemblerPredicateWithAll<(all_of FeatureFlagM), "flagm">;
-
-def HasRCPC_IMMO      : Predicate<"Subtarget->hasRCPC_IMMO()">,
-                       AssemblerPredicateWithAll<(all_of FeatureRCPC_IMMO), "rcpc-immo">;
-
+                                 AssemblerPredicateWithAll<(all_of FeatureFlagM), "flagm">;
+def HasRCPC_IMMO     : Predicate<"Subtarget->hasRCPC_IMMO()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureRCPC_IMMO), "rcpc-immo">;
 def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
-                               AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">;
+                                 AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">;
 def HasNEON          : Predicate<"Subtarget->isNeonAvailable()">,
                                  AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">;
 def HasSM4           : Predicate<"Subtarget->hasSM4()">,
@@ -149,13 +130,13 @@ def HasSVE2          : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasS
                                  AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">;
 def HasSVE2p1        : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVE2p1), "sve2p1">;
-def HasSVEAES       : Predicate<"Subtarget->hasSVEAES()">,
+def HasSVEAES        : Predicate<"Subtarget->hasSVEAES()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVEAES), "sve-aes">;
-def HasSVESM4       : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVESM4()">,
+def HasSVESM4        : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVESM4()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVESM4), "sve-sm4">;
-def HasSVESHA3      : Predicate<"Subtarget->hasSVESHA3()">,
+def HasSVESHA3       : Predicate<"Subtarget->hasSVESHA3()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVESHA3), "sve-sha3">;
-def HasSVEBitPerm   : Predicate<"Subtarget->hasSVEBitPerm()">,
+def HasSVEBitPerm    : Predicate<"Subtarget->hasSVEBitPerm()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVEBitPerm), "sve-bitperm">;
 def HasSMEandIsNonStreamingSafe
                      : Predicate<"Subtarget->hasSME()">,
@@ -196,7 +177,7 @@ def HasSSVE_FP8DOT2  : Predicate<"Subtarget->hasSSVE_FP8DOT2() || "
                                  "(Subtarget->hasSVE2() && Subtarget->hasFP8DOT2())">,
                                  AssemblerPredicateWithAll<(any_of FeatureSSVE_FP8DOT2,
                                                            (all_of FeatureSVE2, FeatureFP8DOT2)),
-                                "ssve-fp8dot2 or (sve2 and fp8dot2)">;
+                                 "ssve-fp8dot2 or (sve2 and fp8dot2)">;
 def HasFP8DOT4       : Predicate<"Subtarget->hasFP8DOT4()">,
                                  AssemblerPredicateWithAll<(all_of FeatureFP8DOT4), "fp8dot4">;
 def HasSSVE_FP8DOT4  : Predicate<"Subtarget->hasSSVE_FP8DOT4() || "
@@ -204,43 +185,60 @@ def HasSSVE_FP8DOT4  : Predicate<"Subtarget->hasSSVE_FP8DOT4() || "
                                  AssemblerPredicateWithAll<(any_of FeatureSSVE_FP8DOT4,
                                                            (all_of FeatureSVE2, FeatureFP8DOT4)),
                                  "ssve-fp8dot4 or (sve2 and fp8dot4)">;
-def HasLUT          : Predicate<"Subtarget->hasLUT()">,
+def HasLUT           : Predicate<"Subtarget->hasLUT()">,
                                  AssemblerPredicateWithAll<(all_of FeatureLUT), "lut">;
-def HasSME_LUTv2    : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">,
+def HasSME_LUTv2     : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSME_LUTv2), "sme-lutv2">;
-def HasSMEF8F16     : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">,
+def HasSMEF8F16      : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSMEF8F16), "sme-f8f16">;
-def HasSMEF8F32     : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">,
+def HasSMEF8F32      : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">;
-def HasSME_MOP4     : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_MOP4())">,
+def HasSME_MOP4      : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_MOP4())">,
                                  AssemblerPredicateWithAll<(all_of FeatureSME_MOP4), "sme-mop4">;
-def HasSME_TMOP     : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_TMOP())">,
+def HasSME_TMOP      : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_TMOP())">,
                                  AssemblerPredicateWithAll<(all_of FeatureSME_TMOP), "sme-tmop">;
-
-def HasCMPBR        : Predicate<"Subtarget->hasCMPBR()">,
+def HasCMPBR         : Predicate<"Subtarget->hasCMPBR()">,
                                  AssemblerPredicateWithAll<(all_of FeatureCMPBR), "cmpbr">;
-def HasF8F32MM      : Predicate<"Subtarget->hasF8F32MM()">,
+def HasF8F32MM       : Predicate<"Subtarget->hasF8F32MM()">,
                                  AssemblerPredicateWithAll<(all_of FeatureF8F32MM), "f8f32mm">;
-def HasF8F16MM      : Predicate<"Subtarget->hasF8F16MM()">,
+def HasF8F16MM       : Predicate<"Subtarget->hasF8F16MM()">,
                                  AssemblerPredicateWithAll<(all_of FeatureF8F16MM), "f8f16mm">;
-def HasFPRCVT       : Predicate<"Subtarget->hasFPRCVT()">,
+def HasFPRCVT        : Predicate<"Subtarget->hasFPRCVT()">,
                                  AssemblerPredicateWithAll<(all_of FeatureFPRCVT), "fprcvt">;
-def HasLSFE         : Predicate<"Subtarget->hasLSFE()">,
+def HasLSFE          : Predicate<"Subtarget->hasLSFE()">,
                                  AssemblerPredicateWithAll<(all_of FeatureLSFE), "lsfe">;
-def HasSME2p2       : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">,
+def HasSME2p2        : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSME2p2), "sme2p2">;
-def HasSVEAES2      : Predicate<"Subtarget->hasSVEAES2()">,
+def HasSVEAES2       : Predicate<"Subtarget->hasSVEAES2()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVEAES2), "sve-aes2">;
-def HasSVEBFSCALE   : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">,
+def HasSVEBFSCALE    : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVEBFSCALE), "sve-bfscale">;
-def HasSVE_F16F32MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">,
+def HasSVE_F16F32MM  : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVE_F16F32MM), "sve-f16f32mm">;
 def HasPCDPHINT      : Predicate<"Subtarget->hasPCDPHINT()">,
-                       AssemblerPredicateWithAll<(all_of FeaturePCDPHINT), "pcdphint">;
+                                 AssemblerPredicateWithAll<(all_of FeaturePCDPHINT), "pcdphint">;
 def HasLSUI          : Predicate<"Subtarget->hasLSUI()">,
-                       AssemblerPredicateWithAll<(all_of FeatureLSUI), "lsui">;
+                                 AssemblerPredicateWithAll<(all_of FeatureLSUI), "lsui">;
 def HasOCCMO         : Predicate<"Subtarget->hasOCCMO()">,
-                       AssemblerPredicateWithAll<(all_of FeatureOCCMO), "occmo">;
+                                 AssemblerPredicateWithAll<(all_of FeatureOCCMO), "occmo">;
+def HasCMH           : Predicate<"Subtarget->hasCMH()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureCMH), "cmh">;
+def HasLSCP          : Predicate<"Subtarget->hasLSCP()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureLSCP), "lscp">;
+def HasSVE2p2        : Predicate<"Subtarget->hasSVE2p2()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureSVE2p2), "sve2p2">;
+def HasSVE_B16MM     : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_B16MM()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureSVE_B16MM), "sve-b16mm">;
+def HasF16MM         : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasF16MM()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureF16MM), "f16mm">;
+def HasSVE2p3        : Predicate<"Subtarget->hasSVE2p3()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">;
+def HasSME2p3        : Predicate<"Subtarget->hasSME2p3()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureSME2p3), "sme2p3">;
+def HasF16F32DOT     : Predicate<"Subtarget->hasF16F32DOT()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureF16F32DOT), "f16f32dot">;
+def HasF16F32MM      : Predicate<"Subtarget->hasF16F32MM()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureF16F32MM), "f16f32mm">;
 
 // A subset of SVE(2) instructions are legal in Streaming SVE execution mode,
 // they should be enabled if either has been specified.
@@ -310,6 +308,10 @@ def HasSVE2p2_or_SME2p2
     : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2())">,
                 AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2),
                 "sme2p2 or sve2p2">;
+def HasSVE2p3_or_SME2p3
+    : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p3() || Subtarget->hasSME2p3())">,
+                AssemblerPredicateWithAll<(any_of FeatureSME2p3, FeatureSVE2p3),
+                "sme2p3 or sve2p3">;
 def HasNonStreamingSVE2p2_or_SME2p2
     : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p2()) ||"
                 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
@@ -328,100 +330,110 @@ def HasNEONandIsStreamingSafe
       AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
 // A subset of NEON instructions are legal in Streaming SVE mode only with +sme2p2.
 def HasNEONandIsSME2p2StreamingSafe
-    : Predicate<"Subtarget->isNeonAvailable() || (Subtarget->hasNEON() && Subtarget->hasSME2p2())">,
-    AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
+                     : Predicate<"Subtarget->isNeonAvailable() || (Subtarget->hasNEON() && Subtarget->hasSME2p2())">,
+                                 AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
 def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,
                                  AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
 def HasAltNZCV       : Predicate<"Subtarget->hasAlternativeNZCV()">,
-                       AssemblerPredicateWithAll<(all_of FeatureAltFPCmp), "altnzcv">;
+                                 AssemblerPredicateWithAll<(all_of FeatureAltFPCmp), "altnzcv">;
 def HasFRInt3264     : Predicate<"Subtarget->hasFRInt3264()">,
-                       AssemblerPredicateWithAll<(all_of FeatureFRInt3264), "frint3264">;
+                                 AssemblerPredicateWithAll<(all_of FeatureFRInt3264), "frint3264">;
 def HasSB            : Predicate<"Subtarget->hasSB()">,
-                       AssemblerPredicateWithAll<(all_of FeatureSB), "sb">;
-def HasPredRes      : Predicate<"Subtarget->hasPredRes()">,
-                       AssemblerPredicateWithAll<(all_of FeaturePredRes), "predres">;
+                                 AssemblerPredicateWithAll<(all_of FeatureSB), "sb">;
+def HasPredRes       : Predicate<"Subtarget->hasPredRes()">,
+                                 AssemblerPredicateWithAll<(all_of FeaturePredRes), "predres">;
 def HasCCDP          : Predicate<"Subtarget->hasCCDP()">,
-                       AssemblerPredicateWithAll<(all_of FeatureCacheDeepPersist), "ccdp">;
+                                 AssemblerPredicateWithAll<(all_of FeatureCacheDeepPersist), "ccdp">;
 def HasBTI           : Predicate<"Subtarget->hasBTI()">,
-                       AssemblerPredicateWithAll<(all_of FeatureBranchTargetId), "bti">;
+                                 AssemblerPredicateWithAll<(all_of FeatureBranchTargetId), "bti">;
 def HasMTE           : Predicate<"Subtarget->hasMTE()">,
-                       AssemblerPredicateWithAll<(all_of FeatureMTE), "mte">;
+                                 AssemblerPredicateWithAll<(all_of FeatureMTE), "mte">;
 def HasTME           : Predicate<"Subtarget->hasTME()">,
-                       AssemblerPredicateWithAll<(all_of FeatureTME), "tme">;
+                                 AssemblerPredicateWithAll<(all_of FeatureTME), "tme">;
 def HasETE           : Predicate<"Subtarget->hasETE()">,
-                       AssemblerPredicateWithAll<(all_of FeatureETE), "ete">;
+                                 AssemblerPredicateWithAll<(all_of FeatureETE), "ete">;
 def HasTRBE          : Predicate<"Subtarget->hasTRBE()">,
-                       AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">;
+                                 AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">;
 def HasBF16          : Predicate<"Subtarget->hasBF16()">,
-                       AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">;
+                                 AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">;
 def HasNoBF16        : Predicate<"!Subtarget->hasBF16()">;
 def HasMatMulInt8    : Predicate<"Subtarget->hasMatMulInt8()">,
-                       AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">;
+                                 AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">;
 def HasMatMulFP32    : Predicate<"Subtarget->hasMatMulFP32()">,
-                       AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">;
+                                 AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">;
 def HasMatMulFP64    : Predicate<"Subtarget->hasMatMulFP64()">,
-                       AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">;
+                                 AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">;
 def HasXS            : Predicate<"Subtarget->hasXS()">,
-                       AssemblerPredicateWithAll<(all_of FeatureXS), "xs">;
+                                 AssemblerPredicateWithAll<(all_of FeatureXS), "xs">;
 def HasWFxT          : Predicate<"Subtarget->hasWFxT()">,
-                       AssemblerPredicateWithAll<(all_of FeatureWFxT), "wfxt">;
+                                 AssemblerPredicateWithAll<(all_of FeatureWFxT), "wfxt">;
 def HasLS64          : Predicate<"Subtarget->hasLS64()">,
-                       AssemblerPredicateWithAll<(all_of FeatureLS64), "ls64">;
+                                 AssemblerPredicateWithAll<(all_of FeatureLS64), "ls64">;
 def HasBRBE          : Predicate<"Subtarget->hasBRBE()">,
-                       AssemblerPredicateWithAll<(all_of FeatureBRBE), "brbe">;
+                                 AssemblerPredicateWithAll<(all_of FeatureBRBE), "brbe">;
 def HasSPE_EEF       : Predicate<"Subtarget->hasSPE_EEF()">,
-                       AssemblerPredicateWithAll<(all_of FeatureSPE_EEF), "spe-eef">;
+                                 AssemblerPredicateWithAll<(all_of FeatureSPE_EEF), "spe-eef">;
 def HasHBC           : Predicate<"Subtarget->hasHBC()">,
-                       AssemblerPredicateWithAll<(all_of FeatureHBC), "hbc">;
+                                 AssemblerPredicateWithAll<(all_of FeatureHBC), "hbc">;
 def HasMOPS          : Predicate<"Subtarget->hasMOPS()">,
-                       AssemblerPredicateWithAll<(all_of FeatureMOPS), "mops">;
+                                 AssemblerPredicateWithAll<(all_of FeatureMOPS), "mops">;
 def HasCLRBHB        : Predicate<"Subtarget->hasCLRBHB()">,
-                       AssemblerPredicateWithAll<(all_of FeatureCLRBHB), "clrbhb">;
+                                 AssemblerPredicateWithAll<(all_of FeatureCLRBHB), "clrbhb">;
 def HasSPECRES2      : Predicate<"Subtarget->hasSPECRES2()">,
-                       AssemblerPredicateWithAll<(all_of FeatureSPECRES2), "specres2">;
+                                 AssemblerPredicateWithAll<(all_of FeatureSPECRES2), "specres2">;
 def HasITE           : Predicate<"Subtarget->hasITE()">,
-                       AssemblerPredicateWithAll<(all_of FeatureITE), "ite">;
+                                 AssemblerPredicateWithAll<(all_of FeatureITE), "ite">;
 def HasTHE           : Predicate<"Subtarget->hasTHE()">,
-                       AssemblerPredicateWithAll<(all_of FeatureTHE), "the">;
+                                 AssemblerPredicateWithAll<(all_of FeatureTHE), "the">;
 def HasRCPC3         : Predicate<"Subtarget->hasRCPC3()">,
-                       AssemblerPredicateWithAll<(all_of FeatureRCPC3), "rcpc3">;
+                                 AssemblerPredicateWithAll<(all_of FeatureRCPC3), "rcpc3">;
 def HasLSE128        : Predicate<"Subtarget->hasLSE128()">,
-                       AssemblerPredicateWithAll<(all_of FeatureLSE128), "lse128">;
+                                 AssemblerPredicateWithAll<(all_of FeatureLSE128), "lse128">;
 def HasD128          : Predicate<"Subtarget->hasD128()">,
-                       AssemblerPredicateWithAll<(all_of FeatureD128), "d128">;
+                                 AssemblerPredicateWithAll<(all_of FeatureD128), "d128">;
 def HasCHK           : Predicate<"Subtarget->hasCHK()">,
-                       AssemblerPredicateWithAll<(all_of FeatureCHK), "chk">;
+                                 AssemblerPredicateWithAll<(all_of FeatureCHK), "chk">;
 def HasGCS           : Predicate<"Subtarget->hasGCS()">,
-                       AssemblerPredicateWithAll<(all_of FeatureGCS), "gcs">;
+                                 AssemblerPredicateWithAll<(all_of FeatureGCS), "gcs">;
 def HasCPA           : Predicate<"Subtarget->hasCPA()">,
-                       AssemblerPredicateWithAll<(all_of FeatureCPA), "cpa">;
+                                 AssemblerPredicateWithAll<(all_of FeatureCPA), "cpa">;
+def HasTLBID         : Predicate<"Subtarget->hasTLBID()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureTLBID), "tlbid">;
+def HasMPAMv2        : Predicate<"Subtarget->hasMPAMv2()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureMPAMv2), "mpamv2">;
+def HasMTETC         : Predicate<"Subtarget->hasMTETC()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureMTETC), "mtetc">;
+def HasGCIE          : Predicate<"Subtarget->hasGCIE()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureGCIE), "gcie">;
 def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
 def IsWindows        : Predicate<"Subtarget->isTargetWindows()">;
 def UseExperimentalZeroingPseudos
-    : Predicate<"Subtarget->useExperimentalZeroingPseudos()">;
+                     : Predicate<"Subtarget->useExperimentalZeroingPseudos()">;
 def UseAlternateSExtLoadCVTF32
-    : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
+                     : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
 
 def UseNegativeImmediates
-    : Predicate<"false">, AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)),
-                                             "NegativeImmediates">;
+                     : Predicate<"false">,
+                                 AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)),
+                                 "NegativeImmediates">;
 
-def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">;
+def UseScalarIncVL   : Predicate<"Subtarget->useScalarIncVL()">;
 
 def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
 
-def HasFastIncVL : Predicate<"!Subtarget->hasDisableFastIncVL()">;
+def HasFastIncVL     : Predicate<"!Subtarget->hasDisableFastIncVL()">;
 
-def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
+def UseSVEFPLD1R     : Predicate<"!Subtarget->noSVEFPLD1R()">;
 
-def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">;
+def UseLDAPUR        : Predicate<"!Subtarget->avoidLDAPUR()">;
 
 def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
                                   SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                                        SDTCisInt<1>]>>;
 
-def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">;
+def AllowMisalignedMemAccesses
+                    : Predicate<"!Subtarget->requiresStrictAlign()">;
 
 def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
 
@@ -3692,6 +3704,12 @@ def UDF : UDFType<0, "udf">;
 // Load instructions.
 //===----------------------------------------------------------------------===//
 
+let Predicates = [HasLSCP] in {
+defm LDAP  : LoadAcquirePairOffset<0b0101, "ldap">;
+defm LDAPP : LoadAcquirePairOffset<0b0111, "ldapp">;
+defm STLP  : StoreAcquirePairOffset<0b0101, "stlp">;
+}
+
 // Pair (indexed, offset)
 defm LDPW : LoadPairOffset<0b00, 0, GPR32z, simm7s4, "ldp">;
 defm LDPX : LoadPairOffset<0b10, 0, GPR64z, simm7s8, "ldp">;
@@ -11244,8 +11262,28 @@ let Predicates = [HasLSFE] in {
   def STBFMINNML : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b111, "stbfminnml">;
 }
 
+let Predicates = [HasF16F32DOT] in {
+  defm FDOT :SIMDThreeSameVectorFDot<"fdot">;
+  defm FDOTlane: SIMDThreeSameVectorFDOTIndex<"fdot">;
+}
+
+let Predicates = [HasF16MM] in
+  defm FMMLA : SIMDThreeSameVectorFMLA<"fmmla">;
+
+let Predicates = [HasF16F32MM] in
+  defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">;
+
 let Uses = [FPMR, FPCR] in
-defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
+  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
+
+//===----------------------------------------------------------------------===//
+// Contention Management Hints (FEAT_CMH)
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasCMH] in {
+  defm SHUH  : SHUH<"shuh">;       // Shared Update Hint instruction
+  def STCPH  : STCPHInst<"stcph">; // Store Concurrent Priority Hint instruction
+}
 
 include "AArch64InstrAtomics.td"
 include "AArch64SVEInstrInfo.td"
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 47144c7..cd94a25 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1341,6 +1341,10 @@ def Z_q  : RegisterOperand<ZPR,  "printTypedVectorList<0,'q'>"> {
   let ParserMatchClass = ZPRVectorList<128, 1>;
 }
 
+def ZZ_Any  : RegisterOperand<ZPR2, "printTypedVectorList<0,0>"> {
+  let ParserMatchClass = ZPRVectorList<0, 2>;
+}
+
 def ZZ_b  : RegisterOperand<ZPR2, "printTypedVectorList<0,'b'>"> {
   let ParserMatchClass = ZPRVectorList<8, 2>;
 }
@@ -1361,6 +1365,10 @@ def ZZ_q  : RegisterOperand<ZPR2, "printTypedVectorList<0,'q'>"> {
   let ParserMatchClass = ZPRVectorList<128, 2>;
 }
 
+def ZZZ_Any  : RegisterOperand<ZPR3, "printTypedVectorList<0,0>"> {
+  let ParserMatchClass = ZPRVectorList<0, 3>;
+}
+
 def ZZZ_b  : RegisterOperand<ZPR3, "printTypedVectorList<0,'b'>"> {
   let ParserMatchClass = ZPRVectorList<8, 3>;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index e552afe..752b185 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1173,3 +1173,14 @@ let Predicates = [HasSME_MOP4, HasSMEF64F64] in {
   defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">;
   defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">;
 }
+
+//===----------------------------------------------------------------------===//
+// SME2.3 instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasSME2p3] in {
+  def LUTI6_ZTZ       : sme2_lut_single<"luti6">;
+  def LUTI6_4ZT3Z     : sme2_luti6_zt_consecutive<"luti6">;
+  def LUTI6_S_4ZT3Z   : sme2_luti6_zt_strided<"luti6">;
+  def LUTI6_4Z2Z2ZI   : sme2_luti6_vector_vg4_consecutive<"luti6">;
+  def LUTI6_S_4Z2Z2ZI : sme2_luti6_vector_vg4_strided<"luti6">;
+} // [HasSME2p3]
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 98a128e..3b268dc 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2569,7 +2569,7 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {
 } // End HasBF16, HasSVE_or_SME
 
 let Predicates = [HasBF16, HasSVE] in {
-  defm BFMMLA_ZZZ_HtoS : sve_fp_matrix_mla<0b01, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>;
+  defm BFMMLA_ZZZ_HtoS : sve_fp_matrix_mla<0b011, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>;
 } // End HasBF16, HasSVE
 
 let Predicates = [HasBF16, HasSVE_or_SME] in {
@@ -3680,15 +3680,15 @@ let Predicates = [HasSVE_or_SME, HasMatMulInt8] in {
 } // End HasSVE_or_SME, HasMatMulInt8
 
 let Predicates = [HasSVE, HasMatMulFP32] in {
-  defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0b10, "fmmla", ZPR32, ZPR32, int_aarch64_sve_fmmla, nxv4f32, nxv4f32>;
+  defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0b101, "fmmla", ZPR32, ZPR32, int_aarch64_sve_fmmla, nxv4f32, nxv4f32>;
 } // End HasSVE, HasMatMulFP32
 
 let Predicates = [HasSVE_F16F32MM] in {
-  def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b00, "fmmla", ZPR32, ZPR16>;
+  def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>;
 } // End HasSVE_F16F32MM
 
 let Predicates = [HasSVE, HasMatMulFP64] in {
-  defm FMMLA_ZZZ_D : sve_fp_matrix_mla<0b11, "fmmla", ZPR64, ZPR64, int_aarch64_sve_fmmla, nxv2f64, nxv2f64>;
+  defm FMMLA_ZZZ_D : sve_fp_matrix_mla<0b111, "fmmla", ZPR64, ZPR64, int_aarch64_sve_fmmla, nxv2f64, nxv2f64>;
   defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8,  nxv16i8, nxv16i1, AArch64ld1ro_z>;
   defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1,  AArch64ld1ro_z>;
   defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1,  AArch64ld1ro_z>;
@@ -4272,9 +4272,9 @@ def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$
 defm SQCVTN_Z2Z_StoH  : sve2p1_multi_vec_extract_narrow<"sqcvtn", 0b00, int_aarch64_sve_sqcvtn_x2>;
 defm UQCVTN_Z2Z_StoH  : sve2p1_multi_vec_extract_narrow<"uqcvtn", 0b01, int_aarch64_sve_uqcvtn_x2>;
 defm SQCVTUN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"sqcvtun", 0b10, int_aarch64_sve_sqcvtun_x2>;
-defm SQRSHRN_Z2ZI_StoH  : sve2p1_multi_vec_shift_narrow<"sqrshrn", 0b101, int_aarch64_sve_sqrshrn_x2>;
-defm UQRSHRN_Z2ZI_StoH  : sve2p1_multi_vec_shift_narrow<"uqrshrn", 0b111, int_aarch64_sve_uqrshrn_x2>;
-defm SQRSHRUN_Z2ZI_StoH : sve2p1_multi_vec_shift_narrow<"sqrshrun", 0b001, int_aarch64_sve_sqrshrun_x2>;
+defm SQRSHRN_Z2ZI_StoH  : sve_multi_vec_shift_narrow<"sqrshrn", 0b101, int_aarch64_sve_sqrshrn_x2>;
+defm UQRSHRN_Z2ZI_StoH  : sve_multi_vec_shift_narrow<"uqrshrn", 0b111, int_aarch64_sve_uqrshrn_x2>;
+defm SQRSHRUN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqrshrun", 0b001, int_aarch64_sve_sqrshrun_x2>;
 
 defm WHILEGE_2PXX : sve2p1_int_while_rr_pair<"whilege", 0b000>;
 defm WHILEGT_2PXX : sve2p1_int_while_rr_pair<"whilegt", 0b001>;
@@ -4615,6 +4615,75 @@ let Predicates = [HasSVE2p2_or_SME2p2] in {
   defm REVD_ZPzZ : sve_int_perm_rev_revd_z<"revd", AArch64revd_mt>;
 } // End HasSME2p2orSVE2p2
 
+
+//===----------------------------------------------------------------------===//
+// SME2.3 or SVE2.3 instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasSVE2p3_or_SME2p3] in {
+  // SVE2 Add pairwise within quadword vector segments (unpredicated)
+  defm ADDQP_ZZZ     : sve2_int_mul<0b110, "addqp",   null_frag>;
+
+  // SVE2 Add subtract/subtract pairwise
+  defm ADDSUBP_ZZZ   : sve2_int_mul<0b111, "addsubp", null_frag>;
+  defm SUBP_ZPmZZ    : sve2_int_arith_pred<0b100001, "subp", null_frag>;
+
+  // SVE2 integer absolute difference and accumulate long
+  defm SABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b0, "sabal">;
+  defm UABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b1, "uabal">;
+
+  // SVE2 integer dot product
+  def SDOT_ZZZ_BtoH : sve_intx_dot<0b01, 0b00000, 0b0, "sdot", ZPR16, ZPR8>;
+  def UDOT_ZZZ_BtoH : sve_intx_dot<0b01, 0b00000, 0b1, "udot", ZPR16, ZPR8>;
+
+  // SVE2 integer indexed dot product
+  def SDOT_ZZZI_BtoH : sve_intx_dot_by_indexed_elem_x<0b0, "sdot">;
+  def UDOT_ZZZI_BtoH : sve_intx_dot_by_indexed_elem_x<0b1, "udot">;
+
+  // SVE2 fp convert, narrow and interleave to integer, rounding toward zero
+  defm FCVTZSN_Z2Z : sve2_fp_to_int_downcvt<"fcvtzsn", 0b0>;
+  defm FCVTZUN_Z2Z : sve2_fp_to_int_downcvt<"fcvtzun", 0b1>;
+
+  // SVE2 signed/unsigned integer convert to floating-point
+  defm SCVTF_ZZ   : sve2_int_to_fp_upcvt<"scvtf",   0b00>;
+  defm SCVTFLT_ZZ : sve2_int_to_fp_upcvt<"scvtflt", 0b10>;
+  defm UCVTF_ZZ   : sve2_int_to_fp_upcvt<"ucvtf",   0b01>;
+  defm UCVTFLT_ZZ : sve2_int_to_fp_upcvt<"ucvtflt", 0b11>;
+
+  // SVE2 saturating shift right narrow by immediate and interleave
+  defm SQRSHRN_Z2ZI_HtoB  : sve_multi_vec_round_shift_narrow<"sqrshrn",  0b101>;
+  defm SQRSHRUN_Z2ZI_HtoB : sve_multi_vec_round_shift_narrow<"sqrshrun", 0b001>;
+  defm SQSHRN_Z2ZI_HtoB   : sve_multi_vec_round_shift_narrow<"sqshrn",   0b000>;
+  defm SQSHRUN_Z2ZI_HtoB  : sve_multi_vec_round_shift_narrow<"sqshrun",  0b100>;
+  defm UQRSHRN_Z2ZI_HtoB  : sve_multi_vec_round_shift_narrow<"uqrshrn",  0b111>;
+  defm UQSHRN_Z2ZI_HtoB   : sve_multi_vec_round_shift_narrow<"uqshrn",   0b010>;
+  defm SQSHRUN_Z2ZI_StoH  : sve_multi_vec_shift_narrow<"sqshrun",  0b100, null_frag>;
+  defm SQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"sqshrn",   0b000, null_frag>;
+  defm UQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"uqshrn",   0b010, null_frag>;
+
+  defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
+} // End HasSME2p3orSVE2p3
+
+//===----------------------------------------------------------------------===//
+// SVE2.3 instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasSVE2p3] in {
+  def LUTI6_Z2ZZ : sve2_luti6_vector<"luti6">;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE_B16MM Instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasSVE_B16MM] in {
+  def BFMMLA_ZZZ_H : sve_fp_matrix_mla<0b110, "bfmmla", ZPR16, ZPR16>;
+}
+
+//===----------------------------------------------------------------------===//
+// F16MM Instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasSVE2p2, HasF16MM] in {
+  def FMMLA_ZZZ_H : sve_fp_matrix_mla<0b100, "fmmla", ZPR16, ZPR16>;
+}
+
 //===----------------------------------------------------------------------===//
 // SME2.2 or SVE2.2 instructions - Legal in streaming mode iff target has SME2p2
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 9438917..ae46d71 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -205,6 +205,7 @@ def lookupDCByName : SearchIndex {
   let Key = ["Name"];
 }
 
+//                Op1    CRn     CRm     Op2
 def : DC<"ZVA",   0b011, 0b0111, 0b0100, 0b001>;
 def : DC<"IVAC",  0b000, 0b0111, 0b0110, 0b001>;
 def : DC<"ISW",   0b000, 0b0111, 0b0110, 0b010>;
@@ -241,6 +242,11 @@ def : DC<"CIGDVAC", 0b011, 0b0111, 0b1110, 0b101>;
 def : DC<"GZVA",    0b011, 0b0111, 0b0100, 0b100>;
 }
 
+let Requires = [{ {AArch64::FeatureMTETC} }] in {
+def : DC<"ZGBVA",   0b011, 0b0111, 0b0100, 0b101>;
+def : DC<"GBVA",    0b011, 0b0111, 0b0100, 0b111>;
+}
+
 let Requires = [{ {AArch64::FeatureMEC} }] in {
 def : DC<"CIPAE",   0b100, 0b0111, 0b1110, 0b000>;
 def : DC<"CIGDPAE", 0b100, 0b0111, 0b1110, 0b111>;
@@ -813,11 +819,26 @@ def : BTI<"j",  0b100>;
 def : BTI<"jc", 0b110>;
 
 //===----------------------------------------------------------------------===//
+// CMHPriority instruction options.
+//===----------------------------------------------------------------------===//
+
+class CMHPriorityHint<string name, bits<1> encoding> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<1> Encoding;
+  let Encoding = encoding;
+}
+
+def : CMHPriorityHint<"ph", 0b1>;
+
+//===----------------------------------------------------------------------===//
 // TLBI (translation lookaside buffer invalidate) instruction options.
 //===----------------------------------------------------------------------===//
 
 class TLBICommon<string name, bits<3> op1, bits<4> crn, bits<4> crm,
-                 bits<3> op2, bit needsreg> {
+                 bits<3> op2, bit needsreg, bit optionalreg> {
   string Name = name;
   bits<14> Encoding;
   let Encoding{13-11} = op1;
@@ -825,24 +846,25 @@ class TLBICommon<string name, bits<3> op1, bits<4> crn, bits<4> crm,
   let Encoding{6-3} = crm;
   let Encoding{2-0} = op2;
   bit NeedsReg = needsreg;
+  bit OptionalReg = optionalreg;
   list<string> Requires = [];
   list<string> ExtraRequires = [];
   code RequiresStr = [{ { }] # !interleave(Requires # ExtraRequires, [{, }]) # [{ } }];
 }
 
 class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm,
-                bits<3> op2, bit needsreg>
-  : TLBICommon<name, op1, crn, crm, op2, needsreg>;
+                bits<3> op2, bit needsreg, bit optionalreg>
+  : TLBICommon<name, op1, crn, crm, op2, needsreg, optionalreg>;
 
 class TLBIPEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm,
-                 bits<3> op2, bit needsreg>
-  : TLBICommon<name, op1, crn, crm, op2, needsreg>;
+                 bits<3> op2, bit needsreg, bit optionalreg>
+  : TLBICommon<name, op1, crn, crm, op2, needsreg, optionalreg>;
 
 multiclass TLBITableBase {
   def NAME # Table : GenericTable {
     let FilterClass = NAME # "Entry";
     let CppTypeName = NAME;
-    let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"];
+    let Fields = ["Name", "Encoding", "NeedsReg", "OptionalReg", "RequiresStr"];
     let PrimaryKey = ["Encoding"];
     let PrimaryKeyName = "lookup" # NAME # "ByEncoding";
   }
@@ -856,60 +878,60 @@ defm TLBI  : TLBITableBase;
 defm TLBIP : TLBITableBase;
 
 multiclass TLBI<string name, bit hasTLBIP, bits<3> op1, bits<4> crn, bits<4> crm,
-             bits<3> op2, bit needsreg = 1> {
-  def : TLBIEntry<name, op1, crn, crm, op2, needsreg>;
-  def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> {
+             bits<3> op2, bit needsreg = 1, bit optionalreg = 0> {
+  def : TLBIEntry<name, op1, crn, crm, op2, needsreg, optionalreg>;
+  def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg, optionalreg> {
     let Encoding{7} = 1;
     let ExtraRequires = ["AArch64::FeatureXS"];
   }
   if !eq(hasTLBIP, true) then {
-    def : TLBIPEntry<name, op1, crn, crm, op2, needsreg>;
-    def : TLBIPEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> {
+    def : TLBIPEntry<name, op1, crn, crm, op2, needsreg, optionalreg>;
+    def : TLBIPEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg, optionalreg> {
       let Encoding{7} = 1;
       let ExtraRequires = ["AArch64::FeatureXS"];
     }
   }
 }
 
-//                   hasTLBIP  op1    CRn     CRm     op2    needsreg
+//                   hasTLBIP  op1    CRn     CRm     op2    needsreg, optreg
 defm : TLBI<"IPAS2E1IS",    1, 0b100, 0b1000, 0b0000, 0b001>;
 defm : TLBI<"IPAS2LE1IS",   1, 0b100, 0b1000, 0b0000, 0b101>;
-defm : TLBI<"VMALLE1IS",    0, 0b000, 0b1000, 0b0011, 0b000, 0>;
-defm : TLBI<"ALLE2IS",      0, 0b100, 0b1000, 0b0011, 0b000, 0>;
-defm : TLBI<"ALLE3IS",      0, 0b110, 0b1000, 0b0011, 0b000, 0>;
+defm : TLBI<"VMALLE1IS",    0, 0b000, 0b1000, 0b0011, 0b000, 0, 1>;
+defm : TLBI<"ALLE2IS",      0, 0b100, 0b1000, 0b0011, 0b000, 0, 1>;
+defm : TLBI<"ALLE3IS",      0, 0b110, 0b1000, 0b0011, 0b000, 0, 1>;
 defm : TLBI<"VAE1IS",       1, 0b000, 0b1000, 0b0011, 0b001>;
 defm : TLBI<"VAE2IS",       1, 0b100, 0b1000, 0b0011, 0b001>;
 defm : TLBI<"VAE3IS",       1, 0b110, 0b1000, 0b0011, 0b001>;
 defm : TLBI<"ASIDE1IS",     0, 0b000, 0b1000, 0b0011, 0b010>;
 defm : TLBI<"VAAE1IS",      1, 0b000, 0b1000, 0b0011, 0b011>;
-defm : TLBI<"ALLE1IS",      0, 0b100, 0b1000, 0b0011, 0b100, 0>;
+defm : TLBI<"ALLE1IS",      0, 0b100, 0b1000, 0b0011, 0b100, 0, 1>;
 defm : TLBI<"VALE1IS",      1, 0b000, 0b1000, 0b0011, 0b101>;
 defm : TLBI<"VALE2IS",      1, 0b100, 0b1000, 0b0011, 0b101>;
 defm : TLBI<"VALE3IS",      1, 0b110, 0b1000, 0b0011, 0b101>;
-defm : TLBI<"VMALLS12E1IS", 0, 0b100, 0b1000, 0b0011, 0b110, 0>;
+defm : TLBI<"VMALLS12E1IS", 0, 0b100, 0b1000, 0b0011, 0b110, 0, 1>;
 defm : TLBI<"VAALE1IS",     1, 0b000, 0b1000, 0b0011, 0b111>;
 defm : TLBI<"IPAS2E1",      1, 0b100, 0b1000, 0b0100, 0b001>;
 defm : TLBI<"IPAS2LE1",     1, 0b100, 0b1000, 0b0100, 0b101>;
-defm : TLBI<"VMALLE1",      0, 0b000, 0b1000, 0b0111, 0b000, 0>;
-defm : TLBI<"ALLE2",        0, 0b100, 0b1000, 0b0111, 0b000, 0>;
-defm : TLBI<"ALLE3",        0, 0b110, 0b1000, 0b0111, 0b000, 0>;
+defm : TLBI<"VMALLE1",      0, 0b000, 0b1000, 0b0111, 0b000, 0, 0>;
+defm : TLBI<"ALLE2",        0, 0b100, 0b1000, 0b0111, 0b000, 0, 0>;
+defm : TLBI<"ALLE3",        0, 0b110, 0b1000, 0b0111, 0b000, 0, 0>;
 defm : TLBI<"VAE1",         1, 0b000, 0b1000, 0b0111, 0b001>;
 defm : TLBI<"VAE2",         1, 0b100, 0b1000, 0b0111, 0b001>;
 defm : TLBI<"VAE3",         1, 0b110, 0b1000, 0b0111, 0b001>;
 defm : TLBI<"ASIDE1",       0, 0b000, 0b1000, 0b0111, 0b010>;
 defm : TLBI<"VAAE1",        1, 0b000, 0b1000, 0b0111, 0b011>;
-defm : TLBI<"ALLE1",        0, 0b100, 0b1000, 0b0111, 0b100, 0>;
+defm : TLBI<"ALLE1",        0, 0b100, 0b1000, 0b0111, 0b100, 0, 0>;
 defm : TLBI<"VALE1",        1, 0b000, 0b1000, 0b0111, 0b101>;
 defm : TLBI<"VALE2",        1, 0b100, 0b1000, 0b0111, 0b101>;
 defm : TLBI<"VALE3",        1, 0b110, 0b1000, 0b0111, 0b101>;
-defm : TLBI<"VMALLS12E1",   0, 0b100, 0b1000, 0b0111, 0b110, 0>;
+defm : TLBI<"VMALLS12E1",   0, 0b100, 0b1000, 0b0111, 0b110, 0, 0>;
 defm : TLBI<"VAALE1",       1, 0b000, 0b1000, 0b0111, 0b111>;
 
 // Armv8.4-A Translation Lookaside Buffer Instructions (TLBI)
 let Requires = ["AArch64::FeatureTLB_RMI"] in {
 // Armv8.4-A Outer Sharable TLB Maintenance instructions:
-//                   hasTLBIP  op1    CRn     CRm     op2    needsreg
-defm : TLBI<"VMALLE1OS",    0, 0b000, 0b1000, 0b0001, 0b000, 0>;
+//                   hasTLBIP  op1    CRn     CRm     op2    needsreg, optreg
+defm : TLBI<"VMALLE1OS",    0, 0b000, 0b1000, 0b0001, 0b000, 0, 1>;
 defm : TLBI<"VAE1OS",       1, 0b000, 0b1000, 0b0001, 0b001>;
 defm : TLBI<"ASIDE1OS",     0, 0b000, 0b1000, 0b0001, 0b010>;
 defm : TLBI<"VAAE1OS",      1, 0b000, 0b1000, 0b0001, 0b011>;
@@ -919,15 +941,15 @@ defm : TLBI<"IPAS2E1OS",    1, 0b100, 0b1000, 0b0100, 0b000>;
 defm : TLBI<"IPAS2LE1OS",   1, 0b100, 0b1000, 0b0100, 0b100>;
 defm : TLBI<"VAE2OS",       1, 0b100, 0b1000, 0b0001, 0b001>;
 defm : TLBI<"VALE2OS",      1, 0b100, 0b1000, 0b0001, 0b101>;
-defm : TLBI<"VMALLS12E1OS", 0, 0b100, 0b1000, 0b0001, 0b110, 0>;
+defm : TLBI<"VMALLS12E1OS", 0, 0b100, 0b1000, 0b0001, 0b110, 0, 1>;
 defm : TLBI<"VAE3OS",       1, 0b110, 0b1000, 0b0001, 0b001>;
 defm : TLBI<"VALE3OS",      1, 0b110, 0b1000, 0b0001, 0b101>;
-defm : TLBI<"ALLE2OS",      0, 0b100, 0b1000, 0b0001, 0b000, 0>;
-defm : TLBI<"ALLE1OS",      0, 0b100, 0b1000, 0b0001, 0b100, 0>;
-defm : TLBI<"ALLE3OS",      0, 0b110, 0b1000, 0b0001, 0b000, 0>;
+defm : TLBI<"ALLE2OS",      0, 0b100, 0b1000, 0b0001, 0b000, 0, 1>;
+defm : TLBI<"ALLE1OS",      0, 0b100, 0b1000, 0b0001, 0b100, 0, 1>;
+defm : TLBI<"ALLE3OS",      0, 0b110, 0b1000, 0b0001, 0b000, 0, 1>;
 
 // Armv8.4-A TLB Range Maintenance instructions:
-//                   hasTLBIP  op1    CRn     CRm     op2    needsreg
+//                   hasTLBIP  op1    CRn     CRm     op2
 defm : TLBI<"RVAE1",        1, 0b000, 0b1000, 0b0110, 0b001>;
 defm : TLBI<"RVAAE1",       1, 0b000, 0b1000, 0b0110, 0b011>;
 defm : TLBI<"RVALE1",       1, 0b000, 0b1000, 0b0110, 0b101>;
@@ -962,18 +984,19 @@ defm : TLBI<"RVALE3OS",     1, 0b110, 0b1000, 0b0101, 0b101>;
 
 // Armv9-A Realm Management Extension TLBI Instructions
 let Requires = ["AArch64::FeatureRME"] in {
+//                   hasTLBIP  op1    CRn     CRm     op2    needsreg
 defm : TLBI<"RPAOS",        0, 0b110, 0b1000, 0b0100, 0b011>;
 defm : TLBI<"RPALOS",       0, 0b110, 0b1000, 0b0100, 0b111>;
-defm : TLBI<"PAALLOS",      0, 0b110, 0b1000, 0b0001, 0b100, 0>;
-defm : TLBI<"PAALL",        0, 0b110, 0b1000, 0b0111, 0b100, 0>;
+defm : TLBI<"PAALLOS",      0, 0b110, 0b1000, 0b0001, 0b100, 0, 0>;
+defm : TLBI<"PAALL",        0, 0b110, 0b1000, 0b0111, 0b100, 0, 0>;
 }
 
 // Armv9.5-A TLBI VMALL for Dirty State
 let Requires = ["AArch64::FeatureTLBIW"] in {
-//                                           op1,   CRn,    CRm,    op2,   needsreg
-defm : TLBI<"VMALLWS2E1",    0, 0b100, 0b1000, 0b0110, 0b010, 0>;
-defm : TLBI<"VMALLWS2E1IS",  0, 0b100, 0b1000, 0b0010, 0b010, 0>;
-defm : TLBI<"VMALLWS2E1OS",  0, 0b100, 0b1000, 0b0101, 0b010, 0>;
+//                   hasTLBIP  op1    CRn     CRm     op2    needsreg, optreg
+defm : TLBI<"VMALLWS2E1",   0, 0b100, 0b1000, 0b0110, 0b010, 0, 0>;
+defm : TLBI<"VMALLWS2E1IS", 0, 0b100, 0b1000, 0b0010, 0b010, 0, 1>;
+defm : TLBI<"VMALLWS2E1OS", 0, 0b100, 0b1000, 0b0101, 0b010, 0, 1>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1862,13 +1885,6 @@ def : ROSysReg<"ERXPFGF_EL1",   0b11, 0b000, 0b0101, 0b0100, 0b100>;
 
 // v8.4a MPAM registers
 //                             Op0   Op1    CRn     CRm     Op2
-let Requires = [{ {AArch64::FeatureMPAM} }] in {
-def : RWSysReg<"MPAM0_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b001>;
-def : RWSysReg<"MPAM1_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b000>;
-def : RWSysReg<"MPAM2_EL2",    0b11, 0b100, 0b1010, 0b0101, 0b000>;
-def : RWSysReg<"MPAM3_EL3",    0b11, 0b110, 0b1010, 0b0101, 0b000>;
-def : RWSysReg<"MPAM1_EL12",   0b11, 0b101, 0b1010, 0b0101, 0b000>;
-def : RWSysReg<"MPAMHCR_EL2",  0b11, 0b100, 0b1010, 0b0100, 0b000>;
 def : RWSysReg<"MPAMVPMV_EL2", 0b11, 0b100, 0b1010, 0b0100, 0b001>;
 def : RWSysReg<"MPAMVPM0_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b000>;
 def : RWSysReg<"MPAMVPM1_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b001>;
@@ -1878,8 +1894,6 @@ def : RWSysReg<"MPAMVPM4_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b100>;
 def : RWSysReg<"MPAMVPM5_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b101>;
 def : RWSysReg<"MPAMVPM6_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b110>;
 def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>;
-def : ROSysReg<"MPAMIDR_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b100>;
-} //FeatureMPAM
 
 // v8.4a Activity Monitor registers
 //                                 Op0   Op1    CRn     CRm     Op2
@@ -2319,6 +2333,26 @@ def : RWSysReg<"MPAMBW0_EL1",             0b11, 0b000, 0b1010, 0b0101, 0b101>;
 def : RWSysReg<"MPAMBWCAP_EL2",           0b11, 0b100, 0b1010, 0b0101, 0b110>;
 def : RWSysReg<"MPAMBWSM_EL1",            0b11, 0b000, 0b1010, 0b0101, 0b111>;
 
+// v9.7a Memory partitioning and monitoring version 2
+// (FEAT_MPAMv2) registers
+//                               Op0   Op1    CRn     CRm     Op2
+// MPAM system registers that are also available for MPAMv2
+def : RWSysReg<"MPAM0_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b001>;
+def : RWSysReg<"MPAM1_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAM1_EL12",   0b11, 0b101, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAM2_EL2",    0b11, 0b100, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAM3_EL3",    0b11, 0b110, 0b1010, 0b0101, 0b000>;
+def : RWSysReg<"MPAMHCR_EL2",  0b11, 0b100, 0b1010, 0b0100, 0b000>;
+def : ROSysReg<"MPAMIDR_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b100>;
+// Only MPAMv2 registers
+def : RWSysReg<"MPAMCTL_EL1",   0b11, 0b000, 0b1010, 0b0101, 0b010>;
+def : RWSysReg<"MPAMCTL_EL12",  0b11, 0b101, 0b1010, 0b0101, 0b010>;
+def : RWSysReg<"MPAMCTL_EL2",   0b11, 0b100, 0b1010, 0b0101, 0b010>;
+def : RWSysReg<"MPAMCTL_EL3",   0b11, 0b110, 0b1010, 0b0101, 0b010>;
+def : RWSysReg<"MPAMVIDCR_EL2", 0b11, 0b100, 0b1010, 0b0111, 0b000>;
+def : RWSysReg<"MPAMVIDSR_EL2", 0b11, 0b100, 0b1010, 0b0111, 0b001>;
+def : RWSysReg<"MPAMVIDSR_EL3", 0b11, 0b110, 0b1010, 0b0111, 0b001>;
+
 //===----------------------------------------------------------------------===//
 // FEAT_SRMASK v9.6a registers
 //===----------------------------------------------------------------------===//
@@ -2412,3 +2446,251 @@ def : DC<"CIVAPS",    0b000, 0b0111, 0b1111, 0b001>;
 let Requires = [{ {AArch64::FeaturePoPS, AArch64::FeatureMTE} }] in {
 def : DC<"CIGDVAPS",  0b000, 0b0111, 0b1111, 0b101>;
 }
+
+// v9.7a TLBI domains system registers (MemSys)
+foreach n = 0-3 in {
+  defvar nb = !cast<bits<3>>(n);
+  def : RWSysReg<"VTLBID"#n#"_EL2", 0b11,  0b100, 0b0010, 0b1000, nb>;
+}
+
+foreach n = 0-3 in {
+  defvar nb = !cast<bits<3>>(n);
+  def : RWSysReg<"VTLBIDOS"#n#"_EL2", 0b11,  0b100, 0b0010, 0b1001, nb>;
+}
+
+def : ROSysReg<"TLBIDIDR_EL1",      0b11,  0b000, 0b1010, 0b0100, 0b110>;
+
+// MPAM Lookaside Buffer Invalidate (MLBI) instructions
+class MLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, bit needsreg> {
+  string Name = name;
+  bits<14> Encoding;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+  bit NeedsReg = needsreg;
+  string RequiresStr = [{ {AArch64::FeatureMPAMv2} }];
+}
+
+def MLBITable : GenericTable {
+  let FilterClass = "MLBI";
+  let CppTypeName = "MLBI";
+  let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"];
+
+  let PrimaryKey = ["Encoding"];
+  let PrimaryKeyName = "lookupMLBIByEncoding";
+}
+
+def lookupMLBIByName : SearchIndex {
+  let Table = MLBITable;
+  let Key = ["Name"];
+}
+
+//                     Op1    CRn     CRm     Op2    needsReg
+def : MLBI<"ALLE1",    0b100, 0b0111, 0b0000, 0b100, 0>;
+def : MLBI<"VMALLE1",  0b100, 0b0111, 0b0000, 0b101, 0>;
+def : MLBI<"VPIDE1",   0b100, 0b0111, 0b0000, 0b110, 1>;
+def : MLBI<"VPMGE1",   0b100, 0b0111, 0b0000, 0b111, 1>;
+
+
+// v9.7-A GICv5 (FEAT_GCIE)
+// CPU Interface Registers
+//                                        Op0   Op1    CRn     CRm     Op2
+def : RWSysReg<"ICC_APR_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"ICC_APR_EL3",             0b11, 0b110, 0b1100, 0b1000, 0b000>;
+def : RWSysReg<"ICC_CR0_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b001>;
+def : RWSysReg<"ICC_CR0_EL3",             0b11, 0b110, 0b1100, 0b1001, 0b000>;
+def : ROSysReg<"ICC_DOMHPPIR_EL3",        0b11, 0b110, 0b1100, 0b1000, 0b010>;
+def : ROSysReg<"ICC_HAPR_EL1",            0b11, 0b001, 0b1100, 0b0000, 0b011>;
+def : ROSysReg<"ICC_HPPIR_EL1",           0b11, 0b000, 0b1100, 0b1010, 0b011>;
+def : ROSysReg<"ICC_HPPIR_EL3",           0b11, 0b110, 0b1100, 0b1001, 0b001>;
+def : ROSysReg<"ICC_IAFFIDR_EL1",         0b11, 0b000, 0b1100, 0b1010, 0b101>;
+def : RWSysReg<"ICC_ICSR_EL1",            0b11, 0b000, 0b1100, 0b1010, 0b100>;
+def : ROSysReg<"ICC_IDR0_EL1",            0b11, 0b000, 0b1100, 0b1010, 0b010>;
+def : RWSysReg<"ICC_PCR_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"ICC_PCR_EL3",             0b11, 0b110, 0b1100, 0b1000, 0b001>;
+
+// Virtual CPU Interface Registers
+//                                        Op0   Op1    CRn     CRm     Op2
+def : RWSysReg<"ICV_APR_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"ICV_CR0_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b001>;
+def : RWSysReg<"ICV_HAPR_EL1",            0b11, 0b001, 0b1100, 0b0000, 0b011>;
+def : RWSysReg<"ICV_HPPIR_EL1",           0b11, 0b000, 0b1100, 0b1010, 0b011>;
+def : RWSysReg<"ICV_PCR_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b010>;
+
+foreach n=0-3 in {
+  defvar nb = !cast<bits<2>>(n);
+//                                             Op0   Op1    CRn     CRm     Op2
+  def : RWSysReg<"ICC_PPI_DOMAINR"#n#"_EL3",   0b11, 0b110, 0b1100, 0b1000, {0b1,nb{1-0}}>;
+
+}
+
+foreach n=0-15 in{
+  defvar nb = !cast<bits<4>>(n);
+//                                               Op0   Op1    CRn     CRm            Op2
+  def : RWSysReg<"ICC_PPI_PRIORITYR"#n#"_EL1",   0b11, 0b000, 0b1100, {0b111,nb{3}}, nb{2-0}>;
+}
+
+// PPI and Virtual PPI Registers
+multiclass PPIRegisters<string prefix> {
+  foreach n=0-1 in {
+    defvar nb = !cast<bit>(n);
+//                                                  Op0   Op1    CRn     CRm     Op2
+    def : RWSysReg<prefix#"_PPI_CACTIVER"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1101, {0b00,nb}>;
+    def : RWSysReg<prefix#"_PPI_CPENDR"#n#"_EL1",   0b11, 0b000, 0b1100, 0b1101, {0b10,nb}>;
+    def : RWSysReg<prefix#"_PPI_ENABLER"#n#"_EL1",  0b11, 0b000, 0b1100, 0b1010, {0b11,nb}>;
+    def : RWSysReg<prefix#"_PPI_SACTIVER"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1101, {0b01,nb}>;
+    def : RWSysReg<prefix#"_PPI_SPENDR"#n#"_EL1",   0b11, 0b000, 0b1100, 0b1101, {0b11,nb}>;
+    def : RWSysReg<prefix#"_PPI_HMR"#n#"_EL1",      0b11, 0b000, 0b1100, 0b1010, {0b00,nb}>;
+  }
+}
+
+defm : PPIRegisters<"ICC">;  // PPI Registers
+defm : PPIRegisters<"ICV">;  // Virtual PPI Registers
+
+foreach n=0-15 in {
+  defvar nb = !cast<bits<4>>(n);
+//                                               Op0   Op1    CRn     CRm            Op2
+  def : RWSysReg<"ICV_PPI_PRIORITYR"#n#"_EL1",   0b11, 0b000, 0b1100, {0b111,nb{3}}, nb{2-0}>;
+}
+
+// Hypervisor Control Registers
+//                                    Op0   Op1    CRn     CRm     Op2
+def : RWSysReg<"ICH_APR_EL2",         0b11, 0b100, 0b1100, 0b1000, 0b100>;
+def : RWSysReg<"ICH_CONTEXTR_EL2",    0b11, 0b100, 0b1100, 0b1011, 0b110>;
+def : RWSysReg<"ICH_HFGITR_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b111>;
+def : RWSysReg<"ICH_HFGRTR_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b100>;
+def : RWSysReg<"ICH_HFGWTR_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b110>;
+def : ROSysReg<"ICH_HPPIR_EL2",       0b11, 0b100, 0b1100, 0b1000, 0b101>;
+def : RWSysReg<"ICH_VCTLR_EL2",       0b11, 0b100, 0b1100, 0b1011, 0b100>;
+
+foreach n=0-1 in {
+  defvar nb = !cast<bit>(n);
+//                                           Op0   Op1    CRn     CRm     Op2
+def : RWSysReg<"ICH_PPI_ACTIVER"#n#"_EL2",   0b11, 0b100, 0b1100, 0b1010, {0b11,nb}>;
+def : RWSysReg<"ICH_PPI_DVIR"#n#"_EL2",      0b11, 0b100, 0b1100, 0b1010, {0b00,nb}>;
+def : RWSysReg<"ICH_PPI_ENABLER"#n#"_EL2",   0b11, 0b100, 0b1100, 0b1010, {0b01,nb}>;
+def : RWSysReg<"ICH_PPI_PENDR"#n#"_EL2",     0b11, 0b100, 0b1100, 0b1010, {0b10,nb}>;
+}
+
+foreach n=0-15 in {
+  defvar nb = !cast<bits<4>>(n);
+//                                               Op0   Op1    CRn     CRm            Op2
+  def : RWSysReg<"ICH_PPI_PRIORITYR"#n#"_EL2",   0b11, 0b100, 0b1100, {0b111,nb{3}}, nb{2-0}>;
+}
+
+//===----------------------------------------------------------------------===//
+// GICv5 instruction options.
+//===----------------------------------------------------------------------===//
+
+// GIC
+class GIC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> {
+  string Name = name;
+  bits<14> Encoding;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+  bit NeedsReg = 1;
+  string RequiresStr = [{ {AArch64::FeatureGCIE} }];
+}
+
+// GSB
+class GSB<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> {
+  string Name = name;
+  bits<14> Encoding;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+  string RequiresStr = [{ {AArch64::FeatureGCIE} }];
+}
+
+// GICR
+class GICR<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> {
+  string Name = name;
+  bits<14> Encoding;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+  bit NeedsReg = 1;
+  string RequiresStr = [{ {AArch64::FeatureGCIE} }];
+}
+
+def GICTable : GenericTable {
+  let FilterClass = "GIC";
+  let CppTypeName = "GIC";
+  let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"];
+
+  let PrimaryKey = ["Encoding"];
+  let PrimaryKeyName = "lookupGICByEncoding";
+}
+
+def GSBTable : GenericTable {
+  let FilterClass = "GSB";
+  let CppTypeName = "GSB";
+  let Fields = ["Name", "Encoding", "RequiresStr"];
+
+  let PrimaryKey = ["Encoding"];
+  let PrimaryKeyName = "lookupGSBByEncoding";
+}
+
+def GICRTable : GenericTable {
+  let FilterClass = "GICR";
+  let CppTypeName = "GICR";
+  let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"];
+
+  let PrimaryKey = ["Encoding"];
+  let PrimaryKeyName = "lookupGICRByEncoding";
+}
+
+def lookupGICByName : SearchIndex {
+  let Table = GICTable;
+  let Key = ["Name"];
+}
+
+def lookupGSBByName : SearchIndex {
+  let Table = GSBTable;
+  let Key = ["Name"];
+}
+
+def lookupGICRByName : SearchIndex {
+  let Table = GICRTable;
+  let Key = ["Name"];
+}
+
+//                    Op1    CRn     CRm     Op2
+def : GSB<"sys",      0b000, 0b1100, 0b0000, 0b000>;
+def : GSB<"ack",      0b000, 0b1100, 0b0000, 0b001>;
+
+//                    Op1    CRn     CRm     Op2
+def : GICR<"cdia",    0b000, 0b1100, 0b0011, 0b000>;
+def : GICR<"cdnmia",  0b000, 0b1100, 0b0011, 0b001>;
+
+//                    Op1    CRn     CRm     Op2
+def : GIC<"cdaff",    0b000, 0b1100, 0b0001, 0b011>;
+def : GIC<"cddi",     0b000, 0b1100, 0b0010, 0b000>;
+def : GIC<"cddis",    0b000, 0b1100, 0b0001, 0b000>;
+def : GIC<"cden",     0b000, 0b1100, 0b0001, 0b001>;
+def : GIC<"cdeoi",    0b000, 0b1100, 0b0001, 0b111>;
+def : GIC<"cdhm",     0b000, 0b1100, 0b0010, 0b001>;
+def : GIC<"cdpend",   0b000, 0b1100, 0b0001, 0b100>;
+def : GIC<"cdpri",    0b000, 0b1100, 0b0001, 0b010>;
+def : GIC<"cdrcfg",   0b000, 0b1100, 0b0001, 0b101>;
+def : GIC<"vdaff",    0b100, 0b1100, 0b0001, 0b011>;
+def : GIC<"vddi",     0b100, 0b1100, 0b0010, 0b000>;
+def : GIC<"vddis",    0b100, 0b1100, 0b0001, 0b000>;
+def : GIC<"vden",     0b100, 0b1100, 0b0001, 0b001>;
+def : GIC<"vdhm",     0b100, 0b1100, 0b0010, 0b001>;
+def : GIC<"vdpend",   0b100, 0b1100, 0b0001, 0b100>;
+def : GIC<"vdpri",    0b100, 0b1100, 0b0001, 0b010>;
+def : GIC<"vdrcfg",   0b100, 0b1100, 0b0001, 0b101>;
+def : GIC<"ldaff",    0b110, 0b1100, 0b0001, 0b011>;
+def : GIC<"lddi",     0b110, 0b1100, 0b0010, 0b000>;
+def : GIC<"lddis",    0b110, 0b1100, 0b0001, 0b000>;
+def : GIC<"lden",     0b110, 0b1100, 0b0001, 0b001>;
+def : GIC<"ldhm",     0b110, 0b1100, 0b0010, 0b001>;
+def : GIC<"ldpend",   0b110, 0b1100, 0b0001, 0b100>;
+def : GIC<"ldpri",    0b110, 0b1100, 0b0001, 0b010>;
+def : GIC<"ldrcfg",   0b110, 0b1100, 0b0001, 0b101>;
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 636d4f8a..6273cfc 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -159,6 +159,7 @@ private:
   SMLoc getLoc() const { return getParser().getTok().getLoc(); }
 
   bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+  bool parseSyslAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
   bool parseSyspAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
   void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
   AArch64CC::CondCode parseCondCodeString(StringRef Cond,
@@ -266,6 +267,7 @@ private:
   ParseStatus tryParseRPRFMOperand(OperandVector &Operands);
   ParseStatus tryParsePSBHint(OperandVector &Operands);
   ParseStatus tryParseBTIHint(OperandVector &Operands);
+  ParseStatus tryParseCMHPriorityHint(OperandVector &Operands);
   ParseStatus tryParseAdrpLabel(OperandVector &Operands);
   ParseStatus tryParseAdrLabel(OperandVector &Operands);
   template <bool AddFPZeroAsLiteral>
@@ -370,6 +372,7 @@ private:
     k_PSBHint,
     k_PHint,
     k_BTIHint,
+    k_CMHPriorityHint,
   } Kind;
 
   SMLoc StartLoc, EndLoc;
@@ -499,6 +502,11 @@ private:
     unsigned Length;
     unsigned Val;
   };
+  struct CMHPriorityHintOp {
+    const char *Data;
+    unsigned Length;
+    unsigned Val;
+  };
 
   struct SVCROp {
     const char *Data;
@@ -525,6 +533,7 @@ private:
     struct PSBHintOp PSBHint;
     struct PHintOp PHint;
     struct BTIHintOp BTIHint;
+    struct CMHPriorityHintOp CMHPriorityHint;
     struct ShiftExtendOp ShiftExtend;
     struct SVCROp SVCR;
   };
@@ -595,6 +604,9 @@ public:
     case k_BTIHint:
       BTIHint = o.BTIHint;
       break;
+    case k_CMHPriorityHint:
+      CMHPriorityHint = o.CMHPriorityHint;
+      break;
     case k_ShiftExtend:
       ShiftExtend = o.ShiftExtend;
       break;
@@ -769,6 +781,16 @@ public:
     return StringRef(BTIHint.Data, BTIHint.Length);
   }
 
+  unsigned getCMHPriorityHint() const {
+    assert(Kind == k_CMHPriorityHint && "Invalid access!");
+    return CMHPriorityHint.Val;
+  }
+
+  StringRef getCMHPriorityHintName() const {
+    assert(Kind == k_CMHPriorityHint && "Invalid access!");
+    return StringRef(CMHPriorityHint.Data, CMHPriorityHint.Length);
+  }
+
   StringRef getSVCR() const {
     assert(Kind == k_SVCR && "Invalid access!");
     return StringRef(SVCR.Data, SVCR.Length);
@@ -1511,6 +1533,7 @@ public:
   bool isPSBHint() const { return Kind == k_PSBHint; }
   bool isPHint() const { return Kind == k_PHint; }
   bool isBTIHint() const { return Kind == k_BTIHint; }
+  bool isCMHPriorityHint() const { return Kind == k_CMHPriorityHint; }
   bool isShiftExtend() const { return Kind == k_ShiftExtend; }
   bool isShifter() const {
     if (!isShiftExtend())
@@ -2196,6 +2219,11 @@ public:
     Inst.addOperand(MCOperand::createImm(getBTIHint()));
   }
 
+  void addCMHPriorityHintOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getCMHPriorityHint()));
+  }
+
   void addShifterOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     unsigned Imm =
@@ -2547,6 +2575,17 @@ public:
   }
 
   static std::unique_ptr<AArch64Operand>
+  CreateCMHPriorityHint(unsigned Val, StringRef Str, SMLoc S, MCContext &Ctx) {
+    auto Op = std::make_unique<AArch64Operand>(k_CMHPriorityHint, Ctx);
+    Op->CMHPriorityHint.Val = Val;
+    Op->CMHPriorityHint.Data = Str.data();
+    Op->CMHPriorityHint.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
   CreateMatrixRegister(unsigned RegNum, unsigned ElementWidth, MatrixKind Kind,
                        SMLoc S, SMLoc E, MCContext &Ctx) {
     auto Op = std::make_unique<AArch64Operand>(k_MatrixRegister, Ctx);
@@ -2656,6 +2695,9 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const {
   case k_BTIHint:
     OS << getBTIHintName();
     break;
+  case k_CMHPriorityHint:
+    OS << getCMHPriorityHintName();
+    break;
   case k_MatrixRegister:
     OS << "<matrix " << getMatrixReg() << ">";
     break;
@@ -3279,6 +3321,24 @@ ParseStatus AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) {
   return ParseStatus::Success;
 }
 
+/// tryParseCMHPriorityHint - Try to parse a CMHPriority operand
+ParseStatus AArch64AsmParser::tryParseCMHPriorityHint(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const AsmToken &Tok = getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return TokError("invalid operand for instruction");
+
+  auto CMHPriority =
+      AArch64CMHPriorityHint::lookupCMHPriorityHintByName(Tok.getString());
+  if (!CMHPriority)
+    return TokError("invalid operand for instruction");
+
+  Operands.push_back(AArch64Operand::CreateCMHPriorityHint(
+      CMHPriority->Encoding, Tok.getString(), S, getContext()));
+  Lex(); // Eat identifier token.
+  return ParseStatus::Success;
+}
+
 /// tryParseAdrpLabel - Parse and validate a source label for the ADRP
 /// instruction.
 ParseStatus AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
@@ -3824,6 +3884,18 @@ static const struct Extension {
     {"ssve-bitperm", {AArch64::FeatureSSVE_BitPerm}},
     {"sme-mop4", {AArch64::FeatureSME_MOP4}},
     {"sme-tmop", {AArch64::FeatureSME_TMOP}},
+    {"cmh", {AArch64::FeatureCMH}},
+    {"lscp", {AArch64::FeatureLSCP}},
+    {"tlbid", {AArch64::FeatureTLBID}},
+    {"mpamv2", {AArch64::FeatureMPAMv2}},
+    {"mtetc", {AArch64::FeatureMTETC}},
+    {"gcie", {AArch64::FeatureGCIE}},
+    {"sme2p3", {AArch64::FeatureSME2p3}},
+    {"sve2p3", {AArch64::FeatureSVE2p3}},
+    {"sve-b16mm", {AArch64::FeatureSVE_B16MM}},
+    {"f16mm", {AArch64::FeatureF16MM}},
+    {"f16f32dot", {AArch64::FeatureF16F32DOT}},
+    {"f16f32mm", {AArch64::FeatureF16F32MM}},
 };
 
 static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
@@ -3861,6 +3933,8 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
     Str += "ARMv9.5a";
   else if (FBS[AArch64::HasV9_6aOps])
     Str += "ARMv9.6a";
+  else if (FBS[AArch64::HasV9_7aOps])
+    Str += "ARMv9.7a";
   else if (FBS[AArch64::HasV8_0rOps])
     Str += "ARMv8r";
   else {
@@ -3894,8 +3968,9 @@ void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands
       AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
 }
 
-/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
-/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
+/// parseSysAlias - The IC, DC, AT, TLBI, MLBI and GIC{R} and GSB instructions
+/// are simple aliases for the SYS instruction. Parse them specially so that
+/// we create a SYS MCInst.
 bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
                                    OperandVector &Operands) {
   if (Name.contains('.'))
@@ -3908,6 +3983,8 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
   StringRef Op = Tok.getString();
   SMLoc S = Tok.getLoc();
   bool ExpectRegister = true;
+  bool OptionalRegister = false;
+  bool hasAll = getSTI().hasFeature(AArch64::FeatureAll);
 
   if (Mnemonic == "ic") {
     const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op);
@@ -3950,13 +4027,50 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
       return TokError(Str);
     }
     ExpectRegister = TLBI->NeedsReg;
+    bool hasTLBID = getSTI().hasFeature(AArch64::FeatureTLBID);
+    if (hasAll || hasTLBID) {
+      OptionalRegister = TLBI->OptionalReg;
+    }
     createSysAlias(TLBI->Encoding, Operands, S);
-  } else if (Mnemonic == "cfp" || Mnemonic == "dvp" || Mnemonic == "cpp" || Mnemonic == "cosp") {
+  } else if (Mnemonic == "mlbi") {
+    const AArch64MLBI::MLBI *MLBI = AArch64MLBI::lookupMLBIByName(Op);
+    if (!MLBI)
+      return TokError("invalid operand for MLBI instruction");
+    else if (!MLBI->haveFeatures(getSTI().getFeatureBits())) {
+      std::string Str("MLBI " + std::string(MLBI->Name) + " requires: ");
+      setRequiredFeatureString(MLBI->getRequiredFeatures(), Str);
+      return TokError(Str);
+    }
+    ExpectRegister = MLBI->NeedsReg;
+    createSysAlias(MLBI->Encoding, Operands, S);
+  } else if (Mnemonic == "gic") {
+    const AArch64GIC::GIC *GIC = AArch64GIC::lookupGICByName(Op);
+    if (!GIC)
+      return TokError("invalid operand for GIC instruction");
+    else if (!GIC->haveFeatures(getSTI().getFeatureBits())) {
+      std::string Str("GIC " + std::string(GIC->Name) + " requires: ");
+      setRequiredFeatureString(GIC->getRequiredFeatures(), Str);
+      return TokError(Str);
+    }
+    ExpectRegister = true;
+    createSysAlias(GIC->Encoding, Operands, S);
+  } else if (Mnemonic == "gsb") {
+    const AArch64GSB::GSB *GSB = AArch64GSB::lookupGSBByName(Op);
+    if (!GSB)
+      return TokError("invalid operand for GSB instruction");
+    else if (!GSB->haveFeatures(getSTI().getFeatureBits())) {
+      std::string Str("GSB " + std::string(GSB->Name) + " requires: ");
+      setRequiredFeatureString(GSB->getRequiredFeatures(), Str);
+      return TokError(Str);
+    }
+    ExpectRegister = false;
+    createSysAlias(GSB->Encoding, Operands, S);
+  } else if (Mnemonic == "cfp" || Mnemonic == "dvp" || Mnemonic == "cpp" ||
+             Mnemonic == "cosp") {
 
     if (Op.lower() != "rctx")
       return TokError("invalid operand for prediction restriction instruction");
 
-    bool hasAll = getSTI().hasFeature(AArch64::FeatureAll);
     bool hasPredres = hasAll || getSTI().hasFeature(AArch64::FeaturePredRes);
     bool hasSpecres2 = hasAll || getSTI().hasFeature(AArch64::FeatureSPECRES2);
 
@@ -3989,10 +4103,61 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
     HasRegister = true;
   }
 
-  if (ExpectRegister && !HasRegister)
-    return TokError("specified " + Mnemonic + " op requires a register");
-  else if (!ExpectRegister && HasRegister)
-    return TokError("specified " + Mnemonic + " op does not use a register");
+  if (!OptionalRegister) {
+    if (ExpectRegister && !HasRegister)
+      return TokError("specified " + Mnemonic + " op requires a register");
+    else if (!ExpectRegister && HasRegister)
+      return TokError("specified " + Mnemonic + " op does not use a register");
+  }
+
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
+    return true;
+
+  return false;
+}
+
+/// parseSyslAlias - The GICR instructions are simple aliases for
+/// the SYSL instruction. Parse them specially so that we create a
+/// SYS MCInst.
+bool AArch64AsmParser::parseSyslAlias(StringRef Name, SMLoc NameLoc,
+                                      OperandVector &Operands) {
+
+  Mnemonic = Name;
+  Operands.push_back(
+      AArch64Operand::CreateToken("sysl", NameLoc, getContext()));
+
+  // Now expect two operands (identifier + register)
+  SMLoc startLoc = getLoc();
+  const AsmToken &regTok = getTok();
+  StringRef reg = regTok.getString();
+  unsigned RegNum = matchRegisterNameAlias(reg.lower(), RegKind::Scalar);
+  if (!RegNum)
+    return TokError("expected register operand");
+
+  Operands.push_back(AArch64Operand::CreateReg(
+      RegNum, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg));
+
+  Lex(); // Eat token
+  if (parseToken(AsmToken::Comma))
+    return true;
+
+  // Check for identifier
+  const AsmToken &operandTok = getTok();
+  StringRef Op = operandTok.getString();
+  SMLoc S2 = operandTok.getLoc();
+  Lex(); // Eat token
+
+  if (Mnemonic == "gicr") {
+    const AArch64GICR::GICR *GICR = AArch64GICR::lookupGICRByName(Op);
+    if (!GICR)
+      return Error(S2, "invalid operand for GICR instruction");
+    else if (!GICR->haveFeatures(getSTI().getFeatureBits())) {
+      std::string Str("GICR " + std::string(GICR->Name) + " requires: ");
+      setRequiredFeatureString(GICR->getRequiredFeatures(), Str);
+      return Error(S2, Str);
+    }
+    createSysAlias(GICR->Encoding, Operands, S2);
+  }
 
   if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
     return true;
@@ -4025,7 +4190,7 @@ bool AArch64AsmParser::parseSyspAlias(StringRef Name, SMLoc NameLoc,
       return TokError("invalid operand for TLBIP instruction");
     const AArch64TLBIP::TLBIP TLBIP(
         TLBIPorig->Name, TLBIPorig->Encoding | (HasnXSQualifier ? (1 << 7) : 0),
-        TLBIPorig->NeedsReg,
+        TLBIPorig->NeedsReg, TLBIPorig->OptionalReg,
         HasnXSQualifier
             ? TLBIPorig->FeaturesRequired | FeatureBitset({AArch64::FeatureXS})
             : TLBIPorig->FeaturesRequired);
@@ -4719,6 +4884,13 @@ ParseStatus AArch64AsmParser::tryParseVectorList(OperandVector &Operands,
       FirstReg, Count, Stride, NumElements, ElementWidth, VectorKind, S,
       getLoc(), getContext()));
 
+  if (getTok().is(AsmToken::LBrac)) {
+    ParseStatus Res = tryParseVectorIndex(Operands);
+    if (Res.isFailure())
+      return ParseStatus::Failure;
+    return ParseStatus::Success;
+  }
+
   return ParseStatus::Success;
 }
 
@@ -5267,12 +5439,17 @@ bool AArch64AsmParser::parseInstruction(ParseInstructionInfo &Info,
   size_t Start = 0, Next = Name.find('.');
   StringRef Head = Name.slice(Start, Next);
 
-  // IC, DC, AT, TLBI and Prediction invalidation instructions are aliases for
-  // the SYS instruction.
+  // IC, DC, AT, TLBI, MLBI, GIC{R}, GSB and Prediction invalidation
+  // instructions are aliases for the SYS instruction.
   if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi" ||
-      Head == "cfp" || Head == "dvp" || Head == "cpp" || Head == "cosp")
+      Head == "cfp" || Head == "dvp" || Head == "cpp" || Head == "cosp" ||
+      Head == "mlbi" || Head == "gic" || Head == "gsb")
     return parseSysAlias(Head, NameLoc, Operands);
 
+  // GICR instructions are aliases for the SYSL instruction.
+  if (Head == "gicr")
+    return parseSyslAlias(Head, NameLoc, Operands);
+
   // TLBIP instructions are aliases for the SYSP instruction.
   if (Head == "tlbip")
     return parseSyspAlias(Head, NameLoc, Operands);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 35bd244..5c3e26e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -84,6 +84,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
       return;
     }
 
+  if (Opcode == AArch64::SYSLxt)
+    if (printSyslAlias(MI, STI, O)) {
+      printAnnotation(O, Annot);
+      return;
+    }
+
   if (Opcode == AArch64::SYSPxt || Opcode == AArch64::SYSPxt_XZR)
     if (printSyspAlias(MI, STI, O)) {
       printAnnotation(O, Annot);
@@ -909,13 +915,25 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
   Encoding |= CnVal << 7;
   Encoding |= Op1Val << 11;
 
-  bool NeedsReg;
+  bool NeedsReg = false;
+  bool OptionalReg = false;
   std::string Ins;
   std::string Name;
 
   if (CnVal == 7) {
     switch (CmVal) {
     default: return false;
+    // MLBI aliases
+    case 0: {
+      const AArch64MLBI::MLBI *MLBI =
+          AArch64MLBI::lookupMLBIByEncoding(Encoding);
+      if (!MLBI || !MLBI->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      NeedsReg = MLBI->NeedsReg;
+      Ins = "mlbi\t";
+      Name = std::string(MLBI->Name);
+    } break;
     // Maybe IC, maybe Prediction Restriction
     case 1:
       switch (Op1Val) {
@@ -1004,19 +1022,41 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
       return false;
 
     NeedsReg = TLBI->NeedsReg;
+    if (STI.hasFeature(AArch64::FeatureAll) ||
+        STI.hasFeature(AArch64::FeatureTLBID))
+      OptionalReg = TLBI->OptionalReg;
     Ins = "tlbi\t";
     Name = std::string(TLBI->Name);
-  }
-  else
+  } else if (CnVal == 12) {
+    if (CmVal != 0) {
+      // GIC aliases
+      const AArch64GIC::GIC *GIC = AArch64GIC::lookupGICByEncoding(Encoding);
+      if (!GIC || !GIC->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      NeedsReg = true;
+      Ins = "gic\t";
+      Name = std::string(GIC->Name);
+    } else {
+      // GSB aliases
+      const AArch64GSB::GSB *GSB = AArch64GSB::lookupGSBByEncoding(Encoding);
+      if (!GSB || !GSB->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      NeedsReg = false;
+      Ins = "gsb\t";
+      Name = std::string(GSB->Name);
+    }
+  } else
     return false;
 
   StringRef Reg = getRegisterName(MI->getOperand(4).getReg());
   bool NotXZR = Reg != "xzr";
 
-  // If a mandatory is not specified in the TableGen
+  // If a mandatory or optional register is not specified in the TableGen
   // (i.e. no register operand should be present), and the register value
   // is not xzr/x31, then disassemble to a SYS alias instead.
-  if (NotXZR && !NeedsReg)
+  if (NotXZR && !NeedsReg && !OptionalReg)
     return false;
 
   std::string Str = Ins + Name;
@@ -1024,12 +1064,64 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
 
   O << '\t' << Str;
 
-  if (NeedsReg)
+  // For optional registers, don't print the value if it's xzr/x31
+  // since this defaults to xzr/x31 if register is not specified.
+  if (NeedsReg || (OptionalReg && NotXZR))
     O << ", " << Reg;
 
   return true;
 }
 
+bool AArch64InstPrinter::printSyslAlias(const MCInst *MI,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+#ifndef NDEBUG
+  unsigned Opcode = MI->getOpcode();
+  assert(Opcode == AArch64::SYSLxt && "Invalid opcode for SYSL alias!");
+#endif
+
+  StringRef Reg = getRegisterName(MI->getOperand(0).getReg());
+  const MCOperand &Op1 = MI->getOperand(1);
+  const MCOperand &Cn = MI->getOperand(2);
+  const MCOperand &Cm = MI->getOperand(3);
+  const MCOperand &Op2 = MI->getOperand(4);
+
+  unsigned Op1Val = Op1.getImm();
+  unsigned CnVal = Cn.getImm();
+  unsigned CmVal = Cm.getImm();
+  unsigned Op2Val = Op2.getImm();
+
+  uint16_t Encoding = Op2Val;
+  Encoding |= CmVal << 3;
+  Encoding |= CnVal << 7;
+  Encoding |= Op1Val << 11;
+
+  std::string Ins;
+  std::string Name;
+
+  if (CnVal == 12) {
+    if (CmVal == 3) {
+      // GICR aliases
+      const AArch64GICR::GICR *GICR =
+          AArch64GICR::lookupGICRByEncoding(Encoding);
+      if (!GICR || !GICR->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      Ins = "gicr";
+      Name = std::string(GICR->Name);
+    } else
+      return false;
+  } else
+    return false;
+
+  std::string Str;
+  llvm::transform(Name, Name.begin(), ::tolower);
+
+  O << '\t' << Ins << '\t' << Reg.str() << ", " << Name;
+
+  return true;
+}
+
 bool AArch64InstPrinter::printSyspAlias(const MCInst *MI,
                                         const MCSubtargetInfo &STI,
                                         raw_ostream &O) {
@@ -1508,6 +1600,17 @@ void AArch64InstPrinter::printBTIHintOp(const MCInst *MI, unsigned OpNum,
     markup(O, Markup::Immediate) << '#' << formatImm(btihintop);
 }
 
+void AArch64InstPrinter::printCMHPriorityHintOp(const MCInst *MI,
+                                                unsigned OpNum,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned priorityhint_op = MI->getOperand(OpNum).getImm();
+  auto PHint =
+      AArch64CMHPriorityHint::lookupCMHPriorityHintByEncoding(priorityhint_op);
+  if (PHint)
+    O << PHint->Name;
+}
+
 void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
                                            const MCSubtargetInfo &STI,
                                            raw_ostream &O) {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
index 15ef2dd..307402d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
@@ -52,6 +52,8 @@ public:
 protected:
   bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI,
                      raw_ostream &O);
+  bool printSyslAlias(const MCInst *MI, const MCSubtargetInfo &STI,
+                      raw_ostream &O);
   bool printSyspAlias(const MCInst *MI, const MCSubtargetInfo &STI,
                       raw_ostream &O);
   bool printRangePrefetchAlias(const MCInst *MI, const MCSubtargetInfo &STI,
@@ -151,6 +153,9 @@ protected:
   void printBTIHintOp(const MCInst *MI, unsigned OpNum,
                       const MCSubtargetInfo &STI, raw_ostream &O);
 
+  void printCMHPriorityHintOp(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+
   void printFPImmOperand(const MCInst *MI, unsigned OpNum,
                          const MCSubtargetInfo &STI, raw_ostream &O);
 
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 33f35ad..99836ae 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -3920,6 +3920,78 @@ multiclass sme2_luti4_vector_vg4_index<string mnemonic> {
   def _S : sme2_luti4_vector_vg4_index<0b10, ZZZZ_s_mul_r, mnemonic>;
 }
 
+// 8-bit Look up table
+class sme2_lut_single<string asm>
+  : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn),
+    asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> {
+  bits<0> ZTt;
+  bits<5> Zd;
+  bits<5> Zn;
+  let Inst{31-10} = 0b1100000011001000010000;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+}
+
+//===----------------------------------------------------------------------===//
+// Lookup table read with 6-bit indices (8-bit)
+class sme2_luti6_zt_base<RegisterOperand zd_ty, string asm>
+  : I<(outs zd_ty:$Zd), (ins ZTR:$ZTt, ZZZ_Any:$Zn),
+    asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> {
+  bits<0> ZTt;
+  bits<3> Zd;
+  bits<3> Zn;
+  let Inst{31-21} = 0b11000000100;
+  let Inst{19-10} = 0b1010000000;
+  let Inst{9-7}   = Zn;
+  let Inst{6-5}   = 0b00;
+}
+
+class sme2_luti6_zt_consecutive<string asm>
+  : sme2_luti6_zt_base<ZZZZ_b_mul_r, asm> {
+  let Inst{20}    = 0;
+  let Inst{4-2}   = Zd;
+  let Inst{1-0}   = 0b00;
+}
+
+class sme2_luti6_zt_strided<string asm>
+  : sme2_luti6_zt_base<ZZZZ_b_strided, asm> {
+  let Inst{20}    = 1;
+  let Inst{4}     = Zd{2};
+  let Inst{3-2}   = 0b00;
+  let Inst{1-0}   = Zd{1-0};
+}
+
+//===----------------------------------------------------------------------===//
+// Lookup table read with 6-bit indices (8-bit)
+class sme2_luti6_vector_vg4_base<RegisterOperand zd_ty, string asm>
+  : I<(outs zd_ty:$Zd), (ins ZZ_h:$Zn, ZZ_Any:$Zm, VectorIndexD:$i1),
+    asm, "\t$Zd, $Zn, $Zm$i1", "", []>, Sched<[]> {
+  bits<3> Zd;
+  bits<5> Zn;
+  bits<5> Zm;
+  bits<1> i1;
+  let Inst{31-23} = 0b110000010;
+  let Inst{22}    = i1;
+  let Inst{21}    = 0b1;
+  let Inst{20-16} = Zm;
+  let Inst{9-5}   = Zn;
+}
+
+class sme2_luti6_vector_vg4_consecutive<string asm>
+  : sme2_luti6_vector_vg4_base<ZZZZ_h_mul_r, asm> {
+  let Inst{15-10} = 0b111101;
+  let Inst{4-2}   = Zd;
+  let Inst{1-0}   = 0b00;
+}
+
+class sme2_luti6_vector_vg4_strided<string asm>
+  : sme2_luti6_vector_vg4_base<ZZZZ_h_strided, asm> {
+  let Inst{15-10} = 0b111111;
+  let Inst{4}     = Zd{2};
+  let Inst{3-2}   = 0b00;
+  let Inst{1-0}   = Zd{1-0};
+}
+
 //===----------------------------------------------------------------------===//
 // SME2 MOV
 class sme2_mova_vec_to_tile_vg2_multi_base<bits<2> sz, bit v,
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 3cdd505..1664f4a 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -3787,7 +3787,7 @@ multiclass sve2p1_two_way_dot_vv<string mnemonic, bit u, SDPatternOperator intri
 // SVE Integer Dot Product Group - Indexed Group
 //===----------------------------------------------------------------------===//
 
-class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
+class sve_intx_dot_by_indexed_elem<bit U, string asm,
                                    ZPRRegOp zprty1, ZPRRegOp zprty2,
                                    ZPRRegOp zprty3, Operand itype>
 : I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
@@ -3795,8 +3795,7 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
   "", []>, Sched<[]> {
   bits<5> Zda;
   bits<5> Zn;
-  let Inst{31-23} = 0b010001001;
-  let Inst{22}    = sz;
+  let Inst{31-24} = 0b01000100;
   let Inst{21}    = 0b1;
   let Inst{15-11} = 0;
   let Inst{10}    = U;
@@ -3810,16 +3809,18 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
 
 multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,
                                         SDPatternOperator op> {
-  def _BtoS : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> {
+  def _BtoS : sve_intx_dot_by_indexed_elem<opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> {
     bits<2> iop;
     bits<3> Zm;
+    let Inst{23-22} = 0b10;
     let Inst{20-19} = iop;
     let Inst{18-16} = Zm;
   }
-  def _HtoD : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> {
+  def _HtoD : sve_intx_dot_by_indexed_elem<opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> {
     bits<1> iop;
     bits<4> Zm;
-    let Inst{20} = iop;
+    let Inst{23-22} = 0b11;
+    let Inst{20}    = iop;
     let Inst{19-16} = Zm;
   }
 
@@ -3827,6 +3828,16 @@ multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,
   def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _HtoD)>;
 }
 
+class sve_intx_dot_by_indexed_elem_x<bit opc, string asm>
+: sve_intx_dot_by_indexed_elem<opc, asm, ZPR16, ZPR8, ZPR3b8, VectorIndexH32b_timm> {
+ bits<3> iop;
+ bits<3> Zm;
+ let Inst{23}    = 0b0;
+ let Inst{22}    = iop{2};
+ let Inst{20-19} = iop{1-0};
+ let Inst{18-16} = Zm;
+}
+
 //===----------------------------------------------------------------------===//
 // SVE2 Complex Integer Dot Product Group
 //===----------------------------------------------------------------------===//
@@ -4085,7 +4096,7 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
   bits<5> Zdn;
   let Inst{31-24} = 0b01000100;
   let Inst{23-22} = sz;
-  let Inst{21-20} = 0b01;
+  let Inst{21}    = 0b0;
   let Inst{20-16} = opc{5-1};
   let Inst{15-14} = 0b10;
   let Inst{13}    = opc{0};
@@ -4590,15 +4601,15 @@ multiclass sve2_int_cadd<bit opc, string asm, SDPatternOperator op> {
   def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, complexrotateopodd, !cast<Instruction>(NAME # _D)>;
 }
 
-class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
+class sve2_int_absdiff_accum<bits<3> sz, bits<4> opc, string asm,
                              ZPRRegOp zprty1, ZPRRegOp zprty2>
 : I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
   asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
   bits<5> Zda;
   bits<5> Zn;
   bits<5> Zm;
-  let Inst{31-24} = 0b01000101;
-  let Inst{23-22} = sz;
+  let Inst{31-25} = 0b0100010;
+  let Inst{24-22} = sz;
   let Inst{21}    = 0b0;
   let Inst{20-16} = Zm;
   let Inst{15-14} = 0b11;
@@ -4613,10 +4624,10 @@ class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
 }
 
 multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> {
-  def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>;
-  def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>;
-  def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>;
-  def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>;
+  def _B : sve2_int_absdiff_accum<0b100, { 0b111, opc }, asm, ZPR8, ZPR8>;
+  def _H : sve2_int_absdiff_accum<0b101, { 0b111, opc }, asm, ZPR16, ZPR16>;
+  def _S : sve2_int_absdiff_accum<0b110, { 0b111, opc }, asm, ZPR32, ZPR32>;
+  def _D : sve2_int_absdiff_accum<0b111, { 0b111, opc }, asm, ZPR64, ZPR64>;
 
   def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
   def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
@@ -4626,20 +4637,26 @@ multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> {
 
 multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm,
                                        SDPatternOperator op> {
-  def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
-  def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
-  def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
+  def _H : sve2_int_absdiff_accum<0b101, { 0b00, opc }, asm, ZPR16, ZPR8>;
+  def _S : sve2_int_absdiff_accum<0b110, { 0b00, opc }, asm, ZPR32, ZPR16>;
+  def _D : sve2_int_absdiff_accum<0b111, { 0b00, opc }, asm, ZPR64, ZPR32>;
 
   def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
   def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
   def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
 }
 
+multiclass sve2_int_two_way_absdiff_accum_long<bit U, string asm> {
+  def _BtoH : sve2_int_absdiff_accum<0b001, { 0b01, U, 0b1 }, asm, ZPR16, ZPR8>;
+  def _HtoS : sve2_int_absdiff_accum<0b010, { 0b01, U, 0b1 }, asm, ZPR32, ZPR16>;
+  def _StoD : sve2_int_absdiff_accum<0b011, { 0b01, U, 0b1 }, asm, ZPR64, ZPR32>;
+}
+
 multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm,
                                       SDPatternOperator op> {
-  def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
+  def _S : sve2_int_absdiff_accum<{ 0b1, opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
                                   ZPR32, ZPR32>;
-  def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
+  def _D : sve2_int_absdiff_accum<{ 0b1, opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
                                   ZPR64, ZPR64>;
 
   def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
@@ -9610,17 +9627,18 @@ multiclass sve_int_dot_mixed_indexed<bit U, string asm, SDPatternOperator op> {
 // SVE Floating Point Matrix Multiply Accumulate Group
 //===----------------------------------------------------------------------===//
 
-class sve_fp_matrix_mla<bits<2> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_ty>
+class sve_fp_matrix_mla<bits<3> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_ty>
 : I<(outs zda_ty:$Zda), (ins zda_ty:$_Zda, reg_ty:$Zn, reg_ty:$Zm),
     asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
   bits<5> Zda;
   bits<5> Zn;
   bits<5> Zm;
   let Inst{31-24} = 0b01100100;
-  let Inst{23-22} = opc;
+  let Inst{23-22} = opc{2-1};
   let Inst{21}    = 1;
   let Inst{20-16} = Zm;
-  let Inst{15-10} = 0b111001;
+  let Inst{15-11} = 0b11100;
+  let Inst{10}    = opc{0};
   let Inst{9-5}   = Zn;
   let Inst{4-0}   = Zda;
 
@@ -9630,10 +9648,12 @@ class sve_fp_matrix_mla<bits<2> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_t
   let mayRaiseFPException = 1;
 }
 
-multiclass sve_fp_matrix_mla<bits<2> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_ty, SDPatternOperator op, ValueType zda_vt, ValueType reg_vt> {
+multiclass sve_fp_matrix_mla<bits<3> opc, string asm, ZPRRegOp zda_ty,
+                             ZPRRegOp reg_ty, SDPatternOperator op,
+                             ValueType zda_vt, ValueType reg_vt> {
   def NAME : sve_fp_matrix_mla<opc, asm, zda_ty, reg_ty>;
 
-  def : SVE_3_Op_Pat<zda_vt, op , zda_vt, reg_vt, reg_vt, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_Pat<zda_vt, op, zda_vt, reg_vt, reg_vt, !cast<Instruction>(NAME)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -10030,18 +10050,19 @@ multiclass sve2p1_multi_vec_extract_narrow<string mnemonic, bits<2> opc, SDPatte
 }
 
 // SVE2 multi-vec shift narrow
-class sve2p1_multi_vec_shift_narrow<string mnemonic, bits<3> opc, bits<2> tsz>
-    : I<(outs ZPR16:$Zd), (ins ZZ_s_mul_r:$Zn, vecshiftR16:$imm4),
-        mnemonic, "\t$Zd, $Zn, $imm4",
+class sve2p1_multi_vec_shift_narrow<string mnemonic, ZPRRegOp ZdRC, RegisterOperand ZSrcOp,
+                                    Operand immtype, bits<3> opc, bits<2> tsz>
+    : I<(outs ZdRC:$Zd), (ins ZSrcOp:$Zn, immtype:$imm),
+        mnemonic, "\t$Zd, $Zn, $imm",
         "", []>, Sched<[]> {
   bits<5> Zd;
   bits<4> Zn;
-  bits<4> imm4;
+  bits<4> imm;
   let Inst{31-23} = 0b010001011;
   let Inst{22}    = tsz{1};
   let Inst{21}    = 0b1;
   let Inst{20}    = tsz{0};
-  let Inst{19-16} = imm4;
+  let Inst{18-16} = imm{2-0};  // imm3
   let Inst{15-14} = 0b00;
   let Inst{13-11} = opc;
   let Inst{10}    = 0b0;
@@ -10052,12 +10073,19 @@ class sve2p1_multi_vec_shift_narrow<string mnemonic, bits<3> opc, bits<2> tsz>
   let hasSideEffects = 0;
 }
 
-multiclass sve2p1_multi_vec_shift_narrow<string mnemonic, bits<3> opc, SDPatternOperator intrinsic> {
-  def NAME : sve2p1_multi_vec_shift_narrow<mnemonic, opc, 0b01>;
+multiclass sve_multi_vec_shift_narrow<string mnemonic, bits<3> opc, SDPatternOperator intrinsic> {
+  def NAME : sve2p1_multi_vec_shift_narrow<mnemonic, ZPR16, ZZ_s_mul_r, vecshiftR16, opc, 0b01> {
+    let Inst{19} = imm{3}; // imm4
+  }
 
   def : SVE2p1_Sat_Shift_VG2_Pat<NAME, intrinsic, nxv8i16, nxv4i32, vecshiftR16>;
 }
 
+multiclass sve_multi_vec_round_shift_narrow<string mnemonic, bits<3> opc> {
+  def NAME : sve2p1_multi_vec_shift_narrow<mnemonic, ZPR8, ZZ_h_mul_r, vecshiftR8, opc, 0b00> {
+    let Inst{19} = 0b1;    // always 1 for imm3 version
+  }
+}
 
 // SME2 multi-vec contiguous load (scalar plus scalar, two registers)
 class sve2p1_mem_cld_ss_2z<string mnemonic, bits<2> msz, bit n,
@@ -11164,7 +11192,7 @@ multiclass sve2_fp8_dot_indexed_s<string asm, SDPatternOperator op> {
   def : SVE_4_Op_Pat<nxv4f32, op, nxv4f32, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>;
 }
 
-// FP8 Look up table
+// Look up table
 class sve2_lut_vector_index<ZPRRegOp zd_ty, RegisterOperand zn_ty,
                             Operand idx_ty, bits<4>opc, string mnemonic>
     : I<(outs zd_ty:$Zd), (ins zn_ty:$Zn, ZPRAny:$Zm, idx_ty:$idx),
@@ -11183,7 +11211,7 @@ class sve2_lut_vector_index<ZPRRegOp zd_ty, RegisterOperand zn_ty,
   let Inst{4-0}   = Zd;
 }
 
-// FP8 Look up table read with 2-bit indices
+// Look up table read with 2-bit indices
 multiclass sve2_luti2_vector_index<string mnemonic> {
   def _B : sve2_lut_vector_index<ZPR8, Z_b, VectorIndexS32b, {?, 0b100}, mnemonic> {
     bits<2> idx;
@@ -11205,7 +11233,7 @@ multiclass sve2_luti2_vector_index<string mnemonic> {
                          i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;
 }
 
-// FP8 Look up table read with 4-bit indices
+// Look up table read with 4-bit indices
 multiclass sve2_luti4_vector_index<string mnemonic> {
   def _B : sve2_lut_vector_index<ZPR8, Z_b, VectorIndexD32b, 0b1001, mnemonic> {
     bit idx;
@@ -11226,7 +11254,7 @@ multiclass sve2_luti4_vector_index<string mnemonic> {
                          i32, timm32_0_3, !cast<Instruction>(NAME # _H)>;
 }
 
-// FP8 Look up table read with 4-bit indices (two contiguous registers)
+// Look up table read with 4-bit indices (two contiguous registers)
 multiclass sve2_luti4_vector_vg2_index<string mnemonic> {
   def NAME : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexS32b, {?, 0b101}, mnemonic> {
     bits<2> idx;
@@ -11250,6 +11278,29 @@ multiclass sve2_luti4_vector_vg2_index<string mnemonic> {
                                                 nxv16i8:$Op3, timm32_0_3:$Op4))>;
 }
 
+// Look up table read with 6-bit indices
+multiclass sve2_luti6_vector_index<string mnemonic> {
+  def _H : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexD32b, 0b1011, mnemonic> {
+    bit idx;
+    let Inst{23} = idx;
+  }
+}
+
+// Look up table
+class sve2_luti6_vector<string mnemonic>
+    : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm),
+      mnemonic, "\t$Zd, $Zn, $Zm",
+      "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-21} = 0b01000101001;
+  let Inst{20-16} = Zm;
+  let Inst{15-10} = 0b101011;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+}
+
 //===----------------------------------------------------------------------===//
 // Checked Pointer Arithmetic (FEAT_CPA)
 //===----------------------------------------------------------------------===//
@@ -11280,3 +11331,49 @@ class sve_int_mla_cpa<string asm>
 
   let ElementSize = ZPR64.ElementSize;
 }
+
+//===----------------------------------------------------------------------===//
+// FP to Int down-converts
+//===----------------------------------------------------------------------===//
+class sve2_fp_to_int_downcvt<string asm, ZPRRegOp ZdRC, RegisterOperand ZSrcOp, bits<2> size, bit U>
+  : I<(outs ZdRC:$Zd), (ins ZSrcOp:$Zn),
+      asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<4> Zn;
+  let Inst{31-24} = 0b01100101;
+  let Inst{23-22} = size;
+  let Inst{21-11} = 0b00110100110;
+  let Inst{10}    = U;
+  let Inst{9-6}   = Zn;
+  let Inst{5}     = 0b0;
+  let Inst{4-0}   = Zd;
+}
+
+multiclass sve2_fp_to_int_downcvt<string asm, bit U> {
+  def _HtoB : sve2_fp_to_int_downcvt<asm, ZPR8,  ZZ_h_mul_r, 0b01, U>;
+  def _StoH : sve2_fp_to_int_downcvt<asm, ZPR16, ZZ_s_mul_r, 0b10, U>;
+  def _DtoS : sve2_fp_to_int_downcvt<asm, ZPR32, ZZ_d_mul_r, 0b11, U>;
+}
+
+//===----------------------------------------------------------------------===//
+// Int to FP up-converts
+//===----------------------------------------------------------------------===//
+class sve2_int_to_fp_upcvt<string asm, ZPRRegOp ZdRC, ZPRRegOp ZnRC,
+                        bits<2> size, bits<2> U>
+  : I<(outs ZdRC:$Zd), (ins  ZnRC:$Zn),
+      asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  let Inst{31-24} = 0b01100101;
+  let Inst{23-22} = size;
+  let Inst{21-12} = 0b0011000011;
+  let Inst{11-10} = U;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+}
+
+multiclass sve2_int_to_fp_upcvt<string asm, bits<2> U> {
+  def _BtoH : sve2_int_to_fp_upcvt<asm, ZPR16, ZPR8,  0b01, U>;
+  def _HtoS : sve2_int_to_fp_upcvt<asm, ZPR32, ZPR16, 0b10, U>;
+  def _StoD : sve2_int_to_fp_upcvt<asm, ZPR64, ZPR32, 0b11, U>;
+}
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index d6cb0e8..268a229 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -139,6 +139,13 @@ namespace llvm {
 }
 
 namespace llvm {
+namespace AArch64CMHPriorityHint {
+#define GET_CMHPRIORITYHINT_IMPL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64CMHPriorityHint
+} // namespace llvm
+
+namespace llvm {
   namespace AArch64SysReg {
 #define GET_SysRegsList_IMPL
 #include "AArch64GenSystemOperands.inc"
@@ -190,6 +197,32 @@ namespace AArch64TLBIP {
 #define GET_TLBIPTable_IMPL
 #include "AArch64GenSystemOperands.inc"
 } // namespace AArch64TLBIP
+
+namespace AArch64MLBI {
+#define GET_MLBITable_IMPL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64MLBI
+} // namespace llvm
+
+namespace llvm {
+namespace AArch64GIC {
+#define GET_GICTable_IMPL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64GIC
+} // namespace llvm
+
+namespace llvm {
+namespace AArch64GICR {
+#define GET_GICRTable_IMPL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64GICR
+} // namespace llvm
+
+namespace llvm {
+namespace AArch64GSB {
+#define GET_GSBTable_IMPL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64GSB
 } // namespace llvm
 
 namespace llvm {
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index fea33ef..27812e9 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -409,6 +409,16 @@ struct SysAliasReg : SysAlias {
       : SysAlias(N, E, F), NeedsReg(R) {}
 };
 
+struct SysAliasOptionalReg : SysAlias {
+  bool NeedsReg;
+  bool OptionalReg;
+  constexpr SysAliasOptionalReg(const char *N, uint16_t E, bool R, bool O)
+      : SysAlias(N, E), NeedsReg(R), OptionalReg(O) {}
+  constexpr SysAliasOptionalReg(const char *N, uint16_t E, bool R, bool O,
+                                FeatureBitset F)
+      : SysAlias(N, E, F), NeedsReg(R), OptionalReg(O) {}
+};
+
 struct SysAliasImm : SysAlias {
   uint16_t ImmValue;
   constexpr SysAliasImm(const char *N, uint16_t E, uint16_t I)
@@ -677,6 +687,14 @@ namespace AArch64BTIHint {
 #include "AArch64GenSystemOperands.inc"
 }
 
+namespace AArch64CMHPriorityHint {
+struct CMHPriorityHint : SysAlias {
+  using SysAlias::SysAlias;
+};
+#define GET_CMHPRIORITYHINT_DECL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64CMHPriorityHint
+
 namespace AArch64SME {
 enum ToggleCondition : unsigned {
   Always,
@@ -788,21 +806,53 @@ namespace AArch64SysReg {
 }
 
 namespace AArch64TLBI {
-  struct TLBI : SysAliasReg {
-    using SysAliasReg::SysAliasReg;
-  };
-  #define GET_TLBITable_DECL
-  #include "AArch64GenSystemOperands.inc"
+struct TLBI : SysAliasOptionalReg {
+  using SysAliasOptionalReg::SysAliasOptionalReg;
+};
+#define GET_TLBITable_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64TLBIP {
-struct TLBIP : SysAliasReg {
-  using SysAliasReg::SysAliasReg;
+struct TLBIP : SysAliasOptionalReg {
+  using SysAliasOptionalReg::SysAliasOptionalReg;
 };
 #define GET_TLBIPTable_DECL
 #include "AArch64GenSystemOperands.inc"
 } // namespace AArch64TLBIP
 
+namespace AArch64MLBI {
+struct MLBI : SysAliasReg {
+  using SysAliasReg::SysAliasReg;
+};
+#define GET_MLBITable_DECL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64MLBI
+
+namespace AArch64GIC {
+struct GIC : SysAliasReg {
+  using SysAliasReg::SysAliasReg;
+};
+#define GET_GICTable_DECL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64GIC
+
+namespace AArch64GICR {
+struct GICR : SysAliasReg {
+  using SysAliasReg::SysAliasReg;
+};
+#define GET_GICRTable_DECL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64GICR
+
+namespace AArch64GSB {
+struct GSB : SysAlias {
+  using SysAlias::SysAlias;
+};
+#define GET_GSBTable_DECL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64GSB
+
 namespace AArch64II {
 /// Target Operand Flag enum.
 enum TOF {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8ed4062..1b559a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
       MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 
-  setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
-                     MVT::i32, Legal);
+  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
+                     Legal);
 
   setOperationAction(
       {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 99ba043..5580e4c 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1860,7 +1860,6 @@ private:
   bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands,
                               const unsigned CPol);
   bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
-  bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands);
   bool validateLdsDirect(const MCInst &Inst, const OperandVector &Operands);
   bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
   unsigned getConstantBusLimit(unsigned Opcode) const;
@@ -5506,22 +5505,6 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
   return true;
 }
 
-bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst,
-                                         const OperandVector &Operands) {
-  if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12)
-    return true;
-
-  int Simm16Pos =
-      AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16);
-  if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) {
-    SMLoc Loc = Operands[1]->getStartLoc();
-    Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]");
-    return false;
-  }
-
-  return true;
-}
-
 bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
                                    const OperandVector &Operands) {
   unsigned Opc = Inst.getOpcode();
@@ -5681,9 +5664,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, SMLoc IDLoc,
   if (!validateTFE(Inst, Operands)) {
     return false;
   }
-  if (!validateSetVgprMSB(Inst, Operands)) {
-    return false;
-  }
   if (!validateWMMA(Inst, Operands)) {
     return false;
   }
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 09ef6ac..2aa54c9 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -45,9 +45,6 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
   // Legalize loads and stores to the private address space.
   setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom);
 
-  // 32-bit ABS is legal for AMDGPU except for R600
-  setOperationAction(ISD::ABS, MVT::i32, Expand);
-
   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
   // spaces, so it is custom lowered to handle those where it isn't.
   for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a757421..be42291 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -298,7 +298,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BR_CC,
                      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
 
-  setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
+  setOperationAction({ISD::ABS, ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
 
   setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ee10190..05ba76a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -976,10 +976,10 @@ def : GCNPat <
 } // End SubtargetPredicate = HasLshlAddU64Inst
 
 let SubtargetPredicate = HasAddMinMaxInsts in {
-def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
-def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
-def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>;
-def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>;
 }
 
 def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index c4692b7..4ae2c1e 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -464,10 +464,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
 >;
 
 let SubtargetPredicate = HasPkAddMinMaxInsts in {
-def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
-def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
-def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
-def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smax, V_PK_ADD_MAX_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>;
 }
 
 let SubtargetPredicate = HasPkMinMax3Insts in {
diff --git a/llvm/lib/Target/ARM/ARMArchitectures.td b/llvm/lib/Target/ARM/ARMArchitectures.td
index 301ed5b..bfcecfe 100644
--- a/llvm/lib/Target/ARM/ARMArchitectures.td
+++ b/llvm/lib/Target/ARM/ARMArchitectures.td
@@ -297,6 +297,18 @@ def ARMv96a   : Architecture<"armv9.6-a", "ARMv96a",  [HasV9_6aOps,
                                                        FeatureCRC,
                                                        FeatureRAS,
                                                        FeatureDotProd]>;
+def ARMv97a   : Architecture<"armv9.7-a", "ARMv97a",  [HasV9_7aOps,
+                                                       FeatureAClass,
+                                                       FeatureDB,
+                                                       FeatureFPARMv8,
+                                                       FeatureNEON,
+                                                       FeatureDSP,
+                                                       FeatureTrustZone,
+                                                       FeatureMP,
+                                                       FeatureVirtualization,
+                                                       FeatureCRC,
+                                                       FeatureRAS,
+                                                       FeatureDotProd]>;
 def ARMv8r    : Architecture<"armv8-r",   "ARMv8r",   [HasV8Ops,
                                                        FeatureRClass,
                                                        FeatureDB,
diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
index 9b1fa5d..e562b21 100644
--- a/llvm/lib/Target/ARM/ARMFeatures.td
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -712,6 +712,11 @@ def HasV9_6aOps   : SubtargetFeature<"v9.6a", "HasV9_6aOps", "true",
                                    "Support ARM v9.6a instructions",
                                    [HasV9_5aOps]>;
 
+// Armv9.7-A is a v9-only architecture.
+def HasV9_7aOps   : SubtargetFeature<"v9.7a", "HasV9_7aOps", "true",
+                                   "Support ARM v9.7a instructions",
+                                   [HasV9_6aOps]>;
+
 def HasV8_1MMainlineOps : SubtargetFeature<
                "v8.1m.main", "HasV8_1MMainlineOps", "true",
                "Support ARM v8-1M Mainline instructions",
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 0796746..94b511a 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -895,6 +895,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
   case ARM::ArchKind::ARMV9_4A:
   case ARM::ArchKind::ARMV9_5A:
   case ARM::ArchKind::ARMV9_6A:
+  case ARM::ArchKind::ARMV9_7A:
     S.setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
     S.setAttributeItem(ARM_ISA_use, Allowed, false);
     S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 02fb905..4a2f714 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -1504,14 +1504,26 @@ let Defs = [SREG], hasSideEffects = 0 in
 def FRMIDX : Pseudo<(outs DLDREGS:$dst), (ins DLDREGS:$src, i16imm:$src2),
                     "frmidx\t$dst, $src, $src2", []>;
 
+// The instructions STDSPQRr and STDWSPQRr are used to store to the stack
+// frame. The most accurate implementation would be to load the SP into
+// a temporary pointer variable and then STDPtrQRr. However for efficiency,
+// we assume that R29R28 contains the current call frame pointer.
+// However in the PEI pass we sometimes rewrite a ADJCALLSTACKDOWN pseudo,
+// plus one or more STDSPQRr/STDWSPQRr pseudo instructions to use Z for a
+// stack adjustment then as a base pointer. To avoid corruption, we thus
+// specify special classes of registers, like GPR8 and DREGS, but with
+// the Z register removed, as the source/input to these instructions.
 // This pseudo is either converted to a regular store or a push which clobbers
 // SP.
-def STDSPQRr : StorePseudo<(outs), (ins memspi:$dst, GPR8:$src),
+let Defs = [SP], Uses = [SP], hasSideEffects = 0 in
+def STDSPQRr : StorePseudo<(outs), (ins memspi:$dst, GPR8NOZ:$src),
                            "stdstk\t$dst, $src", [(store i8:$src, addr:$dst)]>;
 
+// See the comment on STDSPQRr.
 // This pseudo is either converted to a regular store or a push which clobbers
 // SP.
-def STDWSPQRr : StorePseudo<(outs), (ins memspi:$dt, DREGS:$src),
+let Defs = [SP], Uses = [SP], hasSideEffects = 0 in
+def STDWSPQRr : StorePseudo<(outs), (ins memspi:$dt, DREGSNOZ:$src),
                             "stdwstk\t$dt, $src", [(store i16:$src, addr:$dt)]>;
 
 // SP read/write pseudos.
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td
index 182f92c..9b935b1 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.td
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -211,6 +211,31 @@ def PTRDISPREGS : RegisterClass<"AVR", [i16], 8, (add R31R30, R29R28), ptr>;
 // model this using a register class containing only the Z register.
 def ZREG : RegisterClass<"AVR", [i16], 8, (add R31R30)>;
 
+// general registers excluding Z register lo/hi, these are the only
+// registers that are always safe for STDSPQr instructions
+def GPR8NOZ : RegisterClass<"AVR", [i8], 8,
+                         (// Return value and argument registers.
+                          add R24, R25, R18, R19, R20, R21, R22, R23,
+                          // Scratch registers.
+                          R26, R27,
+                          // Callee saved registers.
+                          R28, R29, R17, R16, R15, R14, R13, R12, R11, R10,
+                          R9, R8, R7, R6, R5, R4, R3, R2, R0, R1)>;
+
+// 16-bit pair register class excluding Z register lo/hi, these are the only
+// registers that are always safe for STDWSPQr instructions
+def DREGSNOZ : RegisterClass<"AVR", [i16], 8,
+                          (// Return value and arguments.
+                           add R25R24, R19R18, R21R20, R23R22,
+                           // Scratch registers.
+                           R27R26,
+                           // Callee saved registers.
+                           R29R28, R17R16, R15R14, R13R12, R11R10, R9R8,
+                           R7R6, R5R4, R3R2, R1R0,
+                           // Pseudo regs for unaligned 16-bits
+                           R26R25, R24R23, R22R21, R20R19, R18R17, R16R15,
+                           R14R13, R12R11, R10R9)>;
+
 // Register class used for the stack read pseudo instruction.
 def GPRSP : RegisterClass<"AVR", [i16], 8, (add SP)>;
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 26fe9ed..219e3f2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14797,7 +14797,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       // to NEGW+MAX here requires a Freeze which breaks ComputeNumSignBits.
       SDValue Src = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64,
                                 N->getOperand(0));
-      SDValue Abs = DAG.getNode(RISCVISD::ABSW, DL, MVT::i64, Src);
+      SDValue Abs = DAG.getNode(RISCVISD::NEGW_MAX, DL, MVT::i64, Src);
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Abs));
       return;
     }
@@ -21813,7 +21813,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
     // Output is either all zero or operand 0. We can propagate sign bit count
     // from operand 0.
     return DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
-  case RISCVISD::ABSW: {
+  case RISCVISD::NEGW_MAX: {
     // We expand this at isel to negw+max. The result will have 33 sign bits
     // if the input has at least 33 sign bits.
     unsigned Tmp =
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 4104abd..4c2f7f6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -482,7 +482,7 @@ let Predicates = [HasVendorXSfvfwmaccqqq] in {
   defm SF_VFWMACC_4x4x4 : VPseudoSiFiveVFWMACC;
 }
 
-let Predicates = [HasVendorXSfvfnrclipxfqf] in {
+let Predicates = [HasVendorXSfvfnrclipxfqf], AltFmtType = IS_NOT_ALTFMT in {
   defm SF_VFNRCLIP_XU_F_QF : VPseudoSiFiveVFNRCLIP;
   defm SF_VFNRCLIP_X_F_QF : VPseudoSiFiveVFNRCLIP;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 62b7bcd..6b9a75f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -51,7 +51,7 @@ def riscv_zip     : RVSDNode<"ZIP",     SDTIntUnaryOp>;
 def riscv_unzip   : RVSDNode<"UNZIP",   SDTIntUnaryOp>;
 
 // RV64IZbb absolute value for i32. Expanded to (max (negw X), X) during isel.
-def riscv_absw    : RVSDNode<"ABSW",    SDTIntUnaryOp>;
+def riscv_negw_max : RVSDNode<"NEGW_MAX",    SDTIntUnaryOp>;
 
 // Scalar cryptography
 def riscv_clmul   : RVSDNode<"CLMUL",   SDTIntBinOp>;
@@ -610,7 +610,7 @@ def : PatGpr<riscv_clzw, CLZW>;
 def : PatGpr<riscv_ctzw, CTZW>;
 def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>;
 
-def : Pat<(i64 (riscv_absw GPR:$rs1)),
+def : Pat<(i64 (riscv_negw_max GPR:$rs1)),
           (MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>;
 } // Predicates = [HasStdExtZbb, IsRV64]
 
diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp
index 0fce5b9..709e5f0 100644
--- a/llvm/lib/TargetParser/ARMTargetParser.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParser.cpp
@@ -88,6 +88,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) {
   case ArchKind::ARMV9_4A:
   case ArchKind::ARMV9_5A:
   case ArchKind::ARMV9_6A:
+  case ArchKind::ARMV9_7A:
     return 9;
   case ArchKind::INVALID:
     return 0;
@@ -127,6 +128,7 @@ static ARM::ProfileKind getProfileKind(ARM::ArchKind AK) {
   case ARM::ArchKind::ARMV9_4A:
   case ARM::ArchKind::ARMV9_5A:
   case ARM::ArchKind::ARMV9_6A:
+  case ARM::ArchKind::ARMV9_7A:
     return ARM::ProfileKind::A;
   case ARM::ArchKind::ARMV4:
   case ARM::ArchKind::ARMV4T:
diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
index f6cea85..15ba1eb 100644
--- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
@@ -46,6 +46,7 @@ StringRef ARM::getArchSynonym(StringRef Arch) {
       .Case("v9.4a", "v9.4-a")
       .Case("v9.5a", "v9.5-a")
       .Case("v9.6a", "v9.6-a")
+      .Case("v9.7a", "v9.7-a")
       .Case("v8m.base", "v8-m.base")
       .Case("v8m.main", "v8-m.main")
       .Case("v8.1m.main", "v8.1-m.main")
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 1068ce4..11ba9ee 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -937,6 +937,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
     return Triple::ARMSubArch_v9_5a;
   case ARM::ArchKind::ARMV9_6A:
     return Triple::ARMSubArch_v9_6a;
+  case ARM::ArchKind::ARMV9_7A:
+    return Triple::ARMSubArch_v9_7a;
   case ARM::ArchKind::ARMV8R:
     return Triple::ARMSubArch_v8r;
   case ARM::ArchKind::ARMV8MBaseline:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3e85e6f..c385c36 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4119,7 +4119,7 @@ static bool isAlreadyNarrow(VPValue *VPV) {
 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                              unsigned VectorRegWidth) {
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
-  if (!VectorLoop)
+  if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
     return;
 
   VPTypeAnalysis TypeInfo(Plan);
diff --git a/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll
index 1e9a5f7..c0410cf 100644
--- a/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/add-cast-vect.ll
@@ -1,4 +1,6 @@
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
 ; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
 ; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM
 
@@ -15,13 +17,18 @@ target triple = "armv7--linux-gnueabihf"
 %T464 = type <4 x i64>
 
 define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'direct'
+; COST-LABEL: 'direct'
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
   %v0 = load %T432, ptr %loadaddr
 ; ASM: vld1.64
   %v1 = load %T432, ptr %loadaddr2
 ; ASM: vld1.64
-  %r3 = add %T432 %v0, %v1 
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+  %r3 = add %T432 %v0, %v1
 ; ASM: vadd.i32
   store %T432 %r3, ptr %storeaddr
 ; ASM: vst1.64
@@ -29,16 +36,22 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 }
 
 define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'ups1632'
+; COST-LABEL: 'ups1632'
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %r1, %r2
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
   %v0 = load %T416, ptr %loadaddr
 ; ASM: vldr
   %v1 = load %T416, ptr %loadaddr2
 ; ASM: vldr
   %r1 = sext %T416 %v0 to %T432
   %r2 = sext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
-  %r3 = add %T432 %r1, %r2 
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+  %r3 = add %T432 %r1, %r2
 ; ASM: vaddl.s16
   store %T432 %r3, ptr %storeaddr
 ; ASM: vst1.64
@@ -46,16 +59,22 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 }
 
 define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'upu1632'
+; COST-LABEL: 'upu1632'
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %r1, %r2
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
   %v0 = load %T416, ptr %loadaddr
 ; ASM: vldr
   %v1 = load %T416, ptr %loadaddr2
 ; ASM: vldr
   %r1 = zext %T416 %v0 to %T432
   %r2 = zext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
-  %r3 = add %T432 %r1, %r2 
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+  %r3 = add %T432 %r1, %r2
 ; ASM: vaddl.u16
   store %T432 %r3, ptr %storeaddr
 ; ASM: vst1.64
@@ -63,51 +82,66 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 }
 
 define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'ups3264'
+; COST-LABEL: 'ups3264'
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = add <2 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
   %v0 = load %T232, ptr %loadaddr
 ; ASM: vldr
   %v1 = load %T232, ptr %loadaddr2
 ; ASM: vldr
-  %r3 = add %T232 %v0, %v1 
+  %r3 = add %T232 %v0, %v1
 ; ASM: vadd.i32
-; COST: cost of 1 for instruction: {{.*}} add <2 x i32>
   %st = sext %T232 %r3 to %T264
 ; ASM: vmovl.s32
-; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
   store %T264 %st, ptr %storeaddr
 ; ASM: vst1.64
   ret void
 }
 
 define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'upu3264'
+; COST-LABEL: 'upu3264'
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = add <2 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
   %v0 = load %T232, ptr %loadaddr
 ; ASM: vldr
   %v1 = load %T232, ptr %loadaddr2
 ; ASM: vldr
-  %r3 = add %T232 %v0, %v1 
+  %r3 = add %T232 %v0, %v1
 ; ASM: vadd.i32
-; COST: cost of 1 for instruction: {{.*}} add <2 x i32>
   %st = zext %T232 %r3 to %T264
 ; ASM: vmovl.u32
-; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
   store %T264 %st, ptr %storeaddr
 ; ASM: vst1.64
   ret void
 }
 
 define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
-; COST: function 'dn3216'
+; COST-LABEL: 'dn3216'
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = add <4 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
   %v0 = load %T432, ptr %loadaddr
 ; ASM: vld1.64
   %v1 = load %T432, ptr %loadaddr2
 ; ASM: vld1.64
-  %r3 = add %T432 %v0, %v1 
+  %r3 = add %T432 %v0, %v1
 ; ASM: vadd.i32
-; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
   %st = trunc %T432 %r3 to %T416
 ; ASM: vmovn.i32
-; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
   store %T416 %st, ptr %storeaddr
 ; ASM: vstr
   ret void
diff --git a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
index c2248c2..e5bbac6 100644
--- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 < %s | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 < %s | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
diff --git a/llvm/test/Analysis/CostModel/ARM/freeshift.ll b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
index 51e87b5..cd5c8c5 100644
--- a/llvm/test/Analysis/CostModel/ARM/freeshift.ll
+++ b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
@@ -1,23 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @shl(i32 %a, i32 %b) {
 ; CHECK-LABEL: 'shl'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = shl i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ss = shl i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %xs = shl i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ns = shl i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %os = shl i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %is = shl i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %as = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ac = add i32 %b, %as
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %ss = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %sc = sub i32 %b, %ss
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %xs = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %xc = xor i32 %b, %xs
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %ns = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %nc = and i32 %b, %ns
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %os = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %oc = or i32 %b, %os
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %is = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
   %as = shl i32 %a, 3
   %ac = add i32 %b, %as
@@ -36,19 +36,19 @@ define void @shl(i32 %a, i32 %b) {
 
 define void @ashr(i32 %a, i32 %b) {
 ; CHECK-LABEL: 'ashr'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = ashr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ss = ashr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %xs = ashr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ns = ashr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %os = ashr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %is = ashr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %as = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ac = add i32 %b, %as
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %ss = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %sc = sub i32 %b, %ss
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %xs = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %xc = xor i32 %b, %xs
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %ns = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %nc = and i32 %b, %ns
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %os = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %oc = or i32 %b, %os
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %is = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
   %as = ashr i32 %a, 3
   %ac = add i32 %b, %as
@@ -67,19 +67,19 @@ define void @ashr(i32 %a, i32 %b) {
 
 define void @lshr(i32 %a, i32 %b) {
 ; CHECK-LABEL: 'lshr'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = lshr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ss = lshr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %xs = lshr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ns = lshr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %os = lshr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %is = lshr i32 %a, 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %as = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ac = add i32 %b, %as
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %ss = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %sc = sub i32 %b, %ss
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %xs = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %xc = xor i32 %b, %xs
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %ns = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %nc = and i32 %b, %ns
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %os = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %oc = or i32 %b, %os
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %is = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
   %as = lshr i32 %a, 3
   %ac = add i32 %b, %as
diff --git a/llvm/test/Analysis/CostModel/ARM/gep.ll b/llvm/test/Analysis/CostModel/ARM/gep.ll
index 48de193..cce87a5 100644
--- a/llvm/test/Analysis/CostModel/ARM/gep.ll
+++ b/llvm/test/Analysis/CostModel/ARM/gep.ll
@@ -1,98 +1,98 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V6M
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-V7M-NOFP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-V7M-FP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-T32
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-A32
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V6M
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-V7M-NOFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-V7M-FP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-T32
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-A32
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @testi8(ptr %a, i32 %i) {
 ; CHECK-V6M-LABEL: 'testi8'
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-NOFP-LABEL: 'testi8'
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-FP-LABEL: 'testi8'
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'testi8'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'testi8'
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-T32-LABEL: 'testi8'
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-A32-LABEL: 'testi8'
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i8, ptr %a, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i8, ptr %a, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i8, ptr %a, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i8, ptr %a, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i8, ptr %a, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i8, ptr %a, i32 -1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i8, ptr %a, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i8, ptr %a, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i8, ptr %a, i32 4095
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i8, ptr %a, i32 4096
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i8, ptr %a, i32 -255
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i8, ptr %a, i32 -256
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i8, ptr %a, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a1 = getelementptr inbounds i8, ptr %a, i32 1
   %am4 = getelementptr inbounds i8, ptr %a, i32 -1
@@ -109,88 +109,88 @@ define void @testi8(ptr %a, i32 %i) {
 
 define void @testi16(ptr %a, i32 %i) {
 ; CHECK-V6M-LABEL: 'testi16'
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-NOFP-LABEL: 'testi16'
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-FP-LABEL: 'testi16'
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'testi16'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'testi16'
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-T32-LABEL: 'testi16'
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-A32-LABEL: 'testi16'
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i16, ptr %a, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i16, ptr %a, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i16, ptr %a, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i16, ptr %a, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i16, ptr %a, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i16, ptr %a, i32 -1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i16, ptr %a, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i16, ptr %a, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i16, ptr %a, i32 2046
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i16, ptr %a, i32 2048
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i16, ptr %a, i32 -127
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i16, ptr %a, i32 -128
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i16, ptr %a, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a1 = getelementptr inbounds i16, ptr %a, i32 1
   %am4 = getelementptr inbounds i16, ptr %a, i32 -1
@@ -207,88 +207,88 @@ define void @testi16(ptr %a, i32 %i) {
 
 define void @testi32(ptr %a, i32 %i) {
 ; CHECK-V6M-LABEL: 'testi32'
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-NOFP-LABEL: 'testi32'
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-FP-LABEL: 'testi32'
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'testi32'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'testi32'
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-T32-LABEL: 'testi32'
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-A32-LABEL: 'testi32'
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i32, ptr %a, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i32, ptr %a, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i32, ptr %a, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %ai = getelementptr inbounds i32, ptr %a, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i32, ptr %a, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i32, ptr %a, i32 -1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i32, ptr %a, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i32, ptr %a, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds i32, ptr %a, i32 1023
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds i32, ptr %a, i32 1024
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i32, ptr %a, i32 -63
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i32, ptr %a, i32 -64
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %ai = getelementptr inbounds i32, ptr %a, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a1 = getelementptr inbounds i32, ptr %a, i32 1
   %am4 = getelementptr inbounds i32, ptr %a, i32 -1
@@ -305,102 +305,102 @@ define void @testi32(ptr %a, i32 %i) {
 
 define void @testi64(ptr %a, i32 %i) {
 ; CHECK-V6M-LABEL: 'testi64'
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-NOFP-LABEL: 'testi64'
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-FP-LABEL: 'testi64'
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'testi64'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'testi64'
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-T32-LABEL: 'testi64'
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-A32-LABEL: 'testi64'
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds i64, ptr %a, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a15 = getelementptr inbounds i64, ptr %a, i32 15
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a16 = getelementptr inbounds i64, ptr %a, i32 16
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a31 = getelementptr inbounds i64, ptr %a, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a32 = getelementptr inbounds i64, ptr %a, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds i64, ptr %a, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds i64, ptr %a, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am4 = getelementptr inbounds i64, ptr %a, i32 -1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a15 = getelementptr inbounds i64, ptr %a, i32 15
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a16 = getelementptr inbounds i64, ptr %a, i32 16
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a31 = getelementptr inbounds i64, ptr %a, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a32 = getelementptr inbounds i64, ptr %a, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a4095 = getelementptr inbounds i64, ptr %a, i32 1023
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a4096 = getelementptr inbounds i64, ptr %a, i32 1024
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds i64, ptr %a, i32 -63
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds i64, ptr %a, i32 -64
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds i64, ptr %a, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a1 = getelementptr inbounds i64, ptr %a, i32 1
   %am4 = getelementptr inbounds i64, ptr %a, i32 -1
@@ -419,102 +419,102 @@ define void @testi64(ptr %a, i32 %i) {
 
 define void @testhalf(ptr %a, i32 %i) {
 ; CHECK-V6M-LABEL: 'testhalf'
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-NOFP-LABEL: 'testhalf'
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-FP-LABEL: 'testhalf'
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'testhalf'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'testhalf'
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-T32-LABEL: 'testhalf'
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-A32-LABEL: 'testhalf'
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1 = getelementptr inbounds half, ptr %a, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds half, ptr %a, i32 -1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds half, ptr %a, i32 255
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds half, ptr %a, i32 256
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds half, ptr %a, i32 -255
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds half, ptr %a, i32 -256
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds half, ptr %a, i32 -63
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds half, ptr %a, i32 -64
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds half, ptr %a, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a1 = getelementptr inbounds half, ptr %a, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds half, ptr %a, i32 -1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds half, ptr %a, i32 255
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds half, ptr %a, i32 256
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds half, ptr %a, i32 -255
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds half, ptr %a, i32 -256
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds half, ptr %a, i32 1023
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds half, ptr %a, i32 1024
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds half, ptr %a, i32 -63
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds half, ptr %a, i32 -64
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds half, ptr %a, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a1 = getelementptr inbounds half, ptr %a, i32 1
   %am1 = getelementptr inbounds half, ptr %a, i32 -1
@@ -533,102 +533,102 @@ define void @testhalf(ptr %a, i32 %i) {
 
 define void @testfloat(ptr %a, i32 %i) {
 ; CHECK-V6M-LABEL: 'testfloat'
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-NOFP-LABEL: 'testfloat'
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-FP-LABEL: 'testfloat'
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'testfloat'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'testfloat'
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-T32-LABEL: 'testfloat'
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-A32-LABEL: 'testfloat'
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds float, ptr %a, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds float, ptr %a, i32 -1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds float, ptr %a, i32 255
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds float, ptr %a, i32 256
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds float, ptr %a, i32 -255
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds float, ptr %a, i32 -256
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds float, ptr %a, i32 -63
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds float, ptr %a, i32 -64
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds float, ptr %a, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds float, ptr %a, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds float, ptr %a, i32 -1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds float, ptr %a, i32 255
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds float, ptr %a, i32 256
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds float, ptr %a, i32 -255
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds float, ptr %a, i32 -256
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds float, ptr %a, i32 1023
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds float, ptr %a, i32 1024
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds float, ptr %a, i32 -63
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds float, ptr %a, i32 -64
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds float, ptr %a, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a1 = getelementptr inbounds float, ptr %a, i32 1
   %am1 = getelementptr inbounds float, ptr %a, i32 -1
@@ -647,102 +647,102 @@ define void @testfloat(ptr %a, i32 %i) {
 
 define void @testdouble(ptr %a, i32 %i) {
 ; CHECK-V6M-LABEL: 'testdouble'
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-NOFP-LABEL: 'testdouble'
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-FP-LABEL: 'testdouble'
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'testdouble'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'testdouble'
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-T32-LABEL: 'testdouble'
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-A32-LABEL: 'testdouble'
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = getelementptr inbounds double, ptr %a, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am1 = getelementptr inbounds double, ptr %a, i32 -1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a255 = getelementptr inbounds double, ptr %a, i32 127
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a256 = getelementptr inbounds double, ptr %a, i32 128
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am255 = getelementptr inbounds double, ptr %a, i32 -127
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %am256 = getelementptr inbounds double, ptr %a, i32 -128
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1023 = getelementptr inbounds double, ptr %a, i32 511
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1024 = getelementptr inbounds double, ptr %a, i32 512
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am63 = getelementptr inbounds double, ptr %a, i32 -31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %am64 = getelementptr inbounds double, ptr %a, i32 -32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ai = getelementptr inbounds double, ptr %a, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a1 = getelementptr inbounds double, ptr %a, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am1 = getelementptr inbounds double, ptr %a, i32 -1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %a255 = getelementptr inbounds double, ptr %a, i32 127
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a256 = getelementptr inbounds double, ptr %a, i32 128
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am255 = getelementptr inbounds double, ptr %a, i32 -127
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %am256 = getelementptr inbounds double, ptr %a, i32 -128
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a1023 = getelementptr inbounds double, ptr %a, i32 511
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %a1024 = getelementptr inbounds double, ptr %a, i32 512
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am63 = getelementptr inbounds double, ptr %a, i32 -31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %am64 = getelementptr inbounds double, ptr %a, i32 -32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %ai = getelementptr inbounds double, ptr %a, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a1 = getelementptr inbounds double, ptr %a, i32 1
   %am1 = getelementptr inbounds double, ptr %a, i32 -1
@@ -761,375 +761,375 @@ define void @testdouble(ptr %a, i32 %i) {
 
 define void @testvecs(i32 %i) {
 ; CHECK-V6M-LABEL: 'testvecs'
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-V6M-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-V6M-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-NOFP-LABEL: 'testvecs'
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-V7M-NOFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-V7M-NOFP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-V7M-FP-LABEL: 'testvecs'
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-V7M-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-V7M-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'testvecs'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'testvecs'
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-MVEFP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-MVEFP-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-T32-LABEL: 'testvecs'
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-T32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-T32-NEXT:  Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-T32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-A32-LABEL: 'testvecs'
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
-; CHECK-A32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %b8 = getelementptr inbounds <4 x i16>, ptr undef, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %b9 = getelementptr inbounds <4 x i32>, ptr undef, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %b10 = getelementptr inbounds <4 x i64>, ptr undef, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %b11 = getelementptr inbounds <4 x half>, ptr undef, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %b12 = getelementptr inbounds <4 x float>, ptr undef, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %b13 = getelementptr inbounds <4 x double>, ptr undef, i32 1
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %o7 = getelementptr inbounds <4 x i8>, ptr undef, i32 4
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %o8 = getelementptr inbounds <4 x i16>, ptr undef, i32 4
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %o9 = getelementptr inbounds <4 x i32>, ptr undef, i32 4
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %o10 = getelementptr inbounds <4 x i64>, ptr undef, i32 4
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %o11 = getelementptr inbounds <4 x half>, ptr undef, i32 4
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %o12 = getelementptr inbounds <4 x float>, ptr undef, i32 4
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %o13 = getelementptr inbounds <4 x double>, ptr undef, i32 4
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %p7 = getelementptr inbounds <4 x i8>, ptr undef, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %p8 = getelementptr inbounds <4 x i16>, ptr undef, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %p9 = getelementptr inbounds <4 x i32>, ptr undef, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %p10 = getelementptr inbounds <4 x i64>, ptr undef, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %p11 = getelementptr inbounds <4 x half>, ptr undef, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %p12 = getelementptr inbounds <4 x float>, ptr undef, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %p13 = getelementptr inbounds <4 x double>, ptr undef, i32 31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %q7 = getelementptr inbounds <4 x i8>, ptr undef, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %q8 = getelementptr inbounds <4 x i16>, ptr undef, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %q9 = getelementptr inbounds <4 x i32>, ptr undef, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %q10 = getelementptr inbounds <4 x i64>, ptr undef, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %q11 = getelementptr inbounds <4 x half>, ptr undef, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %q12 = getelementptr inbounds <4 x float>, ptr undef, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %q13 = getelementptr inbounds <4 x double>, ptr undef, i32 32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %r7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %r8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %r9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %r10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %r11 = getelementptr inbounds <4 x half>, ptr undef, i32 -31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %r12 = getelementptr inbounds <4 x float>, ptr undef, i32 -31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %r13 = getelementptr inbounds <4 x double>, ptr undef, i32 -31
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %s7 = getelementptr inbounds <4 x i8>, ptr undef, i32 -32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %s8 = getelementptr inbounds <4 x i16>, ptr undef, i32 -32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %s9 = getelementptr inbounds <4 x i32>, ptr undef, i32 -32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %s10 = getelementptr inbounds <4 x i64>, ptr undef, i32 -32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %s11 = getelementptr inbounds <4 x half>, ptr undef, i32 -32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %s12 = getelementptr inbounds <4 x float>, ptr undef, i32 -32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %s13 = getelementptr inbounds <4 x double>, ptr undef, i32 -32
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %c7 = getelementptr inbounds <4 x i8>, ptr undef, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %c8 = getelementptr inbounds <4 x i16>, ptr undef, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %c9 = getelementptr inbounds <4 x i32>, ptr undef, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %c10 = getelementptr inbounds <4 x i64>, ptr undef, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %c11 = getelementptr inbounds <4 x half>, ptr undef, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %c12 = getelementptr inbounds <4 x float>, ptr undef, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of 1 for: %c13 = getelementptr inbounds <4 x double>, ptr undef, i32 %i
+; CHECK-A32-NEXT:  Cost Model: Found costs of 0 for: %d0 = getelementptr inbounds i8, ptr undef, i32 -1
+; CHECK-A32-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 
   %b7 = getelementptr inbounds <4 x i8>, ptr undef, i32 1
diff --git a/llvm/test/Analysis/CostModel/ARM/immediates.ll b/llvm/test/Analysis/CostModel/ARM/immediates.ll
index ed13636..cd42313 100644
--- a/llvm/test/Analysis/CostModel/ARM/immediates.ll
+++ b/llvm/test/Analysis/CostModel/ARM/immediates.ll
@@ -1,145 +1,53 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size  -mtriple=thumbv8m.base   | FileCheck %s --check-prefix=CHECK-T1-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size  -mtriple=thumbv8m.main   | FileCheck %s --check-prefix=CHECK-T2-SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency    -mtriple=thumbv8m.base   | FileCheck %s --check-prefix=CHECK-T1-LATENCY
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency    -mtriple=thumbv8m.main   | FileCheck %s --check-prefix=CHECK-T2-LATENCY
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mtriple=thumbv8m.base   | FileCheck %s --check-prefix=CHECK-T1-THROUGHPUT
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mtriple=thumbv8m.main   | FileCheck %s --check-prefix=CHECK-T2-THROUGHPUT
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output  -mtriple=thumbv8m.base   | FileCheck %s --check-prefix=CHECK-T1
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output  -mtriple=thumbv8m.main   | FileCheck %s --check-prefix=CHECK-T2
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define i32 @const_costs() {
-; CHECK-T1-SIZE-LABEL: 'const_costs'
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
+; CHECK-T1-LABEL: 'const_costs'
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %add_1 = add i32 undef, 1
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %add_32767 = add i32 undef, 32767
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %sub_1 = sub i32 undef, 1
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %sub_32768 = sub i32 undef, 32768
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %mul_2 = mul i32 undef, 2
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %mul_3 = mul i32 undef, 3
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %mul_27 = mul i32 undef, 27
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %and_255 = and i32 undef, 255
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %and_65535 = and i32 undef, 65535
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %and_1 = and i32 undef, 1
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %xor_1 = xor i32 undef, 1
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %xor_7 = xor i32 undef, 7
+; CHECK-T1-NEXT:  Cost Model: Found costs of 0 for: %gep_1 = getelementptr i32, ptr undef, i32 1
+; CHECK-T1-NEXT:  Cost Model: Found costs of 0 for: %gep_16 = getelementptr i32, ptr undef, i32 16
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %cmp_244 = icmp ne i32 undef, 244
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %cmp_256 = icmp uge i32 undef, 256
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: %cmp_1024 = icmp ult i32 undef, 1024
+; CHECK-T1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_1_0 = select i1 undef, i32 1, i32 0
+; CHECK-T1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_7_255 = select i1 undef, i32 7, i32 255
+; CHECK-T1-NEXT:  Cost Model: Found costs of 1 for: ret i32 1
 ;
-; CHECK-T2-SIZE-LABEL: 'const_costs'
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T1-LATENCY-LABEL: 'const_costs'
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T1-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T2-LATENCY-LABEL: 'const_costs'
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T2-LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T1-THROUGHPUT-LABEL: 'const_costs'
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T1-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
-;
-; CHECK-T2-THROUGHPUT-LABEL: 'const_costs'
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_1 = add i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %add_32767 = add i32 undef, 32767
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_1 = sub i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sub_32768 = sub i32 undef, 32768
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_2 = mul i32 undef, 2
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_3 = mul i32 undef, 3
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mul_27 = mul i32 undef, 27
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_255 = and i32 undef, 255
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_65535 = and i32 undef, 65535
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %and_1 = and i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_1 = xor i32 undef, 1
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xor_7 = xor i32 undef, 7
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_1 = getelementptr i32, ptr undef, i32 1
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep_16 = getelementptr i32, ptr undef, i32 16
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_244 = icmp ne i32 undef, 244
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_256 = icmp uge i32 undef, 256
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp_1024 = icmp ult i32 undef, 1024
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select_1_0 = select i1 undef, i32 1, i32 0
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select_7_255 = select i1 undef, i32 7, i32 255
-; CHECK-T2-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 1
+; CHECK-T2-LABEL: 'const_costs'
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %add_1 = add i32 undef, 1
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %add_32767 = add i32 undef, 32767
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %sub_1 = sub i32 undef, 1
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %sub_32768 = sub i32 undef, 32768
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %mul_2 = mul i32 undef, 2
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %mul_3 = mul i32 undef, 3
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %mul_27 = mul i32 undef, 27
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %and_255 = and i32 undef, 255
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %and_65535 = and i32 undef, 65535
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %and_1 = and i32 undef, 1
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %xor_1 = xor i32 undef, 1
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %xor_7 = xor i32 undef, 7
+; CHECK-T2-NEXT:  Cost Model: Found costs of 0 for: %gep_1 = getelementptr i32, ptr undef, i32 1
+; CHECK-T2-NEXT:  Cost Model: Found costs of 0 for: %gep_16 = getelementptr i32, ptr undef, i32 16
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %cmp_244 = icmp ne i32 undef, 244
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %cmp_256 = icmp uge i32 undef, 256
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: %cmp_1024 = icmp ult i32 undef, 1024
+; CHECK-T2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_1_0 = select i1 undef, i32 1, i32 0
+; CHECK-T2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %select_7_255 = select i1 undef, i32 7, i32 255
+; CHECK-T2-NEXT:  Cost Model: Found costs of 1 for: ret i32 1
 ;
   %add_1 = add i32 undef, 1
   %add_32767 = add i32 undef, 32767
diff --git a/llvm/test/Analysis/CostModel/ARM/insertelement.ll b/llvm/test/Analysis/CostModel/ARM/insertelement.ll
index 5a922dd..f14b200 100644
--- a/llvm/test/Analysis/CostModel/ARM/insertelement.ll
+++ b/llvm/test/Analysis/CostModel/ARM/insertelement.ll
@@ -1,4 +1,5 @@
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios6.0.0"
@@ -7,12 +8,16 @@ target triple = "thumbv7-apple-ios6.0.0"
 ; due to renaming constraints.
 %T_i8v = type <8 x i8>
 %T_i8 = type i8
-; CHECK: insertelement_i8
-define void @insertelement_i8(ptr %saddr,
-                           ptr %vaddr) {
+define void @insertelement_i8(ptr %saddr, ptr %vaddr) {
+; CHECK-LABEL: 'insertelement_i8'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <8 x i8>, ptr %vaddr, align 4
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i8, ptr %saddr, align 1
+; CHECK-NEXT:  Cost Model: Found costs of 3 for: %v2 = insertelement <8 x i8> %v0, i8 %v1, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: store <8 x i8> %v2, ptr %vaddr, align 4
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
   %v0 = load %T_i8v, ptr %vaddr
   %v1 = load %T_i8, ptr %saddr
-;CHECK: estimated cost of 3 for {{.*}} insertelement <8 x i8>
   %v2 = insertelement %T_i8v %v0, %T_i8 %v1, i32 1
   store %T_i8v %v2, ptr %vaddr
   ret void
@@ -21,12 +26,16 @@ define void @insertelement_i8(ptr %saddr,
 
 %T_i16v = type <4 x i16>
 %T_i16 = type i16
-; CHECK: insertelement_i16
-define void @insertelement_i16(ptr %saddr,
-                           ptr %vaddr) {
+define void @insertelement_i16(ptr %saddr, ptr %vaddr) {
+; CHECK-LABEL: 'insertelement_i16'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %vaddr, align 4
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i16, ptr %saddr, align 2
+; CHECK-NEXT:  Cost Model: Found costs of 3 for: %v2 = insertelement <4 x i16> %v0, i16 %v1, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: store <4 x i16> %v2, ptr %vaddr, align 4
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
   %v0 = load %T_i16v, ptr %vaddr
   %v1 = load %T_i16, ptr %saddr
-;CHECK: estimated cost of 3 for {{.*}} insertelement <4 x i16>
   %v2 = insertelement %T_i16v %v0, %T_i16 %v1, i32 1
   store %T_i16v %v2, ptr %vaddr
   ret void
@@ -34,12 +43,16 @@ define void @insertelement_i16(ptr %saddr,
 
 %T_i32v = type <2 x i32>
 %T_i32 = type i32
-; CHECK: insertelement_i32
-define void @insertelement_i32(ptr %saddr,
-                           ptr %vaddr) {
+define void @insertelement_i32(ptr %saddr, ptr %vaddr) {
+; CHECK-LABEL: 'insertelement_i32'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %vaddr, align 4
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load i32, ptr %saddr, align 4
+; CHECK-NEXT:  Cost Model: Found costs of 3 for: %v2 = insertelement <2 x i32> %v0, i32 %v1, i32 1
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: store <2 x i32> %v2, ptr %vaddr, align 4
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
   %v0 = load %T_i32v, ptr %vaddr
   %v1 = load %T_i32, ptr %saddr
-;CHECK: estimated cost of 3 for {{.*}} insertelement <2 x i32>
   %v2 = insertelement %T_i32v %v0, %T_i32 %v1, i32 1
   store %T_i32v %v2, ptr %vaddr
   ret void
diff --git a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
index 4404209..c98601f 100644
--- a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
@@ -1,17 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s
+
 ; Check memory cost model action for a load of an unusually sized integer
 ; follow by and a trunc to a register sized integer gives a cost of 1 rather
 ; than the expanded cost if it is not.  Currently, this target does not have
 ; that expansion.
 
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK
-
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %out = load i128, ptr %ptr, align 8
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %trunc = trunc i128 %out to i32
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret i32 %trunc
 ;
   %out = load i128, ptr %ptr
   %trunc = trunc i128 %out to i32
@@ -20,8 +20,8 @@ define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %out = load i128, ptr %ptr, align 8
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret i128 %out
 ;
   %out = load i128, ptr %ptr
   ret i128 %out
diff --git a/llvm/test/Analysis/CostModel/ARM/load_store.ll b/llvm/test/Analysis/CostModel/ARM/load_store.ll
index 4c322e9..dd2eaae 100644
--- a/llvm/test/Analysis/CostModel/ARM/load_store.ll
+++ b/llvm/test/Analysis/CostModel/ARM/load_store.ll
@@ -1,171 +1,117 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-NOVEC
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=CHECK-V8-SIZE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-NOVEC
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=arm-none-eabi -mcpu=cortex-a53 < %s | FileCheck %s --check-prefix=CHECK-NEON
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @stores() {
 ; CHECK-NOVEC-LABEL: 'stores'
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store double undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store double undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-FP-LABEL: 'stores'
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-FP-NEXT:  Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of 1 for: store double undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'stores'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of 1 for: store double undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-NEON-LABEL: 'stores'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-V8-SIZE-LABEL: 'stores'
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'stores'
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i128 undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> undef, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x double> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x float> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store i8 undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store i16 undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store i32 undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store i64 undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store i128 undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store float undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store double undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i8> undef, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x i16> undef, ptr undef, align 2
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <2 x i32> undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <8 x i16> undef, ptr undef, align 2
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <16 x i8> undef, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <4 x float> undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: store <4 x double> undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <2 x float> undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> undef, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> undef, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <8 x i16> undef, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: store <4 x float> undef, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <2 x double> undef, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   store i8 undef, ptr undef, align 4
   store i16 undef, ptr undef, align 4
@@ -199,160 +145,108 @@ define void @stores() {
 
 define void @loads() {
 ; CHECK-NOVEC-LABEL: 'loads'
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-NOVEC-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-NOVEC-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-FP-LABEL: 'loads'
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-FP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-FP-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
 ; CHECK-MVE-LABEL: 'loads'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-NEON-LABEL: 'loads'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-V8-SIZE-LABEL: 'loads'
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-V8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'loads'
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load i64, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load i128, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i32>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = load <8 x i16>, ptr undef, align 2
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load <16 x i8>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x float>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x double>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <2 x float>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <2 x double>, ptr undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <2 x i64>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = load <4 x i32>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <4 x float>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <2 x double>, ptr undef, align 1
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %1 = load i8, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %2 = load i16, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %3 = load i32, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %4 = load i64, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %5 = load i128, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %6 = load float, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %7 = load double, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %8 = load <2 x i8>, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %9 = load <2 x i16>, ptr undef, align 2
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %10 = load <2 x i32>, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %11 = load <2 x i64>, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %12 = load <4 x i32>, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %13 = load <8 x i16>, ptr undef, align 2
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %14 = load <16 x i8>, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %15 = load <4 x float>, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:1 Lat:4 SizeLat:1 for: %16 = load <4 x double>, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %17 = load <2 x float>, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %18 = load <2 x double>, ptr undef, align 4
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %19 = load <2 x i64>, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %20 = load <4 x i32>, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %21 = load <8 x i16>, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %22 = load <4 x float>, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %23 = load <2 x double>, ptr undef, align 1
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   load i8, ptr undef, align 4
   load i16, ptr undef, align 4
diff --git a/llvm/test/Analysis/CostModel/ARM/logicalop.ll b/llvm/test/Analysis/CostModel/ARM/logicalop.ll
index 967426c..82eb716 100644
--- a/llvm/test/Analysis/CostModel/ARM/logicalop.ll
+++ b/llvm/test/Analysis/CostModel/ARM/logicalop.ll
@@ -1,72 +1,40 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-RECIP
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-RECIP
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-RECIP
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-SIZE
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-SIZE
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-SIZE
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @op() {
   ; Logical and/or - select's cost must be equivalent to that of binop
-; CHECK-MVE-RECIP-LABEL: 'op'
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-NEON-RECIP-LABEL: 'op'
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-THUMB1-RECIP-LABEL: 'op'
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-RECIP-LABEL: 'op'
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'op'
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-NEON-SIZE-LABEL: 'op'
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB1-SIZE-LABEL: 'op'
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'op'
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sand = select i1 undef, i1 undef, i1 false
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %band = and i1 undef, undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sor = select i1 undef, i1 true, i1 undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bor = or i1 undef, undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-MVE-LABEL: 'op'
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-NEON-LABEL: 'op'
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-THUMB1-LABEL: 'op'
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of 1 for: ret void
+;
+; CHECK-THUMB2-LABEL: 'op'
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %sand = select i1 undef, i1 undef, i1 false
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %band = and i1 undef, undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %sor = select i1 undef, i1 true, i1 undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %bor = or i1 undef, undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
   %sand = select i1 undef, i1 undef, i1 false
   %band = and i1 undef, undef
@@ -77,61 +45,33 @@ define void @op() {
 }
 
 define void @vecop() {
-; CHECK-MVE-RECIP-LABEL: 'vecop'
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-NEON-RECIP-LABEL: 'vecop'
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-THUMB1-RECIP-LABEL: 'vecop'
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-RECIP-LABEL: 'vecop'
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'vecop'
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-NEON-SIZE-LABEL: 'vecop'
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB1-SIZE-LABEL: 'vecop'
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'vecop'
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %band = and <4 x i1> undef, undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bor = or <4 x i1> undef, undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-MVE-LABEL: 'vecop'
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %band = and <4 x i1> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %bor = or <4 x i1> undef, undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-NEON-LABEL: 'vecop'
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %band = and <4 x i1> undef, undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %bor = or <4 x i1> undef, undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-THUMB1-LABEL: 'vecop'
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of 4 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of 4 for: %band = and <4 x i1> undef, undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of 4 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of 4 for: %bor = or <4 x i1> undef, undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of 1 for: ret void
+;
+; CHECK-THUMB2-LABEL: 'vecop'
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of 4 for: %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> zeroinitializer
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of 4 for: %band = and <4 x i1> undef, undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of 4 for: %sor = select <4 x i1> undef, <4 x i1> splat (i1 true), <4 x i1> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of 4 for: %bor = or <4 x i1> undef, undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
   %sand = select <4 x i1> undef, <4 x i1> undef, <4 x i1> <i1 false, i1 false, i1 false, i1 false>
   %band = and <4 x i1> undef, undef
diff --git a/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll
index 17d4263..07d2bd0 100644
--- a/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mul-cast-vect.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
 ; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
 ; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM
 
@@ -17,11 +18,11 @@ target triple = "armv7--linux-gnueabihf"
 
 define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'direct'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T432, ptr %loadaddr
 ; ASM: vld1.64
@@ -36,13 +37,13 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'ups1632'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = sext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %r1, %r2
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %r1, %r2
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T416, ptr %loadaddr
 ; ASM: vldr
@@ -59,13 +60,13 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'upu1632'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = zext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %r1, %r2
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %r1, %r2
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T416, ptr %loadaddr
 ; ASM: vldr
@@ -82,12 +83,12 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'ups3264'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = mul <2 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %st = sext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = mul <2 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T232, ptr %loadaddr
 ; ASM: vldr
@@ -104,12 +105,12 @@ define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'upu3264'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = mul <2 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %st = zext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = mul <2 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T232, ptr %loadaddr
 ; ASM: vldr
@@ -126,12 +127,12 @@ define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'dn3216'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = mul <4 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %st = trunc <4 x i32> %r3 to <4 x i16>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> %st, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = mul <4 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T432, ptr %loadaddr
 ; ASM: vld1.64
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
index 7de2799..3ae02cd 100644
--- a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
@@ -1,21 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp  < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp  < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
 
 define i64 @test(i16 %a, i16 %b) {
 ; CHECK-LABEL: 'test'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret i64 %ms
 ;
 ; CHECK-NO-DSP-LABEL: 'test'
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: ret i64 %ms
 ;
     %as = sext i16 %a to i32
     %bs = sext i16 %b to i32
@@ -26,20 +26,20 @@ define i64 @test(i16 %a, i16 %b) {
 
 define i64 @withadd(i16 %a, i16 %b, i64 %c) {
 ; CHECK-LABEL: 'withadd'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret i64 %r
 ;
 ; CHECK-NO-DSP-LABEL: 'withadd'
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: ret i64 %r
 ;
     %as = sext i16 %a to i32
     %bs = sext i16 %b to i32
@@ -51,24 +51,24 @@ define i64 @withadd(i16 %a, i16 %b, i64 %c) {
 
 define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
 ; CHECK-LABEL: 'withloads'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %as = sext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %bs = sext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret i64 %r
 ;
 ; CHECK-NO-DSP-LABEL: 'withloads'
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 0 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 0 for: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: ret i64 %r
 ;
     %a = load i16, ptr %pa
     %b = load i16, ptr %pb
@@ -82,18 +82,18 @@ define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
 
 define i64 @different_extend_ops(i16 %a, i16 %b) {
 ; CHECK-LABEL: 'different_extend_ops'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret i64 %ms
 ;
 ; CHECK-NO-DSP-LABEL: 'different_extend_ops'
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: ret i64 %ms
 ;
     %as = sext i16 %a to i32
     %bs = zext i16 %b to i32
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
index 521816d13..04a9520 100644
--- a/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
@@ -1,20 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp  < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp  < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+
 define i64 @test(i16 %a, i16 %b) {
 ; CHECK-LABEL: 'test'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret i64 %ms
 ;
 ; CHECK-NO-DSP-LABEL: 'test'
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: ret i64 %ms
 ;
     %as = zext i16 %a to i32
     %bs = zext i16 %b to i32
@@ -25,20 +26,20 @@ define i64 @test(i16 %a, i16 %b) {
 
 define i64 @withadd(i16 %a, i16 %b, i64 %c) {
 ; CHECK-LABEL: 'withadd'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret i64 %r
 ;
 ; CHECK-NO-DSP-LABEL: 'withadd'
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: ret i64 %r
 ;
     %as = zext i16 %a to i32
     %bs = zext i16 %b to i32
@@ -50,24 +51,24 @@ define i64 @withadd(i16 %a, i16 %b, i64 %c) {
 
 define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
 ; CHECK-LABEL: 'withloads'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %as = zext i16 %a to i32
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %bs = zext i16 %b to i32
+; CHECK-NEXT:  Cost Model: Found costs of 0 for: %m = mul i32 %as, %bs
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NEXT:  Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: ret i64 %r
 ;
 ; CHECK-NO-DSP-LABEL: 'withloads'
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
-; CHECK-NO-DSP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 0 for: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 0 for: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 2 for: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT:  Cost Model: Found costs of 1 for: ret i64 %r
 ;
     %a = load i16, ptr %pa
     %b = load i16, ptr %pb
diff --git a/llvm/test/Analysis/CostModel/ARM/select.ll b/llvm/test/Analysis/CostModel/ARM/select.ll
index f626901..de429fd2 100644
--- a/llvm/test/Analysis/CostModel/ARM/select.ll
+++ b/llvm/test/Analysis/CostModel/ARM/select.ll
@@ -1,272 +1,140 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-RECIP
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-RECIP
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-RECIP
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-SIZE
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-SIZE
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-SIZE
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
+; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1
+; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @selects() {
   ; Scalar values
-; CHECK-MVE-RECIP-LABEL: 'selects'
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-LABEL: 'selects'
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:10 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
-; CHECK-NEON-RECIP-LABEL: 'selects'
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-LABEL: 'selects'
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 4 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 19 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 50 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 100 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 4 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
-; CHECK-THUMB1-RECIP-LABEL: 'selects'
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-THUMB1-LABEL: 'selects'
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:16 SizeLat:16 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:40 Lat:32 SizeLat:32 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:80 Lat:64 SizeLat:64 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:9 Lat:6 SizeLat:6 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:25 Lat:20 SizeLat:20 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB1-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
-; CHECK-THUMB2-RECIP-LABEL: 'selects'
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-MVE-SIZE-LABEL: 'selects'
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-NEON-SIZE-LABEL: 'selects'
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB1-SIZE-LABEL: 'selects'
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'selects'
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = select i1 undef, i1 undef, i1 undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-THUMB2-LABEL: 'selects'
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:3 Lat:1 SizeLat:1 for: %v0 = select i1 undef, i1 undef, i1 undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:24 Lat:16 SizeLat:16 for: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:48 Lat:32 SizeLat:32 for: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:16 SizeLat:16 for: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:40 Lat:32 SizeLat:32 for: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:80 Lat:64 SizeLat:64 for: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:4 for: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:12 Lat:8 SizeLat:8 for: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:8 SizeLat:8 for: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:9 Lat:6 SizeLat:6 for: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:25 Lat:20 SizeLat:20 for: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB2-NEXT:  Cost Model: Found costs of 1 for: ret void
 ;
   %v0 = select i1 undef, i1 undef, i1 undef
   %v1 = select i1 undef, i8 undef, i8 undef
diff --git a/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll
index 465611d..5ef869e 100644
--- a/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/shl-cast-vect.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
 ; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
 ; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM
 
@@ -17,11 +18,11 @@ target triple = "armv7--linux-gnueabihf"
 
 define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'direct'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T432, ptr %loadaddr
 ; ASM: vld1.64
@@ -36,13 +37,13 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'ups1632'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = sext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %r1, %r2
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %r1, %r2
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T416, ptr %loadaddr
 ; ASM: vldr
@@ -59,13 +60,13 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'upu1632'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = zext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %r1, %r2
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %r1, %r2
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T416, ptr %loadaddr
 ; ASM: vldr
@@ -82,12 +83,12 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'ups3264'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <2 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %st = sext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <2 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T232, ptr %loadaddr
 ; ASM: vldr
@@ -104,12 +105,12 @@ define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'upu3264'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <2 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %st = zext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <2 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T232, ptr %loadaddr
 ; ASM: vldr
@@ -126,12 +127,12 @@ define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'dn3216'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r3 = shl <4 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %st = trunc <4 x i32> %r3 to <4 x i16>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> %st, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r3 = shl <4 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T432, ptr %loadaddr
 ; ASM: vld1.64
diff --git a/llvm/test/Analysis/CostModel/ARM/shuffle.ll b/llvm/test/Analysis/CostModel/ARM/shuffle.ll
index a17bbab..0fd1456 100644
--- a/llvm/test/Analysis/CostModel/ARM/shuffle.ll
+++ b/llvm/test/Analysis/CostModel/ARM/shuffle.ll
@@ -1,59 +1,59 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @broadcast() {
 ; CHECK-MVE-LABEL: 'broadcast'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:12 Lat:24 SizeLat:24 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:40 Lat:80 SizeLat:80 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:5 Lat:10 SizeLat:10 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-NEON-LABEL: 'broadcast'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 8 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 14 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 26 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 50 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
@@ -90,58 +90,58 @@ define void @broadcast() {
 ;; Reverse shuffles should be lowered to vrev and possibly a vext (for quadwords, on neon)
 define void @reverse() {
 ; CHECK-MVE-LABEL: 'reverse'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:24 Lat:48 SizeLat:48 for: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:192 CodeSize:96 Lat:192 SizeLat:192 for: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-NEON-LABEL: 'reverse'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 4 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 10 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 20 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v3i24 = shufflevector <3 x i24> undef, <3 x i24> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 18 for: %v3i124 = shufflevector <3 x i124> undef, <3 x i124> undef, <3 x i32> <i32 2, i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -179,40 +179,40 @@ define void @reverse() {
 
 define void @concat() {
 ; CHECK-MVE-LABEL: 'concat'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-NEON-LABEL: 'concat'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 12 for: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 24 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 48 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 12 for: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 24 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 48 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 12 for: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 24 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 12 for: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 10 for: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 20 for: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 40 for: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 10 for: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 20 for: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 4 for: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -241,54 +241,54 @@ define void @concat() {
 
 define void @select() {
 ; CHECK-MVE-LABEL: 'select'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-NEON-LABEL: 'select'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 6 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 48 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 32 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 16 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 32 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 10 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 20 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -324,40 +324,40 @@ define void @select() {
 
 define void @vrev2() {
 ; CHECK-MVE-LABEL: 'vrev2'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-NEON-LABEL: 'vrev2'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 24 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 48 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 96 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 24 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 48 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 96 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 24 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 48 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 24 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 20 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 20 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 40 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 8 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -386,26 +386,26 @@ define void @vrev2() {
 
 define void @vrev4() {
 ; CHECK-MVE-LABEL: 'vrev4'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:256 CodeSize:128 Lat:256 SizeLat:256 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-MVE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-NEON-LABEL: 'vrev4'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 48 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 96 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 48 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 96 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 48 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 40 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 80 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of 40 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEON-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
   %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
diff --git a/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll b/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll
index df26121..924a629 100644
--- a/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll
+++ b/llvm/test/Analysis/CostModel/ARM/sub-cast-vect.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s  -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; RUN: opt < %s  -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+
 ; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
 ; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM
 
@@ -17,11 +18,11 @@ target triple = "armv7--linux-gnueabihf"
 
 define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'direct'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T432, ptr %loadaddr
 ; ASM: vld1.64
@@ -36,13 +37,13 @@ define void @direct(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'ups1632'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = sext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %r1, %r2
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r1 = sext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r2 = sext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %r1, %r2
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T416, ptr %loadaddr
 ; ASM: vldr
@@ -59,13 +60,13 @@ define void @ups1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'upu1632'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i16>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext <4 x i16> %v0 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = zext <4 x i16> %v1 to <4 x i32>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %r1, %r2
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> %r3, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i16>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i16>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r1 = zext <4 x i16> %v0 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 0 for: %r2 = zext <4 x i16> %v1 to <4 x i32>
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %r1, %r2
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i32> %r3, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T416, ptr %loadaddr
 ; ASM: vldr
@@ -82,12 +83,12 @@ define void @upu1632(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'ups3264'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <2 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %st = sext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = sub <2 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = sext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T232, ptr %loadaddr
 ; ASM: vldr
@@ -104,12 +105,12 @@ define void @ups3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'upu3264'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <2 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <2 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %st = zext <2 x i32> %r3 to <2 x i64>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> %st, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <2 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <2 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = sub <2 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = zext <2 x i32> %r3 to <2 x i64>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <2 x i64> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T232, ptr %loadaddr
 ; ASM: vldr
@@ -126,12 +127,12 @@ define void @upu3264(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 
 define void @dn3216(ptr %loadaddr, ptr %loadaddr2, ptr %storeaddr) {
 ; COST-LABEL: 'dn3216'
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = load <4 x i32>, ptr %loadaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = sub <4 x i32> %v0, %v1
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %st = trunc <4 x i32> %r3 to <4 x i16>
-; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> %st, ptr %storeaddr, align 8
-; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v0 = load <4 x i32>, ptr %loadaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %v1 = load <4 x i32>, ptr %loadaddr2, align 8
+; COST-NEXT:  Cost Model: Found costs of 1 for: %r3 = sub <4 x i32> %v0, %v1
+; COST-NEXT:  Cost Model: Found costs of 1 for: %st = trunc <4 x i32> %r3 to <4 x i16>
+; COST-NEXT:  Cost Model: Found costs of 1 for: store <4 x i16> %st, ptr %storeaddr, align 8
+; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v0 = load %T432, ptr %loadaddr
 ; ASM: vld1.64
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
index b3a7057..c551375 100644
--- a/llvm/test/CodeGen/AMDGPU/add-max.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -7,7 +7,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_u32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -18,39 +18,38 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_u32 v0, s0, v0, v1
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
 }
 
 define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
-; SDAG-LABEL: add_max_u32_ssv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_add_max_u32 v0, s0, s1, v0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_ssv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_add_co_i32 s0, s0, s1
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+; GCN-LABEL: add_max_u32_ssv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_add_max_u32 v0, s0, s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
 }
 
 define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
-; GCN-LABEL: add_max_u32_sss:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_co_i32 s0, s0, s1
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_max_u32 s0, s0, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+; SDAG-LABEL: add_max_u32_sss:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_add_nc_u32_e64 v0, s0, s1 clamp
+; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT:    v_max_u32_e32 v0, s2, v0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_u32_sss:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_add_max_u32 v0, s0, s1, v0
+; GISEL-NEXT:    ; return to shader part epilog
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -61,7 +60,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_u32 v0, v0, s0, 4
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 4)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -72,26 +71,19 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_u32 v0, s0, v0, 0x64
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 100)
   %ret = bitcast i32 %max to float
   ret float %ret
 }
 
-define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
-; SDAG-LABEL: add_max_u32_slv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_add_max_u32 v0, 0x64, s0, v0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_slv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_addk_co_i32 s0, 0x64
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, 100
-  %max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
+define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_slv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_add_max_u32 v0, s0, v0, v1
+; GCN-NEXT:    ; return to shader part epilog
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
+  %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
 }
@@ -101,7 +93,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_i32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -112,7 +104,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_min_u32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umin.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -123,7 +115,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_min_i32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.smin.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -134,7 +126,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_u16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -145,29 +137,18 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_u16 v0, s0, v0, v1
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
 }
 
 define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
-; SDAG-LABEL: add_max_v2u16_ssv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_pk_add_max_u16 v0, s0, s1, v0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_ssv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_lshr_b32 s2, s0, 16
-; GISEL-NEXT:    s_lshr_b32 s3, s1, 16
-; GISEL-NEXT:    s_add_co_i32 s0, s0, s1
-; GISEL-NEXT:    s_add_co_i32 s2, s2, s3
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
-; GISEL-NEXT:    v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+; GCN-LABEL: add_max_v2u16_ssv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_pk_add_max_u16 v0, s0, s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -176,30 +157,18 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b
 define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
 ; SDAG-LABEL: add_max_v2u16_sss:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_pk_add_u16 v0, s0, s1
+; SDAG-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-NEXT:    v_pk_max_u16 v0, v0, s2
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GISEL-LABEL: add_max_v2u16_sss:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_lshr_b32 s3, s0, 16
-; GISEL-NEXT:    s_lshr_b32 s4, s1, 16
-; GISEL-NEXT:    s_add_co_i32 s0, s0, s1
-; GISEL-NEXT:    s_add_co_i32 s3, s3, s4
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GISEL-NEXT:    s_and_b32 s3, s2, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s1, s0, 16
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s2, s2, 16
-; GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GISEL-NEXT:    s_max_u32 s1, s1, s2
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_pk_add_max_u16 v0, s0, s1, v0
 ; GISEL-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -210,7 +179,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_u16 v0, v0, s0, 4
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -221,28 +190,18 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_u16 v0, s0, v0, 0x650064
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
 }
 
 define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
-; SDAG-LABEL: add_max_v2u16_slv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_pk_add_max_u16 v0, 0x640064, s0, v0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_slv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_lshr_b32 s1, s0, 16
-; GISEL-NEXT:    s_add_co_i32 s0, s0, 0x640064
-; GISEL-NEXT:    s_addk_co_i32 s1, 0x64
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT:    v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, <i16 100, i16 100>
+; GCN-LABEL: add_max_v2u16_slv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_pk_add_max_u16 v0, 0x640064, s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> <i16 100, i16 100>)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %b)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -253,7 +212,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_i16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.smax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -264,7 +223,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_min_u16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umin.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -275,7 +234,7 @@ define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_min_i16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.smin.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 711d57b..30ad46d9 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -39131,21 +39131,21 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_xor_b32_e32 v2, v0, v1
 ; GFX1250-NEXT:    v_cls_i32_e32 v3, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_add_nc_u32 v3, -1, v3 :: v_dual_bitop2_b32 v2, v0, v1 bitop3:0x14
 ; GFX1250-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX1250-NEXT:    v_min_u32_e32 v2, v3, v2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_add_min_u32 v2, v3, -1, v2
 ; GFX1250-NEXT:    v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT:    v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
 ; GFX1250-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %op = sitofp i64 %x to bfloat
@@ -39483,29 +39483,30 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX1250-NEXT:    v_xor_b32_e32 v4, v2, v3
 ; GFX1250-NEXT:    v_cls_i32_e32 v6, v3
 ; GFX1250-NEXT:    v_cls_i32_e32 v7, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4
+; GFX1250-NEXT:    v_dual_add_nc_u32 v6, -1, v6 :: v_dual_add_nc_u32 v7, -1, v7
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT:    v_add_min_u32 v5, v7, -1, v5
-; GFX1250-NEXT:    v_add_min_u32 v4, v6, -1, v4
+; GFX1250-NEXT:    v_min_u32_e32 v5, v7, v5
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_min_u32_e32 v4, v6, v4
 ; GFX1250-NEXT:    v_lshlrev_b64_e32 v[0:1], v5, v[0:1]
-; GFX1250-NEXT:    v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
 ; GFX1250-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX1250-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
 ; GFX1250-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %op = sitofp <2 x i64> %x to <2 x bfloat>
@@ -39968,41 +39969,42 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX1250TRUE16:       ; %bb.0:
 ; GFX1250TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT:    v_xor_b32_e32 v7, v2, v3
-; GFX1250TRUE16-NEXT:    v_xor_b32_e32 v6, v4, v5
+; GFX1250TRUE16-NEXT:    v_cls_i32_e32 v6, v5
+; GFX1250TRUE16-NEXT:    v_xor_b32_e32 v7, v4, v5
 ; GFX1250TRUE16-NEXT:    v_cls_i32_e32 v10, v3
-; GFX1250TRUE16-NEXT:    v_cls_i32_e32 v9, v5
 ; GFX1250TRUE16-NEXT:    v_cls_i32_e32 v11, v1
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT:    v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_ashrrev_i32 v6, 31, v6
-; GFX1250TRUE16-NEXT:    v_xor_b32_e32 v8, v0, v1
-; GFX1250TRUE16-NEXT:    v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; GFX1250TRUE16-NEXT:    v_add_min_u32 v7, v10, -1, v7
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT:    v_add_min_u32 v6, v9, -1, v6
-; GFX1250TRUE16-NEXT:    v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT:    v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
-; GFX1250TRUE16-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX1250TRUE16-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250TRUE16-NEXT:    v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14
+; GFX1250TRUE16-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250TRUE16-NEXT:    v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_bitop2_b32 v8, v2, v3 bitop3:0x14
+; GFX1250TRUE16-NEXT:    v_dual_add_nc_u32 v7, 32, v7 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250TRUE16-NEXT:    v_dual_add_nc_u32 v10, -1, v10 :: v_dual_add_nc_u32 v11, -1, v11
 ; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX1250TRUE16-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250TRUE16-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1250TRUE16-NEXT:    v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
+; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250TRUE16-NEXT:    v_min_u32_e32 v7, v10, v8
+; GFX1250TRUE16-NEXT:    v_min_u32_e32 v8, v11, v9
+; GFX1250TRUE16-NEXT:    v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
+; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT:    v_lshlrev_b64_e32 v[0:1], v8, v[0:1]
 ; GFX1250TRUE16-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX1250TRUE16-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT:    v_add_min_u32 v8, v11, -1, v8
-; GFX1250TRUE16-NEXT:    v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
+; GFX1250TRUE16-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX1250TRUE16-NEXT:    v_lshlrev_b64_e32 v[0:1], v8, v[0:1]
-; GFX1250TRUE16-NEXT:    v_sub_nc_u32_e32 v5, 32, v8
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250TRUE16-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX1250TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX1250TRUE16-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT:    v_dual_sub_nc_u32 v5, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
+; GFX1250TRUE16-NEXT:    v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
+; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX1250TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v4
 ; GFX1250TRUE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX1250TRUE16-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -40017,44 +40019,47 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX1250FAKE16:       ; %bb.0:
 ; GFX1250FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GFX1250FAKE16-NEXT:    v_xor_b32_e32 v6, v2, v3
+; GFX1250FAKE16-NEXT:    v_cls_i32_e32 v6, v5
+; GFX1250FAKE16-NEXT:    v_xor_b32_e32 v7, v2, v3
 ; GFX1250FAKE16-NEXT:    v_cls_i32_e32 v10, v3
-; GFX1250FAKE16-NEXT:    v_cls_i32_e32 v9, v5
 ; GFX1250FAKE16-NEXT:    v_cls_i32_e32 v11, v1
-; GFX1250FAKE16-NEXT:    v_dual_ashrrev_i32 v8, 31, v8 :: v_dual_bitop2_b32 v7, v0, v1 bitop3:0x14
-; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250FAKE16-NEXT:    v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7
-; GFX1250FAKE16-NEXT:    v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7
-; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT:    v_add_min_u32 v6, v10, -1, v6
-; GFX1250FAKE16-NEXT:    v_add_min_u32 v7, v11, -1, v7
+; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT:    v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v8, v4, v5 bitop3:0x14
+; GFX1250FAKE16-NEXT:    v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14
 ; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT:    v_lshlrev_b64_e32 v[2:3], v6, v[2:3]
-; GFX1250FAKE16-NEXT:    v_lshlrev_b64_e32 v[0:1], v7, v[0:1]
-; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX1250FAKE16-NEXT:    v_add_nc_u32_e32 v8, 32, v8
-; GFX1250FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX1250FAKE16-NEXT:    v_dual_add_nc_u32 v10, -1, v10 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250FAKE16-NEXT:    v_dual_add_nc_u32 v11, -1, v11 :: v_dual_ashrrev_i32 v9, 31, v9
+; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250FAKE16-NEXT:    v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v8, 32, v8
+; GFX1250FAKE16-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT:    v_min_u32_e32 v7, v10, v7
+; GFX1250FAKE16-NEXT:    v_min_u32_e32 v6, v6, v8
 ; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX1250FAKE16-NEXT:    v_add_min_u32 v8, v9, -1, v8
+; GFX1250FAKE16-NEXT:    v_min_u32_e32 v9, v11, v9
+; GFX1250FAKE16-NEXT:    v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
 ; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT:    v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
-; GFX1250FAKE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT:    v_lshlrev_b64_e32 v[4:5], v8, v[4:5]
-; GFX1250FAKE16-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
-; GFX1250FAKE16-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250FAKE16-NEXT:    v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
+; GFX1250FAKE16-NEXT:    v_lshlrev_b64_e32 v[0:1], v9, v[0:1]
+; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX1250FAKE16-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX1250FAKE16-NEXT:    v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX1250FAKE16-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
+; GFX1250FAKE16-NEXT:    v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
+; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX1250FAKE16-NEXT:    v_dual_sub_nc_u32 v4, 32, v9 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT:    v_ldexp_f32 v2, v2, v3
 ; GFX1250FAKE16-NEXT:    v_ldexp_f32 v0, v0, v4
-; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT:    v_ldexp_f32 v1, v1, v8
+; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250FAKE16-NEXT:    v_ldexp_f32 v1, v1, v6
 ; GFX1250FAKE16-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v2
 ; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1250FAKE16-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
@@ -40644,51 +40649,54 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_xor_b32_e32 v9, v4, v5
+; GFX1250-NEXT:    v_cls_i32_e32 v9, v7
 ; GFX1250-NEXT:    v_xor_b32_e32 v8, v6, v7
-; GFX1250-NEXT:    v_cls_i32_e32 v12, v7
-; GFX1250-NEXT:    v_cls_i32_e32 v13, v5
-; GFX1250-NEXT:    v_cls_i32_e32 v14, v3
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1250-NEXT:    v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_ashrrev_i32 v8, 31, v8
-; GFX1250-NEXT:    v_xor_b32_e32 v10, v2, v3
-; GFX1250-NEXT:    v_cls_i32_e32 v15, v1
-; GFX1250-NEXT:    v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT:    v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14
-; GFX1250-NEXT:    v_add_min_u32 v9, v13, -1, v9
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT:    v_add_min_u32 v8, v12, -1, v8
-; GFX1250-NEXT:    v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT:    v_lshlrev_b64_e32 v[4:5], v9, v[4:5]
-; GFX1250-NEXT:    v_lshlrev_b64_e32 v[6:7], v8, v[6:7]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT:    v_add_nc_u32_e32 v11, 32, v11
-; GFX1250-NEXT:    v_add_min_u32 v10, v14, -1, v10
+; GFX1250-NEXT:    v_cls_i32_e32 v10, v5
+; GFX1250-NEXT:    v_xor_b32_e32 v14, v0, v1
+; GFX1250-NEXT:    v_cls_i32_e32 v12, v3
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT:    v_dual_add_nc_u32 v9, -1, v9 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250-NEXT:    v_dual_add_nc_u32 v10, -1, v10 :: v_dual_bitop2_b32 v11, v4, v5 bitop3:0x14
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT:    v_add_min_u32 v11, v15, -1, v11
-; GFX1250-NEXT:    v_lshlrev_b64_e32 v[2:3], v10, v[2:3]
-; GFX1250-NEXT:    v_min_u32_e32 v6, 1, v6
-; GFX1250-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX1250-NEXT:    v_dual_add_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v13, v2, v3 bitop3:0x14
+; GFX1250-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_min_u32_e32 v8, v9, v8
+; GFX1250-NEXT:    v_ashrrev_i32_e32 v9, 31, v13
+; GFX1250-NEXT:    v_cls_i32_e32 v13, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_dual_ashrrev_i32 v14, 31, v14 :: v_dual_add_nc_u32 v11, 32, v11
+; GFX1250-NEXT:    v_dual_add_nc_u32 v12, -1, v12 :: v_dual_add_nc_u32 v9, 32, v9
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_dual_add_nc_u32 v13, -1, v13 :: v_dual_add_nc_u32 v14, 32, v14
+; GFX1250-NEXT:    v_min_u32_e32 v10, v10, v11
+; GFX1250-NEXT:    v_lshlrev_b64_e32 v[6:7], v8, v[6:7]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT:    v_min_u32_e32 v9, v12, v9
+; GFX1250-NEXT:    v_min_u32_e32 v11, v13, v14
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_lshlrev_b64_e32 v[4:5], v10, v[4:5]
+; GFX1250-NEXT:    v_lshlrev_b64_e32 v[2:3], v9, v[2:3]
+; GFX1250-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_lshlrev_b64_e32 v[0:1], v11, v[0:1]
+; GFX1250-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_or_b32_e32 v6, v7, v6
 ; GFX1250-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT:    v_dual_sub_nc_u32 v5, 32, v10 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
-; GFX1250-NEXT:    v_sub_nc_u32_e32 v7, 32, v9
+; GFX1250-NEXT:    v_sub_nc_u32_e32 v7, 32, v10
 ; GFX1250-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT:    v_dual_sub_nc_u32 v3, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX1250-NEXT:    v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
+; GFX1250-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX1250-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX1250-NEXT:    v_cvt_f32_i32_e32 v1, v6
+; GFX1250-NEXT:    v_sub_nc_u32_e32 v1, 32, v8
+; GFX1250-NEXT:    v_cvt_f32_i32_e32 v3, v6
+; GFX1250-NEXT:    v_cvt_f32_i32_e32 v4, v4
 ; GFX1250-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
 ; GFX1250-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX1250-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX1250-NEXT:    v_ldexp_f32 v1, v3, v1
 ; GFX1250-NEXT:    v_ldexp_f32 v3, v4, v7
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_ldexp_f32 v2, v2, v5
diff --git a/llvm/test/CodeGen/AVR/dynalloca.ll b/llvm/test/CodeGen/AVR/dynalloca.ll
index 3face71..b32910b 100644
--- a/llvm/test/CodeGen/AVR/dynalloca.ll
+++ b/llvm/test/CodeGen/AVR/dynalloca.ll
@@ -64,16 +64,16 @@ define void @dynalloca2(i16 %x) {
 ; CHECK-NEXT: out 63, r0
 ; CHECK-NEXT: out 61, {{.*}}
 ; Store values on the stack
-; CHECK: ldi r20, 0
-; CHECK: ldi r21, 0
-; CHECK: std Z+8, r21
-; CHECK: std Z+7, r20
-; CHECK: std Z+6, r21
-; CHECK: std Z+5, r20
-; CHECK: std Z+4, r21
-; CHECK: std Z+3, r20
-; CHECK: std Z+2, r21
-; CHECK: std Z+1, r20
+; CHECK: ldi [[REG1:r[0-9]+]], 0
+; CHECK: ldi [[REG2:r[0-9]+]], 0
+; CHECK: std Z+8, [[REG2]]
+; CHECK: std Z+7, [[REG1]]
+; CHECK: std Z+6, [[REG2]]
+; CHECK: std Z+5, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
+; CHECK: std Z+1, [[REG1]]
 ; CHECK: call
 ; Call frame restore
 ; CHECK-NEXT: in r30, 61
diff --git a/llvm/test/CodeGen/AVR/issue-163015.ll b/llvm/test/CodeGen/AVR/issue-163015.ll
new file mode 100644
index 0000000..6c4dc51
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/issue-163015.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mtriple=avr | FileCheck %s
+
+@ui1 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@ui2 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@failed = private unnamed_addr addrspace(1) constant [12 x i8] c"test failed\00"
+@stats2 = external protected global i16, align 1
+
+; CHECK-LABEL: main:
+define i32 @main() addrspace(1) {
+entry:
+  store i64 94, ptr @ui1, align 8
+  store i64 53, ptr @ui2, align 8
+  tail call addrspace(1) void @foo(i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 32, ptr @stats2)
+  %11 = load i64, ptr @ui1, align 8
+  %12 = load i64, ptr @ui2, align 8
+
+; COM: CHECK: call __udivdi3
+  %15 = udiv i64 %11, %12
+
+; look for the buggy pattern where r30/r31 are being clobbered, corrupting the stack pointer
+; CHECK-NOT: std  Z+{{[1-9]+}}, r30 
+; CHECK-NOT: std  Z+{{[1-9]+}}, r31
+
+; CHECK: call expect
+  tail call addrspace(1) void @expect(i64 %15, i64 1, i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 33)
+
+; CHECK: ret
+  ret i32 0
+}
+
+declare protected void @expect(i64, i64, i16, i16, i8, i16) local_unnamed_addr addrspace(1) #0
+declare protected void @foo(i16, i16, i8, i16, i16) local_unnamed_addr addrspace(1) #0
diff --git a/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll b/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
index f82cd11..f12fc4a8 100644
--- a/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
+++ b/llvm/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
@@ -23,6 +23,6 @@ entry:
 
 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) #1
 
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir b/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir
index 64e569c..3d40240 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-legalize-sitofp.mir
@@ -63,7 +63,7 @@
     ret double %conv
   }
 
-  attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 
   !llvm.module.flags = !{!0}
   !llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir b/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir
index 0bb6061..8f76ad5 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-select-sitofp.mir
@@ -35,7 +35,7 @@
     ret double %conv
   }
 
-  attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 
   !llvm.module.flags = !{!0}
   !llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir b/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
index a992222..1612485 100644
--- a/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
+++ b/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
@@ -64,9 +64,9 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #3
 
-  attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "use-soft-float"="false" }
   attributes #1 = { argmemonly nounwind willreturn }
-  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "use-soft-float"="false" }
   attributes #3 = { nounwind }
 
   !llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/atom-fixup-lea4.ll b/llvm/test/CodeGen/X86/atom-fixup-lea4.ll
index 8e7a463..69689f0 100644
--- a/llvm/test/CodeGen/X86/atom-fixup-lea4.ll
+++ b/llvm/test/CodeGen/X86/atom-fixup-lea4.ll
@@ -18,5 +18,5 @@ entry:
 ; Function Attrs: uwtable
 declare void @_ZN12ValueWrapperIS_IS_IdEEEC2Ev(ptr) unnamed_addr #0 align 2
 
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 9fab8b9..3e7b73a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -270,6 +270,82 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
   ret <1 x i64> %ret
 }
 
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_ptr:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    pushq %rax
+; CHECK-O3-NEXT:    movq %rdi, %rsi
+; CHECK-O3-NEXT:    movq %rsp, %rdx
+; CHECK-O3-NEXT:    movl $8, %edi
+; CHECK-O3-NEXT:    movl $2, %ecx
+; CHECK-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-O3-NEXT:    movq (%rsp), %rax
+; CHECK-O3-NEXT:    popq %rcx
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    pushq %rax
+; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT:    movl $8, %edi
+; CHECK-SSE-O3-NEXT:    movl $2, %ecx
+; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT:    movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT:    popq %rcx
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    pushq %rax
+; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT:    movl $8, %edi
+; CHECK-AVX-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT:    movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT:    popq %rcx
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec1_ptr:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    pushq %rax
+; CHECK-O0-NEXT:    movq %rdi, %rsi
+; CHECK-O0-NEXT:    movl $8, %edi
+; CHECK-O0-NEXT:    movq %rsp, %rdx
+; CHECK-O0-NEXT:    movl $2, %ecx
+; CHECK-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-O0-NEXT:    movq (%rsp), %rax
+; CHECK-O0-NEXT:    popq %rcx
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    pushq %rax
+; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT:    movl $8, %edi
+; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT:    movl $2, %ecx
+; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT:    movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT:    popq %rcx
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    pushq %rax
+; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT:    movl $8, %edi
+; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT:    movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT:    popq %rcx
+; CHECK-AVX-O0-NEXT:    retq
+  %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+  ret <1 x ptr> %ret
+}
+
 define <1 x half> @atomic_vec1_half(ptr %x) {
 ; CHECK-O3-LABEL: atomic_vec1_half:
 ; CHECK-O3:       # %bb.0:
@@ -386,3 +462,515 @@ define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
   %ret = load atomic <1 x double>, ptr %x acquire, align 8
   ret <1 x double> %ret
 }
+
+define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_i64:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    pushq %rax
+; CHECK-O3-NEXT:    movq %rdi, %rsi
+; CHECK-O3-NEXT:    movq %rsp, %rdx
+; CHECK-O3-NEXT:    movl $8, %edi
+; CHECK-O3-NEXT:    movl $2, %ecx
+; CHECK-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-O3-NEXT:    movq (%rsp), %rax
+; CHECK-O3-NEXT:    popq %rcx
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    pushq %rax
+; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT:    movl $8, %edi
+; CHECK-SSE-O3-NEXT:    movl $2, %ecx
+; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT:    movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT:    popq %rcx
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    pushq %rax
+; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT:    movl $8, %edi
+; CHECK-AVX-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT:    movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT:    popq %rcx
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i64:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    pushq %rax
+; CHECK-O0-NEXT:    movq %rdi, %rsi
+; CHECK-O0-NEXT:    movl $8, %edi
+; CHECK-O0-NEXT:    movq %rsp, %rdx
+; CHECK-O0-NEXT:    movl $2, %ecx
+; CHECK-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-O0-NEXT:    movq (%rsp), %rax
+; CHECK-O0-NEXT:    popq %rcx
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    pushq %rax
+; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT:    movl $8, %edi
+; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT:    movl $2, %ecx
+; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT:    movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT:    popq %rcx
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    pushq %rax
+; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT:    movl $8, %edi
+; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT:    movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT:    popq %rcx
+; CHECK-AVX-O0-NEXT:    retq
+  %ret = load atomic <1 x i64>, ptr %x acquire, align 4
+  ret <1 x i64> %ret
+}
+
+define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_double:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    pushq %rax
+; CHECK-O3-NEXT:    movq %rdi, %rsi
+; CHECK-O3-NEXT:    movq %rsp, %rdx
+; CHECK-O3-NEXT:    movl $8, %edi
+; CHECK-O3-NEXT:    movl $2, %ecx
+; CHECK-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT:    popq %rax
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_double:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    pushq %rax
+; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT:    movl $8, %edi
+; CHECK-SSE-O3-NEXT:    movl $2, %ecx
+; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT:    popq %rax
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_double:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    pushq %rax
+; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT:    movl $8, %edi
+; CHECK-AVX-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT:    popq %rax
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec1_double:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    pushq %rax
+; CHECK-O0-NEXT:    movq %rdi, %rsi
+; CHECK-O0-NEXT:    movl $8, %edi
+; CHECK-O0-NEXT:    movq %rsp, %rdx
+; CHECK-O0-NEXT:    movl $2, %ecx
+; CHECK-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-O0-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT:    popq %rax
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_double:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    pushq %rax
+; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT:    movl $8, %edi
+; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT:    movl $2, %ecx
+; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT:    popq %rax
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_double:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    pushq %rax
+; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT:    movl $8, %edi
+; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT:    popq %rax
+; CHECK-AVX-O0-NEXT:    retq
+  %ret = load atomic <1 x double>, ptr %x acquire, align 4
+  ret <1 x double> %ret
+}
+
+define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec2_i32:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    pushq %rax
+; CHECK-O3-NEXT:    movq %rdi, %rsi
+; CHECK-O3-NEXT:    movq %rsp, %rdx
+; CHECK-O3-NEXT:    movl $8, %edi
+; CHECK-O3-NEXT:    movl $2, %ecx
+; CHECK-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT:    popq %rax
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    pushq %rax
+; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT:    movl $8, %edi
+; CHECK-SSE-O3-NEXT:    movl $2, %ecx
+; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT:    popq %rax
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    pushq %rax
+; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT:    movl $8, %edi
+; CHECK-AVX-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT:    popq %rax
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec2_i32:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    pushq %rax
+; CHECK-O0-NEXT:    movq %rdi, %rsi
+; CHECK-O0-NEXT:    movl $8, %edi
+; CHECK-O0-NEXT:    movq %rsp, %rdx
+; CHECK-O0-NEXT:    movl $2, %ecx
+; CHECK-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-O0-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT:    popq %rax
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    pushq %rax
+; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT:    movl $8, %edi
+; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT:    movl $2, %ecx
+; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT:    popq %rax
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    pushq %rax
+; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT:    movl $8, %edi
+; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT:    popq %rax
+; CHECK-AVX-O0-NEXT:    retq
+  %ret = load atomic <2 x i32>, ptr %x acquire, align 4
+  ret <2 x i32> %ret
+}
+
+define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_float:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    subq $24, %rsp
+; CHECK-O3-NEXT:    movq %rdi, %rsi
+; CHECK-O3-NEXT:    movq %rsp, %rdx
+; CHECK-O3-NEXT:    movl $16, %edi
+; CHECK-O3-NEXT:    movl $2, %ecx
+; CHECK-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
+; CHECK-O3-NEXT:    addq $24, %rsp
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_float:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    subq $24, %rsp
+; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT:    movl $16, %edi
+; CHECK-SSE-O3-NEXT:    movl $2, %ecx
+; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT:    movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT:    addq $24, %rsp
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_float:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    subq $24, %rsp
+; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT:    movl $16, %edi
+; CHECK-AVX-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT:    vmovaps (%rsp), %xmm0
+; CHECK-AVX-O3-NEXT:    addq $24, %rsp
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec4_float:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    subq $24, %rsp
+; CHECK-O0-NEXT:    movq %rdi, %rsi
+; CHECK-O0-NEXT:    movl $16, %edi
+; CHECK-O0-NEXT:    movq %rsp, %rdx
+; CHECK-O0-NEXT:    movl $2, %ecx
+; CHECK-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-O0-NEXT:    movaps (%rsp), %xmm0
+; CHECK-O0-NEXT:    addq $24, %rsp
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_float:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    subq $24, %rsp
+; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT:    movl $16, %edi
+; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT:    movl $2, %ecx
+; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT:    movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT:    addq $24, %rsp
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_float:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    subq $24, %rsp
+; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT:    movl $16, %edi
+; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT:    vmovaps (%rsp), %xmm0
+; CHECK-AVX-O0-NEXT:    addq $24, %rsp
+; CHECK-AVX-O0-NEXT:    retq
+  %ret = load atomic <4 x float>, ptr %x acquire, align 4
+  ret <4 x float> %ret
+}
+
+define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec8_double:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    subq $72, %rsp
+; CHECK-O3-NEXT:    movq %rdi, %rsi
+; CHECK-O3-NEXT:    movq %rsp, %rdx
+; CHECK-O3-NEXT:    movl $64, %edi
+; CHECK-O3-NEXT:    movl $2, %ecx
+; CHECK-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
+; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT:    addq $72, %rsp
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec8_double:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    subq $72, %rsp
+; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT:    movl $64, %edi
+; CHECK-SSE-O3-NEXT:    movl $2, %ecx
+; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT:    movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT:    addq $72, %rsp
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec8_double:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    subq $72, %rsp
+; CHECK-O0-NEXT:    movq %rdi, %rsi
+; CHECK-O0-NEXT:    movl $64, %edi
+; CHECK-O0-NEXT:    movq %rsp, %rdx
+; CHECK-O0-NEXT:    movl $2, %ecx
+; CHECK-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-O0-NEXT:    movapd (%rsp), %xmm0
+; CHECK-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT:    addq $72, %rsp
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec8_double:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    subq $72, %rsp
+; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT:    movl $64, %edi
+; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT:    movl $2, %ecx
+; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT:    movapd (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT:    addq $72, %rsp
+; CHECK-SSE-O0-NEXT:    retq
+  %ret = load atomic <8 x double>, ptr %x acquire, align 4
+  ret <8 x double> %ret
+}
+
+define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    subq $40, %rsp
+; CHECK-O3-NEXT:    movq %rdi, %rsi
+; CHECK-O3-NEXT:    movq %rsp, %rdx
+; CHECK-O3-NEXT:    movl $32, %edi
+; CHECK-O3-NEXT:    movl $2, %ecx
+; CHECK-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
+; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT:    addq $40, %rsp
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    subq $40, %rsp
+; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT:    movl $32, %edi
+; CHECK-SSE-O3-NEXT:    movl $2, %ecx
+; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT:    movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT:    addq $40, %rsp
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    subq $40, %rsp
+; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT:    movl $32, %edi
+; CHECK-AVX-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT:    vmovups (%rsp), %ymm0
+; CHECK-AVX-O3-NEXT:    addq $40, %rsp
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    subq $40, %rsp
+; CHECK-O0-NEXT:    movq %rdi, %rsi
+; CHECK-O0-NEXT:    movl $32, %edi
+; CHECK-O0-NEXT:    movq %rsp, %rdx
+; CHECK-O0-NEXT:    movl $2, %ecx
+; CHECK-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-O0-NEXT:    movaps (%rsp), %xmm0
+; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT:    addq $40, %rsp
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    subq $40, %rsp
+; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT:    movl $32, %edi
+; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT:    movl $2, %ecx
+; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT:    movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT:    addq $40, %rsp
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    subq $40, %rsp
+; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT:    movl $32, %edi
+; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT:    vmovups (%rsp), %ymm0
+; CHECK-AVX-O0-NEXT:    addq $40, %rsp
+; CHECK-AVX-O0-NEXT:    retq
+  %ret = load atomic <16 x bfloat>, ptr %x acquire, align 4
+  ret <16 x bfloat> %ret
+}
+
+define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec32_half:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    subq $72, %rsp
+; CHECK-O3-NEXT:    movq %rdi, %rsi
+; CHECK-O3-NEXT:    movq %rsp, %rdx
+; CHECK-O3-NEXT:    movl $64, %edi
+; CHECK-O3-NEXT:    movl $2, %ecx
+; CHECK-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
+; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT:    addq $72, %rsp
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec32_half:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    subq $72, %rsp
+; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT:    movl $64, %edi
+; CHECK-SSE-O3-NEXT:    movl $2, %ecx
+; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT:    movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT:    addq $72, %rsp
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec32_half:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    subq $72, %rsp
+; CHECK-O0-NEXT:    movq %rdi, %rsi
+; CHECK-O0-NEXT:    movl $64, %edi
+; CHECK-O0-NEXT:    movq %rsp, %rdx
+; CHECK-O0-NEXT:    movl $2, %ecx
+; CHECK-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-O0-NEXT:    movaps (%rsp), %xmm0
+; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT:    addq $72, %rsp
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec32_half:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    subq $72, %rsp
+; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT:    movl $64, %edi
+; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT:    movl $2, %ecx
+; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT:    movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT:    addq $72, %rsp
+; CHECK-SSE-O0-NEXT:    retq
+  %ret = load atomic <32 x half>, ptr %x acquire, align 4
+  ret <32 x half> %ret
+}
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
index 99fee27..88d7682 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
@@ -52,7 +52,7 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #2
 
-  attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
   attributes #1 = { nounwind readnone speculatable }
   attributes #2 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
index 50b2433..8dbd4e2 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
@@ -63,7 +63,7 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #2
 
-  attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
   attributes #1 = { nounwind readnone speculatable }
   attributes #2 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
index 7a4b993..c0924ea 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
@@ -73,7 +73,7 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #2
   
-  attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
   attributes #1 = { nounwind readnone speculatable }
   attributes #2 = { nounwind }
   
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll
index da9d16c..f074390 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll
+++ b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll
@@ -502,6 +502,6 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 
 attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index b4ba239..7fd4f59 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -48,5 +48,5 @@ entry:
 ; Function Attrs: nounwind readnone
 declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) #1
 
-attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/bit-piece-comment.ll b/llvm/test/CodeGen/X86/bit-piece-comment.ll
index d74863f..85c64a7 100644
--- a/llvm/test/CodeGen/X86/bit-piece-comment.ll
+++ b/llvm/test/CodeGen/X86/bit-piece-comment.ll
@@ -32,7 +32,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
-attributes #0 = { norecurse nounwind optsize readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind optsize readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/X86/catchpad-regmask.ll b/llvm/test/CodeGen/X86/catchpad-regmask.ll
index 9dba897..713d015 100644
--- a/llvm/test/CodeGen/X86/catchpad-regmask.ll
+++ b/llvm/test/CodeGen/X86/catchpad-regmask.ll
@@ -130,7 +130,7 @@ unreachable:                                      ; preds = %entry
 ; CHECK: retq                            # CATCHRET
 
 
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #1 = { noreturn }
 
 !llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/X86/catchpad-weight.ll b/llvm/test/CodeGen/X86/catchpad-weight.ll
index e97f358..699243d 100644
--- a/llvm/test/CodeGen/X86/catchpad-weight.ll
+++ b/llvm/test/CodeGen/X86/catchpad-weight.ll
@@ -74,8 +74,8 @@ declare void @"\01??1HasDtor@@QEAA@XZ"(ptr) #3
 ; Function Attrs: nounwind argmemonly
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
 
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #1 = { nounwind argmemonly }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
+attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #4 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/clang-section-coff.ll b/llvm/test/CodeGen/X86/clang-section-coff.ll
index 02381fd..6b76bb6 100644
--- a/llvm/test/CodeGen/X86/clang-section-coff.ll
+++ b/llvm/test/CodeGen/X86/clang-section-coff.ll
@@ -37,8 +37,8 @@ attributes #0 = { "bss-section"="my_bss.1" "data-section"="my_data.1" "rodata-se
 attributes #1 = { "data-section"="my_data.1" "rodata-section"="my_rodata.1" }
 attributes #2 = { "bss-section"="my_bss.2" "rodata-section"="my_rodata.1" }
 attributes #3 = { "bss-section"="my_bss.2" "data-section"="my_data.2" "rodata-section"="my_rodata.2" }
-attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #7 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 !llvm.module.flags = !{!0, !1, !2, !3}
 
diff --git a/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll b/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll
index 01e7019..863f580 100644
--- a/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll
+++ b/llvm/test/CodeGen/X86/cleanuppad-inalloca.ll
@@ -65,4 +65,4 @@ declare i32 @__CxxFrameHandler3(...)
 
 declare x86_thiscallcc void @"\01??1A@@QAE@XZ"(ptr) #0
 
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/complex-fastmath.ll b/llvm/test/CodeGen/X86/complex-fastmath.ll
index 29a37a1..21bb64a 100644
--- a/llvm/test/CodeGen/X86/complex-fastmath.ll
+++ b/llvm/test/CodeGen/X86/complex-fastmath.ll
@@ -212,4 +212,4 @@ define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
   ret <2 x double> %14
 }
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "unsafe-fp-math"="true"  }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true"  }
diff --git a/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll b/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
index ddddcfa..9f51fa4 100644
--- a/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
+++ b/llvm/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
@@ -264,5 +264,5 @@ unreachable:                                      ; preds = %cleanup100
 ; Function Attrs: nounwind
 declare void @printf(ptr nocapture readonly, ...) #1
 
-attributes #0 = { noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/dag-optnone.ll b/llvm/test/CodeGen/X86/dag-optnone.ll
index 66e4c1d..022694e 100644
--- a/llvm/test/CodeGen/X86/dag-optnone.ll
+++ b/llvm/test/CodeGen/X86/dag-optnone.ll
@@ -28,7 +28,7 @@
 ; a repeated fadd that can be combined into an fmul.  We show that this
 ; happens in both the non-optnone function and the optnone function.
 
-define float @foo(float %x, ...) #0 {
+define float @foo(float %x, ...) {
 entry:
   %add = fadd fast float %x, %x
   %add1 = fadd fast float %add, %x
@@ -68,5 +68,4 @@ entry:
   ret void
 }
 
-attributes #0 = { "unsafe-fp-math"="true" }
-attributes #1 = { noinline optnone "unsafe-fp-math"="true" }
+attributes #1 = { noinline optnone }
diff --git a/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index deba5a8..18e5490 100644
--- a/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/llvm/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -112,9 +112,9 @@ declare void @_Z3fooPcjPKc(ptr, i32, ptr) #2
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
 
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #3 = { nounwind readnone }
 attributes #4 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/dbg-changes-codegen.ll b/llvm/test/CodeGen/X86/dbg-changes-codegen.ll
index fabdbbb..c688895 100644
--- a/llvm/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/llvm/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -68,8 +68,8 @@ _ZN7Flibble3barEP6Wibble.exit:                    ; preds = %entry, %if.then.i
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
-attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 
 !1 = distinct !DISubprogram()
diff --git a/llvm/test/CodeGen/X86/dbg-combine.ll b/llvm/test/CodeGen/X86/dbg-combine.ll
index b3d2213..3ff5a26 100644
--- a/llvm/test/CodeGen/X86/dbg-combine.ll
+++ b/llvm/test/CodeGen/X86/dbg-combine.ll
@@ -63,7 +63,7 @@ declare ptr @llvm.stacksave() #2
 ; Function Attrs: nounwind
 declare void @llvm.stackrestore(ptr) #2
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/debug-loclists-lto.ll b/llvm/test/CodeGen/X86/debug-loclists-lto.ll
index fde8e00..2bd927f 100644
--- a/llvm/test/CodeGen/X86/debug-loclists-lto.ll
+++ b/llvm/test/CodeGen/X86/debug-loclists-lto.ll
@@ -34,8 +34,8 @@ entry:
   ret void, !dbg !29
 }
 
-attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone speculatable willreturn }
 
 !llvm.dbg.cu = !{!0, !7}
diff --git a/llvm/test/CodeGen/X86/debugloc-argsize.ll b/llvm/test/CodeGen/X86/debugloc-argsize.ll
index 3cfeb6e..f4527c5 100644
--- a/llvm/test/CodeGen/X86/debugloc-argsize.ll
+++ b/llvm/test/CodeGen/X86/debugloc-argsize.ll
@@ -30,7 +30,7 @@ declare ptr @__cxa_begin_catch(ptr)
 
 declare void @__cxa_end_catch()
 
-attributes #0 = { optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "use-soft-float"="false" }
 attributes #1 = { optsize }
 attributes #2 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/early-cfi-sections.ll b/llvm/test/CodeGen/X86/early-cfi-sections.ll
index 3a9e62a..8ab0340 100644
--- a/llvm/test/CodeGen/X86/early-cfi-sections.ll
+++ b/llvm/test/CodeGen/X86/early-cfi-sections.ll
@@ -12,7 +12,7 @@ entry:
   ret void, !dbg !8
 }
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/CodeGen/X86/fadd-combines.ll b/llvm/test/CodeGen/X86/fadd-combines.ll
index 2c06c53..a44671c 100644
--- a/llvm/test/CodeGen/X86/fadd-combines.ll
+++ b/llvm/test/CodeGen/X86/fadd-combines.ll
@@ -275,4 +275,4 @@ define <2 x double> @fmul2_negated_vec(<2 x double> %a, <2 x double> %b, <2 x do
   ret <2 x double> %sub
 }
 
-attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
index 5afa12c..1bc94b1 100644
--- a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
+++ b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
@@ -65,5 +65,5 @@ entry:
 declare i16 @llvm.convert.to.fp16.f64(double)
 declare i16 @llvm.convert.to.fp16.f80(x86_fp80)
 
-attributes #0 = { nounwind readnone "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/fdiv.ll b/llvm/test/CodeGen/X86/fdiv.ll
index 67bad09..859f54e 100644
--- a/llvm/test/CodeGen/X86/fdiv.ll
+++ b/llvm/test/CodeGen/X86/fdiv.ll
@@ -54,7 +54,7 @@ define double @denormal2(double %x) {
 
 ; Deleting the negates does not require unsafe-fp-math.
 
-define float @double_negative(float %x, float %y) #0 {
+define float @double_negative(float %x, float %y) {
 ; CHECK-LABEL: double_negative:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    divss %xmm1, %xmm0
@@ -65,7 +65,7 @@ define float @double_negative(float %x, float %y) #0 {
   ret float %div
 }
 
-define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) #0 {
+define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: double_negative_vector:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    divps %xmm1, %xmm0
@@ -80,7 +80,7 @@ define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) #0 {
 ; clang/gcc), due to order of argument evaluation not being well defined. We
 ; ended up hitting llvm_unreachable in getNegatedExpression when building with
 ; gcc. Just make sure that we get a deterministic result.
-define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) #0 {
+define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) {
 ; CHECK-LABEL: fdiv_fneg_combine:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm0, %xmm3
@@ -99,6 +99,3 @@ define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) #0 {
   %div5 = fdiv fast float %mul2, %sub4
   ret float %div5
 }
-
-attributes #0 = { "unsafe-fp-math"="false" }
-
diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
index 4c16cf9..0c3ec8d 100644
--- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
@@ -1021,7 +1021,7 @@ define <8 x double> @test_v8f64_interp_ninf(<8 x double> %x, <8 x double> %y, <8
 ; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z)
 ;
 
-define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
+define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; FMA-LABEL: test_v16f32_fneg_fmadd:
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm4
@@ -1044,7 +1044,7 @@ define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1,
   ret <16 x float> %neg
 }
 
-define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
+define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; FMA-LABEL: test_v8f64_fneg_fmsub:
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
@@ -1067,7 +1067,7 @@ define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <
   ret <8 x double> %neg
 }
 
-define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
+define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
 ; FMA-LABEL: test_v16f32_fneg_fnmadd:
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm4
@@ -1091,7 +1091,7 @@ define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1,
   ret <16 x float> %neg1
 }
 
-define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
+define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
 ; FMA-LABEL: test_v8f64_fneg_fnmsub:
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4
@@ -1119,7 +1119,7 @@ define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1,
 ; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
 ;
 
-define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
+define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) {
 ; FMA-LABEL: test_v16f32_fma_x_c1_fmul_x_c2:
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1146,7 +1146,7 @@ define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
 ; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
 ;
 
-define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float> %y) #0 {
+define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-LABEL: test_v16f32_fma_fmul_x_c1_c2_y:
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm2
@@ -1171,7 +1171,7 @@ define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float
 
 ; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0)
 
-define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0 {
+define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) {
 ; FMA-LABEL: test_v16f32_fneg_fmul:
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vxorps %xmm4, %xmm4, %xmm4
@@ -1196,7 +1196,7 @@ define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0
   ret <16 x float> %n
 }
 
-define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
+define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) {
 ; FMA-LABEL: test_v8f64_fneg_fmul:
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
@@ -1221,7 +1221,7 @@ define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
   ret <8 x double> %n
 }
 
-define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %y) #0 {
+define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %y) {
 ; FMA-LABEL: test_v8f64_fneg_fmul_no_nsz:
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
@@ -1250,7 +1250,6 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
   ret <8 x double> %n
 }
 
-attributes #0 = { "unsafe-fp-math"="true" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; AVX512-INFS: {{.*}}
 ; FMA-INFS: {{.*}}
diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index 5ea2964..d60d397 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -158,7 +158,7 @@ if.end:                                           ; preds = %if.else, %if.then
   ret i64 undef
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 !llvm.ident = !{!0}
 
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
index bd32430..fc5279d0 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll
@@ -100,40 +100,52 @@ entry:
   ret i32 %result
 }
 
-; These 2 divs only differ in their exception behavior and will be CSEd. Make
-; sure the nofpexcept flag is not set on the combined node.
+; These 4 divs only differ in their exception behavior. They form two groups,
+; whithin each the constrained functions have the same exception hehavior and
+; may be CSE'd. Instructions with different exception behavior belong to
+; different groups, they have different chain argument and cannot be CSE'd.
 define void @binop_cse(double %a, double %b, ptr %x, ptr %y) #0 {
 entry:
 ; CHECK-LABEL: name: binop_cse
-; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0)
-; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
-; CHECK: [[MOVSDrm_alt:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
-; CHECK: %3:fr64 = DIVSDrm [[MOVSDrm_alt]], %fixed-stack.2, 1, $noreg, 0, $noreg, implicit $mxcsr :: (load (s64) from %fixed-stack.2)
-; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %3 :: (store (s64) into %ir.x, align 4)
-; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %3 :: (store (s64) into %ir.y, align 4)
+; CHECK: [[Y:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0)
+; CHECK: [[X:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
+; CHECK: [[B:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2)
+; CHECK: [[A:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
+; CHECK: [[DIV0:%[0-9]+]]:fr64 = DIVSDrr [[A]], [[B]], implicit $mxcsr
+; CHECK: [[DIV1:%[0-9]+]]:fr64 = nofpexcept DIVSDrr [[A]], [[B]], implicit $mxcsr
+; CHECK: MOVSDmr killed [[X]], 1, $noreg, 0, $noreg, [[DIV1]] :: (store (s64) into %ir.x, align 4)
+; CHECK: MOVSDmr killed [[Y]], 1, $noreg, 0, $noreg, [[DIV1]] :: (store (s64) into %ir.y, align 4)
 ; CHECK: RET 0
   %div = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  %div1 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   %div2 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
-  store double %div, ptr %x
-  store double %div2, ptr %y
+  %div3 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+  store double %div2, ptr %x
+  store double %div3, ptr %y
   ret void
 }
 
-; These 2 sitofps only differ in their exception behavior and will be CSEd. Make
-; sure the nofpexcept flag is not set on the combined node.
+; These 4 divs only differ in their exception behavior. They form two groups,
+; whithin each the constrained functions have the same exception hehavior and
+; may be CSE'd. Instructions with different exception behavior belong to
+; different groups, they have different chain argument and cannot be CSE'd.
 define void @sitofp_cse(i32 %a, ptr %x, ptr %y) #0 {
 entry:
 ; CHECK-LABEL: name: sitofp_cse
-; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8)
-; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1)
-; CHECK: %2:fr64 = CVTSI2SDrm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 16)
-; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %2 :: (store (s64) into %ir.x, align 4)
-; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %2 :: (store (s64) into %ir.y, align 4)
+; CHECK: [[Y:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8)
+; CHECK: [[X:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1)
+; CHECK: [[A:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 16)
+; CHECK: [[CVT0:%[0-9]+]]:fr64 = CVTSI2SDrr [[A]]
+; CHECK: [[CVT1:%[0-9]+]]:fr64 = nofpexcept CVTSI2SDrr [[A]]
+; CHECK: MOVSDmr killed [[X]], 1, $noreg, 0, $noreg, [[CVT1]] :: (store (s64) into %ir.x, align 4)
+; CHECK: MOVSDmr killed [[Y]], 1, $noreg, 0, $noreg, [[CVT1]] :: (store (s64) into %ir.y, align 4)
 ; CHECK: RET 0
   %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  %result1 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   %result2 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
-  store double %result, ptr %x
-  store double %result2, ptr %y
+  %result3 = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+  store double %result2, ptr %x
+  store double %result3, ptr %y
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/fp128-g.ll b/llvm/test/CodeGen/X86/fp128-g.ll
index 58a57d3..d2b956f 100644
--- a/llvm/test/CodeGen/X86/fp128-g.ll
+++ b/llvm/test/CodeGen/X86/fp128-g.ll
@@ -106,8 +106,8 @@ entry:
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
-attributes #0 = { nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "use-soft-float"="false" }
+attributes #1 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!2}
diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll
index f176a29..ef616ca 100644
--- a/llvm/test/CodeGen/X86/fp128-i128.ll
+++ b/llvm/test/CodeGen/X86/fp128-i128.ll
@@ -526,6 +526,6 @@ cleanup:                                          ; preds = %entry, %if.then
 }
 
 
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/frame-order.ll b/llvm/test/CodeGen/X86/frame-order.ll
index dcbcb48..f410acf 100644
--- a/llvm/test/CodeGen/X86/frame-order.ll
+++ b/llvm/test/CodeGen/X86/frame-order.ll
@@ -74,9 +74,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare void @capture(ptr) #2
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #3 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/X86/fsafdo_test2.ll b/llvm/test/CodeGen/X86/fsafdo_test2.ll
index d83e241..fc4c1e8 100644
--- a/llvm/test/CodeGen/X86/fsafdo_test2.ll
+++ b/llvm/test/CodeGen/X86/fsafdo_test2.ll
@@ -196,10 +196,10 @@ if.end9.3:
 }
 
 
-attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind willreturn }
-attributes #2 = { nofree noinline norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nofree noinline norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
+attributes #3 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
index 7bb3bf42..4347d62 100644
--- a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -130,5 +130,5 @@ for.end:                                          ; preds = %for.cond.preheader
 ; Function Attrs: nounwind
 declare i32 @varfunc(ptr nocapture readonly, ...) #0
 
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse" "use-soft-float"="false" }
 attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll b/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll
index f982196..11a1f39 100644
--- a/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-A-constraint.ll
@@ -23,7 +23,7 @@ entry:
 ; CHECK: lock
 ; CHECK-NEXT: cmpxchg16b
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #1 = { nounwind }
 
 !llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/label-annotation.ll b/llvm/test/CodeGen/X86/label-annotation.ll
index 626040c..05e4e87 100644
--- a/llvm/test/CodeGen/X86/label-annotation.ll
+++ b/llvm/test/CodeGen/X86/label-annotation.ll
@@ -77,8 +77,8 @@ entry:
 
 
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #2 = { inaccessiblememonly noduplicate nounwind }
 attributes #3 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/label-heapallocsite.ll b/llvm/test/CodeGen/X86/label-heapallocsite.ll
index 31bca25..72834be6c 100644
--- a/llvm/test/CodeGen/X86/label-heapallocsite.ll
+++ b/llvm/test/CodeGen/X86/label-heapallocsite.ll
@@ -98,8 +98,8 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) #2
 ; CHECK-NEXT:  .short [[LABEL5]]-[[LABEL4]]
 ; CHECK-NEXT:  .long 4096
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone speculatable willreturn }
 attributes #3 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/late-remat-update.mir b/llvm/test/CodeGen/X86/late-remat-update.mir
index 3212312..9108002 100644
--- a/llvm/test/CodeGen/X86/late-remat-update.mir
+++ b/llvm/test/CodeGen/X86/late-remat-update.mir
@@ -39,8 +39,8 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #2
   
-  attributes #0 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
   attributes #2 = { nounwind }
   
   !llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
index 5199b15..96780af0 100644
--- a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
@@ -92,4 +92,4 @@ if.end:
 ; CHECK:	pushl ([[REG3]])
 }
 
-attributes #0 = { nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/lifetime-alias.ll b/llvm/test/CodeGen/X86/lifetime-alias.ll
index 3efaccb..22e350c 100644
--- a/llvm/test/CodeGen/X86/lifetime-alias.ll
+++ b/llvm/test/CodeGen/X86/lifetime-alias.ll
@@ -140,10 +140,10 @@ declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture reado
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1
 
-attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
-attributes #2 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #3 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #4 = { nounwind }
 attributes #5 = { noreturn nounwind }
 attributes #6 = { builtin nounwind }
diff --git a/llvm/test/CodeGen/X86/limit-split-cost.mir b/llvm/test/CodeGen/X86/limit-split-cost.mir
index 5b8bb98..8e4e786 100644
--- a/llvm/test/CodeGen/X86/limit-split-cost.mir
+++ b/llvm/test/CodeGen/X86/limit-split-cost.mir
@@ -53,8 +53,8 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #2
   
-  attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
   attributes #2 = { nounwind }
   
   !llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
index 3dba5eb..206d453 100644
--- a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
+++ b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
@@ -41,5 +41,5 @@ if.end:                                           ; preds = %entry
 
 declare <4 x float> @_Z1bv() local_unnamed_addr 
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 
diff --git a/llvm/test/CodeGen/X86/misched-copy.ll b/llvm/test/CodeGen/X86/misched-copy.ll
index fa6cd15..e3ceddf 100644
--- a/llvm/test/CodeGen/X86/misched-copy.ll
+++ b/llvm/test/CodeGen/X86/misched-copy.ll
@@ -42,7 +42,7 @@ end:
   ret i64 %add
 }
 
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
 
 !0 = !{!"float", !1}
 !1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/CodeGen/X86/misched-matmul.ll b/llvm/test/CodeGen/X86/misched-matmul.ll
index a6c489d..9029167 100644
--- a/llvm/test/CodeGen/X86/misched-matmul.ll
+++ b/llvm/test/CodeGen/X86/misched-matmul.ll
@@ -222,4 +222,4 @@ entry:
   ret void
 }
 
-attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/movpc32-check.ll b/llvm/test/CodeGen/X86/movpc32-check.ll
index e3730d0..4585dcb 100644
--- a/llvm/test/CodeGen/X86/movpc32-check.ll
+++ b/llvm/test/CodeGen/X86/movpc32-check.ll
@@ -12,8 +12,8 @@ entry:
 
 declare void @bar(...) #1
 
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i686" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8, !9}
diff --git a/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll b/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll
index ec17b1d..04db25b 100644
--- a/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll
+++ b/llvm/test/CodeGen/X86/ms-inline-asm-avx512.ll
@@ -20,5 +20,5 @@ entry:
 ; CHECK: movq    %rax, 7(%rsp)
 ; CHECK: retq
 
-attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
 attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/nocf_check.ll b/llvm/test/CodeGen/X86/nocf_check.ll
index 7b184ed..742b07d 100644
--- a/llvm/test/CodeGen/X86/nocf_check.ll
+++ b/llvm/test/CodeGen/X86/nocf_check.ll
@@ -66,8 +66,8 @@ bb2:
   ret void
 }
 
-attributes #0 = { nocf_check noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nocf_check noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
+attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
 attributes #2 = { nocf_check }
 
 !llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/X86/pr15705.ll b/llvm/test/CodeGen/X86/pr15705.ll
index 3dd4aab..2de9a34 100644
--- a/llvm/test/CodeGen/X86/pr15705.ll
+++ b/llvm/test/CodeGen/X86/pr15705.ll
@@ -45,4 +45,4 @@ return:
   ret i32 %retval.0
 }
 
-attributes #0 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr18846.ll b/llvm/test/CodeGen/X86/pr18846.ll
index 93a9a5d..4239f46 100644
--- a/llvm/test/CodeGen/X86/pr18846.ll
+++ b/llvm/test/CodeGen/X86/pr18846.ll
@@ -122,7 +122,7 @@ for.body65:                                       ; preds = %for.body29
 ; Function Attrs: nounwind
 declare void @llvm.x86.avx.storeu.ps.256(ptr, <8 x float>) #1
 
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind }
 
 !llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/X86/pr31045.ll b/llvm/test/CodeGen/X86/pr31045.ll
index 4aa73d7..78ba7cc 100644
--- a/llvm/test/CodeGen/X86/pr31045.ll
+++ b/llvm/test/CodeGen/X86/pr31045.ll
@@ -73,4 +73,4 @@ entry:
   ret void
 }
 
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr32610.ll b/llvm/test/CodeGen/X86/pr32610.ll
index dc11ba8..6f3602d 100644
--- a/llvm/test/CodeGen/X86/pr32610.ll
+++ b/llvm/test/CodeGen/X86/pr32610.ll
@@ -50,7 +50,7 @@ entry:
   ret void
 }
 
-attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "use-soft-float"="false" }
 
 !llvm.ident = !{!0}
 
diff --git a/llvm/test/CodeGen/X86/pr34080-2.ll b/llvm/test/CodeGen/X86/pr34080-2.ll
index de34bfb1..279373a 100644
--- a/llvm/test/CodeGen/X86/pr34080-2.ll
+++ b/llvm/test/CodeGen/X86/pr34080-2.ll
@@ -132,4 +132,4 @@ define void @computeJD(ptr) nounwind {
   ret void
 }
 
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i486" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i486" "target-features"="+x87" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr34080.ll b/llvm/test/CodeGen/X86/pr34080.ll
index d07d1aa..3b46bd3 100644
--- a/llvm/test/CodeGen/X86/pr34080.ll
+++ b/llvm/test/CodeGen/X86/pr34080.ll
@@ -162,4 +162,4 @@ entry:
   ret void
 }
 
-attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/pr34629.ll b/llvm/test/CodeGen/X86/pr34629.ll
index eeb61d2..f7747b1 100644
--- a/llvm/test/CodeGen/X86/pr34629.ll
+++ b/llvm/test/CodeGen/X86/pr34629.ll
@@ -38,7 +38,7 @@ if.end:                                           ; preds = %entry, %if.then
   ret void
 }
 
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/pr34634.ll b/llvm/test/CodeGen/X86/pr34634.ll
index a374112..980961a 100644
--- a/llvm/test/CodeGen/X86/pr34634.ll
+++ b/llvm/test/CodeGen/X86/pr34634.ll
@@ -54,7 +54,7 @@ entry:
   ret i32 0
 }
 
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/X86/pr42727.ll b/llvm/test/CodeGen/X86/pr42727.ll
index cf1fa5a..18e884b 100644
--- a/llvm/test/CodeGen/X86/pr42727.ll
+++ b/llvm/test/CodeGen/X86/pr42727.ll
@@ -29,5 +29,5 @@ entry:
   ret void
 }
 
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+avx,+avx2,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+avx,+avx2,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "use-soft-float"="false" }
 
diff --git a/llvm/test/CodeGen/X86/pr48064.mir b/llvm/test/CodeGen/X86/pr48064.mir
index eb74edd..d2eaea7 100644
--- a/llvm/test/CodeGen/X86/pr48064.mir
+++ b/llvm/test/CodeGen/X86/pr48064.mir
@@ -185,8 +185,8 @@
   ; Function Attrs: nounwind
   declare void @llvm.x86.seh.ehregnode(ptr) #7
 
-  attributes #0 = { noinline nounwind sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  attributes #1 = { norecurse sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { noinline nounwind sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
+  attributes #1 = { norecurse sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
   attributes #2 = { argmemonly nofree nosync nounwind willreturn }
   attributes #3 = { nofree }
   attributes #4 = { nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
index 0d5d822..e497575 100644
--- a/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
@@ -178,4 +178,4 @@ bb439:                                            ; preds = %bb222, %bb85
   ret void
 }
 
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index dab7a6a..f8d28ae 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -1400,7 +1400,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
   ret <16 x float> %div
 }
 
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
-attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
+attributes #0 = { "reciprocal-estimates"="!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="divf,vec-divf" }
+attributes #2 = { "reciprocal-estimates"="divf:2,vec-divf:2" }
 
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 77ccaff..7fa13cb 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -1841,8 +1841,8 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
   ret <16 x float> %div
 }
 
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
-attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
-attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }
+attributes #0 = { "reciprocal-estimates"="!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="divf,vec-divf" }
+attributes #2 = { "reciprocal-estimates"="divf:2,vec-divf:2" }
+attributes #3 = { "reciprocal-estimates"="divf:0,vec-divf:0" }
 
diff --git a/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
index 50422a8..ea1ca51 100644
--- a/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
+++ b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
@@ -72,7 +72,7 @@ if.end:                                           ; preds = %if.else, %if.then
   ret i32 %add
 }
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "use-soft-float"="false" }
 attributes #1 = { nounwind }
 
 !llvm.module.flags = !{!0, !1}
diff --git a/llvm/test/CodeGen/X86/regparm.ll b/llvm/test/CodeGen/X86/regparm.ll
index 6d6802e..95009b5 100644
--- a/llvm/test/CodeGen/X86/regparm.ll
+++ b/llvm/test/CodeGen/X86/regparm.ll
@@ -38,7 +38,7 @@ entry:
 declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1) #1
 
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
 
 !llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll
index cb85f39..85e465b 100644
--- a/llvm/test/CodeGen/X86/seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/seh-catchpad.ll
@@ -189,9 +189,9 @@ entry:
 ; Function Attrs: nounwind
 declare i32 @puts(ptr nocapture readonly) #3
 
-attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
+attributes #2 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
+attributes #3 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+sse,+sse2" "use-soft-float"="false" }
 attributes #4 = { noinline }
 attributes #5 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/seh-except-finally.ll b/llvm/test/CodeGen/X86/seh-except-finally.ll
index 539d776..fedb0c4 100644
--- a/llvm/test/CodeGen/X86/seh-except-finally.ll
+++ b/llvm/test/CodeGen/X86/seh-except-finally.ll
@@ -136,10 +136,10 @@ declare ptr @llvm.localaddress() #4
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.eh.typeid.for(ptr) #4
 
-attributes #0 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
+attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
+attributes #2 = { noinline nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
+attributes #3 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "use-soft-float"="false" }
 attributes #4 = { nounwind readnone }
 attributes #5 = { noinline }
 attributes #6 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/seh-no-invokes.ll b/llvm/test/CodeGen/X86/seh-no-invokes.ll
index 63e91d3..112031c 100644
--- a/llvm/test/CodeGen/X86/seh-no-invokes.ll
+++ b/llvm/test/CodeGen/X86/seh-no-invokes.ll
@@ -63,8 +63,8 @@ declare i32 @_except_handler3(...)
 ; Function Attrs: nounwind
 declare void @llvm.localescape(...) #3
 
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/shrinkwrap-hang.ll b/llvm/test/CodeGen/X86/shrinkwrap-hang.ll
index fe42d31..7e98b8a 100644
--- a/llvm/test/CodeGen/X86/shrinkwrap-hang.ll
+++ b/llvm/test/CodeGen/X86/shrinkwrap-hang.ll
@@ -29,4 +29,4 @@ if.end3:                                          ; preds = %if.end
   ret void
 }
 
-attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index a260b32..83bfcd7 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1012,15 +1012,15 @@ define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
   ret double %sqrt_fast
 }
 
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
+attributes #0 = { "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="sqrt,vec-sqrt" }
 attributes #2 = { nounwind readnone }
-attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
-attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
-attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
-attributes #6 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
+attributes #3 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
+attributes #4 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
+attributes #5 = { "reciprocal-estimates"="all:0" }
+attributes #6 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
 
-; Attributes without "unsafe-fp-math"="true"
+; Attributes without
 ; TODO: Merge with previous attributes when this attribute can be deleted.
 attributes #7 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" } ; #3
 attributes #8 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" } ; #6
diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll
index 8ac86d1..5005752 100644
--- a/llvm/test/CodeGen/X86/sse1.ll
+++ b/llvm/test/CodeGen/X86/sse1.ll
@@ -251,5 +251,5 @@ define <2 x float> @PR31672() #0 {
 
 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #1
 
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
+attributes #0 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
index 665a84a..d7c9438 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1912,7 +1912,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
 }
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
-define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1927,7 +1927,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
 }
 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
 
-define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1941,7 +1941,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double>
   ret <2 x double> %2
 }
 
-define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1956,7 +1956,7 @@ define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0
 }
 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
 
-define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
+define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd_ymm_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1970,7 +1970,7 @@ define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x doub
   ret <4 x double> %2
 }
 
-define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1985,7 +1985,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
 }
 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
 
-define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1999,7 +1999,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1
   ret <4 x float> %2
 }
 
-define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2014,7 +2014,7 @@ define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
 }
 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
 
-define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps_ymm_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2028,7 +2028,7 @@ define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float>
   ret <8 x float> %2
 }
 
-define double @stack_fold_maxsd(double %a0, double %a1) #0 {
+define double @stack_fold_maxsd(double %a0, double %a1) {
 ; CHECK-LABEL: stack_fold_maxsd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2043,7 +2043,7 @@ define double @stack_fold_maxsd(double %a0, double %a1) #0 {
   ret double %3
 }
 
-define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_maxsd_commutable(double %a0, double %a1) {
 ; CHECK-LABEL: stack_fold_maxsd_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2058,7 +2058,7 @@ define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
   ret double %3
 }
 
-define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxsd_int:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2073,7 +2073,7 @@ define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0
 }
 declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
 
-define float @stack_fold_maxss(float %a0, float %a1) #0 {
+define float @stack_fold_maxss(float %a0, float %a1) {
 ; CHECK-LABEL: stack_fold_maxss:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2088,7 +2088,7 @@ define float @stack_fold_maxss(float %a0, float %a1) #0 {
   ret float %3
 }
 
-define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_maxss_commutable(float %a0, float %a1) {
 ; CHECK-LABEL: stack_fold_maxss_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2103,7 +2103,7 @@ define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
   ret float %3
 }
 
-define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxss_int:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2118,7 +2118,7 @@ define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
 }
 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 
-define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_minpd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2133,7 +2133,7 @@ define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
 }
 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
 
-define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_minpd_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2147,7 +2147,7 @@ define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double>
   ret <2 x double> %2
 }
 
-define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: stack_fold_minpd_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2162,7 +2162,7 @@ define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0
 }
 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
 
-define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
+define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: stack_fold_minpd_ymm_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2176,7 +2176,7 @@ define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x doub
   ret <4 x double> %2
 }
 
-define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2191,7 +2191,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
 }
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
 
-define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2205,7 +2205,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1
   ret <4 x float> %2
 }
 
-define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2220,7 +2220,7 @@ define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
 }
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
 
-define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps_ymm_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2234,7 +2234,7 @@ define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float>
   ret <8 x float> %2
 }
 
-define double @stack_fold_minsd(double %a0, double %a1) #0 {
+define double @stack_fold_minsd(double %a0, double %a1) {
 ; CHECK-LABEL: stack_fold_minsd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2249,7 +2249,7 @@ define double @stack_fold_minsd(double %a0, double %a1) #0 {
   ret double %3
 }
 
-define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_minsd_commutable(double %a0, double %a1) {
 ; CHECK-LABEL: stack_fold_minsd_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2279,7 +2279,7 @@ define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
 }
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
 
-define float @stack_fold_minss(float %a0, float %a1) #0 {
+define float @stack_fold_minss(float %a0, float %a1) {
 ; CHECK-LABEL: stack_fold_minss:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2294,7 +2294,7 @@ define float @stack_fold_minss(float %a0, float %a1) #0 {
   ret float %3
 }
 
-define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_minss_commutable(float %a0, float %a1) {
 ; CHECK-LABEL: stack_fold_minss_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -2309,7 +2309,7 @@ define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
   ret float %3
 }
 
-define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minss_int:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3632,6 +3632,3 @@ define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
   %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
   ret <8 x float> %6
 }
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
index a75cdf9d..43743d5 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -609,7 +609,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
 }
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
-define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -695,7 +695,7 @@ define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x d
   ret <8 x double> %4
 }
 
-define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -781,7 +781,7 @@ define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x
   ret <16 x float> %4
 }
 
-define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) {
 ; CHECK-LABEL: stack_fold_minpd_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -867,7 +867,7 @@ define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x d
   ret <8 x double> %4
 }
 
-define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1157,7 +1157,7 @@ define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
   ret <4 x float> %5
 }
 
-define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) {
 ; CHECK-LABEL: stack_fold_orpd_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1178,7 +1178,7 @@ define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0
   ret <8 x double> %6
 }
 
-define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) {
 ; CHECK-LABEL: stack_fold_orps_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1414,7 +1414,7 @@ define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
   ret <4 x float> %5
 }
 
-define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
+define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) {
 ; CHECK-LABEL: stack_fold_xorpd_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1435,7 +1435,7 @@ define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0
   ret <8 x double> %6
 }
 
-define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
+define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) {
 ; CHECK-LABEL: stack_fold_xorps_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2058,5 +2058,4 @@ define <16 x float> @stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i3
   ret <16 x float> %4
 }
 
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
index 52d4d8b..b715df8 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -502,7 +502,7 @@ define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, pt
   ret <8 x half> %3
 }
 
-define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) {
 ; CHECK-LABEL: stack_fold_maxph_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -517,7 +517,7 @@ define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
 }
 declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
 
-define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) {
 ; CHECK-LABEL: stack_fold_maxph_zmm_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -532,7 +532,7 @@ define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %
   ret <32 x half> %2
 }
 
-define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
 ; CHECK-LABEL: stack_fold_maxph_zmm_k:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -552,7 +552,7 @@ define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
   ret <32 x half> %5
 }
 
-define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
 ; CHECK-LABEL: stack_fold_maxph_zmm_k_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -573,7 +573,7 @@ define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half>
   ret <32 x half> %5
 }
 
-define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
 ; CHECK-LABEL: stack_fold_maxph_zmm_kz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -590,7 +590,7 @@ define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
   ret <32 x half> %4
 }
 
-define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
 ; CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -710,7 +710,7 @@ define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0,
   ret <32 x half> %4
 }
 
-define half @stack_fold_maxsh(half %a0, half %a1) #0 {
+define half @stack_fold_maxsh(half %a0, half %a1) {
 ; CHECK-LABEL: stack_fold_maxsh:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -725,7 +725,7 @@ define half @stack_fold_maxsh(half %a0, half %a1) #0 {
   ret half %3
 }
 
-define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
+define half @stack_fold_maxsh_commuted(half %a0, half %a1) {
 ; CHECK-LABEL: stack_fold_maxsh_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -772,7 +772,7 @@ define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 {
   ret half %3
 }
 
-define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) {
 ; CHECK-LABEL: stack_fold_maxsh_int:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -820,7 +820,7 @@ define <8 x half> @stack_fold_maxsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %ma
   ret <8 x half> %2
 }
 
-define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) {
 ; CHECK-LABEL: stack_fold_minph_zmm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -835,7 +835,7 @@ define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
 }
 declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
 
-define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
+define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) {
 ; CHECK-LABEL: stack_fold_minph_zmm_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -850,7 +850,7 @@ define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %
   ret <32 x half> %2
 }
 
-define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
 ; CHECK-LABEL: stack_fold_minph_zmm_k:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -870,7 +870,7 @@ define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
   ret <32 x half> %5
 }
 
-define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
+define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
 ; CHECK-LABEL: stack_fold_minph_zmm_k_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -891,7 +891,7 @@ define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half>
   ret <32 x half> %5
 }
 
-define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
 ; CHECK-LABEL: stack_fold_minph_zmm_kz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -908,7 +908,7 @@ define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
   ret <32 x half> %4
 }
 
-define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
+define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
 ; CHECK-LABEL: stack_fold_minph_zmm_kz_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -1028,7 +1028,7 @@ define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0,
   ret <32 x half> %4
 }
 
-define half @stack_fold_minsh(half %a0, half %a1) #0 {
+define half @stack_fold_minsh(half %a0, half %a1) {
 ; CHECK-LABEL: stack_fold_minsh:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1043,7 +1043,7 @@ define half @stack_fold_minsh(half %a0, half %a1) #0 {
   ret half %3
 }
 
-define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
+define half @stack_fold_minsh_commuted(half %a0, half %a1) {
 ; CHECK-LABEL: stack_fold_minsh_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1090,7 +1090,7 @@ define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 {
   ret half %3
 }
 
-define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) {
 ; CHECK-LABEL: stack_fold_minsh_int:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2316,5 +2316,4 @@ define <4 x float> @stack_fold_fcmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1,
 }
 declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
index 4fed6bc..cd06f2d 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
@@ -381,7 +381,7 @@ define <16 x half> @stack_fold_getmantph_maskz_ymm(<16 x half> %a0, ptr %mask) {
   ret <16 x half> %3
 }
 
-define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) {
 ; CHECK-LABEL: stack_fold_maxph:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -396,7 +396,7 @@ define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
 }
 declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) nounwind readnone
 
-define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
+define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) {
 ; CHECK-LABEL: stack_fold_maxph_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -410,7 +410,7 @@ define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #
   ret <8 x half> %2
 }
 
-define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
+define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) {
 ; CHECK-LABEL: stack_fold_maxph_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -425,7 +425,7 @@ define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
 }
 declare <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) nounwind readnone
 
-define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
+define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) {
 ; CHECK-LABEL: stack_fold_maxph_ymm_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -439,7 +439,7 @@ define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half>
   ret <16 x half> %2
 }
 
-define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
+define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) {
 ; CHECK-LABEL: stack_fold_minph:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -454,7 +454,7 @@ define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
 }
 declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) nounwind readnone
 
-define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
+define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) {
 ; CHECK-LABEL: stack_fold_minph_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -468,7 +468,7 @@ define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #
   ret <8 x half> %2
 }
 
-define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
+define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) {
 ; CHECK-LABEL: stack_fold_minph_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -483,7 +483,7 @@ define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
 }
 declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) nounwind readnone
 
-define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
+define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) {
 ; CHECK-LABEL: stack_fold_minph_ymm_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1471,6 +1471,3 @@ define <8 x float> @stack_fold_fcmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a
   ret <8 x float> %3
 }
 declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index b370a80..bd56e61 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -457,7 +457,7 @@ define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
   ret <4 x float> %2
 }
 
-define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -472,7 +472,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
 }
 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
 
-define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -486,7 +486,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double>
   ret <2 x double> %2
 }
 
-define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -501,7 +501,7 @@ define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0
 }
 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
 
-define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
+define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd_ymm_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -515,7 +515,7 @@ define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x doub
   ret <4 x double> %2
 }
 
-define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -530,7 +530,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
 }
 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
 
-define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -544,7 +544,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1
   ret <4 x float> %2
 }
 
-define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -559,7 +559,7 @@ define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
 }
 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
 
-define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps_ymm_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -573,7 +573,7 @@ define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float>
   ret <8 x float> %2
 }
 
-define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -588,7 +588,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
 }
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
 
-define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -602,7 +602,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1
   ret <4 x float> %2
 }
 
-define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
+define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -617,7 +617,7 @@ define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
 }
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
 
-define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
+define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps_ymm_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -687,7 +687,7 @@ define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) {
   ret <8 x float> %2
 }
 
-define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_orpd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -708,7 +708,7 @@ define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) #0 {
   ret <2 x double> %6
 }
 
-define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: stack_fold_orpd_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -939,7 +939,7 @@ define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
   ret <8 x float> %2
 }
 
-define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_xorpd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -960,7 +960,7 @@ define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) #0 {
   ret <2 x double> %6
 }
 
-define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
+define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: stack_fold_xorpd_ymm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1391,6 +1391,3 @@ declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <
 declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
 declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
 declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
index 306ee31..9bc9a9c 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -1424,7 +1424,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
 }
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
-define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1439,7 +1439,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
 }
 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
 
-define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxpd_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1453,7 +1453,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double>
   ret <2 x double> %2
 }
 
-define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1468,7 +1468,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
 }
 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
 
-define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxps_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1482,7 +1482,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1
   ret <4 x float> %2
 }
 
-define double @stack_fold_maxsd(double %a0, double %a1) #0 {
+define double @stack_fold_maxsd(double %a0, double %a1) {
 ; CHECK-LABEL: stack_fold_maxsd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1497,7 +1497,7 @@ define double @stack_fold_maxsd(double %a0, double %a1) #0 {
   ret double %3
 }
 
-define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_maxsd_commutable(double %a0, double %a1) {
 ; CHECK-LABEL: stack_fold_maxsd_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1512,7 +1512,7 @@ define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
   ret double %3
 }
 
-define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_maxsd_int:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1527,7 +1527,7 @@ define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0
 }
 declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
 
-define float @stack_fold_maxss(float %a0, float %a1) #0 {
+define float @stack_fold_maxss(float %a0, float %a1) {
 ; CHECK-LABEL: stack_fold_maxss:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1542,7 +1542,7 @@ define float @stack_fold_maxss(float %a0, float %a1) #0 {
   ret float %3
 }
 
-define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_maxss_commutable(float %a0, float %a1) {
 ; CHECK-LABEL: stack_fold_maxss_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1557,7 +1557,7 @@ define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
   ret float %3
 }
 
-define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_maxss_int:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1572,7 +1572,7 @@ define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
 }
 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 
-define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_minpd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1587,7 +1587,7 @@ define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
 }
 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
 
-define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
+define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_minpd_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1601,7 +1601,7 @@ define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double>
   ret <2 x double> %2
 }
 
-define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1616,7 +1616,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
 }
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
 
-define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
+define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minps_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1630,7 +1630,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1
   ret <4 x float> %2
 }
 
-define double @stack_fold_minsd(double %a0, double %a1) #0 {
+define double @stack_fold_minsd(double %a0, double %a1) {
 ; CHECK-LABEL: stack_fold_minsd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1645,7 +1645,7 @@ define double @stack_fold_minsd(double %a0, double %a1) #0 {
   ret double %3
 }
 
-define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
+define double @stack_fold_minsd_commutable(double %a0, double %a1) {
 ; CHECK-LABEL: stack_fold_minsd_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1660,7 +1660,7 @@ define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
   ret double %3
 }
 
-define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
+define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: stack_fold_minsd_int:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1675,7 +1675,7 @@ define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) #0
 }
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
 
-define float @stack_fold_minss(float %a0, float %a1) #0 {
+define float @stack_fold_minss(float %a0, float %a1) {
 ; CHECK-LABEL: stack_fold_minss:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1690,7 +1690,7 @@ define float @stack_fold_minss(float %a0, float %a1) #0 {
   ret float %3
 }
 
-define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
+define float @stack_fold_minss_commutable(float %a0, float %a1) {
 ; CHECK-LABEL: stack_fold_minss_commutable:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1705,7 +1705,7 @@ define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
   ret float %3
 }
 
-define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: stack_fold_minss_int:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2490,6 +2490,3 @@ define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
 
 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/stack-protector-3.ll b/llvm/test/CodeGen/X86/stack-protector-3.ll
index 59784af..8ca6a56 100644
--- a/llvm/test/CodeGen/X86/stack-protector-3.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-3.ll
@@ -118,7 +118,7 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
 ; Function Attrs: argmemonly nounwind willreturn
 declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
 
-attributes #0 = { nounwind sspreq uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind sspreq uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind willreturn }
 attributes #2 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll b/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
index 63390e4..4bc91bf 100644
--- a/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
@@ -55,6 +55,6 @@ entry:
 
 declare void @f(i32) #1
 
-attributes #0 = { nounwind sspreq "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind sspreq "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/stack_guard_remat.ll b/llvm/test/CodeGen/X86/stack_guard_remat.ll
index f7a602c..f53fa0b4 100644
--- a/llvm/test/CodeGen/X86/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/X86/stack_guard_remat.ll
@@ -23,4 +23,4 @@ declare void @foo3(ptr)
 ; Function Attrs: nounwind
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
 
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/tail-merge-wineh.ll b/llvm/test/CodeGen/X86/tail-merge-wineh.ll
index 00bddc1..a208368 100644
--- a/llvm/test/CodeGen/X86/tail-merge-wineh.ll
+++ b/llvm/test/CodeGen/X86/tail-merge-wineh.ll
@@ -101,5 +101,5 @@ declare x86_stdcallcc void @_CxxThrowException(ptr, ptr)
 
 declare i32 @__CxxFrameHandler3(...)
 
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { noreturn }
diff --git a/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll b/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll
index 2749ebd..f0d5a16 100644
--- a/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/tls-shrink-wrapping.ll
@@ -51,6 +51,6 @@ if.end:                                           ; preds = %if.then, %entry
 
 declare void @f(...) #1
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/unused_stackslots.ll b/llvm/test/CodeGen/X86/unused_stackslots.ll
index d909dd4..4d390bd 100644
--- a/llvm/test/CodeGen/X86/unused_stackslots.ll
+++ b/llvm/test/CodeGen/X86/unused_stackslots.ll
@@ -215,8 +215,8 @@ declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64,
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "use-soft-float"="false" }
 attributes #3 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/uwtables.ll b/llvm/test/CodeGen/X86/uwtables.ll
index 1e2e1d9..68a5ff1 100644
--- a/llvm/test/CodeGen/X86/uwtables.ll
+++ b/llvm/test/CodeGen/X86/uwtables.ll
@@ -38,5 +38,5 @@ declare i32 @__gxx_personality_v0(...)
 declare void @__cxa_call_unexpected(ptr) local_unnamed_addr
 
 
-attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 910dd1e..5954e34 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -5435,7 +5435,7 @@ define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
   ret double %r
 }
 
-define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
+define void @PR43609(ptr nocapture %x, <2 x i64> %y) {
 ; SSE2-LABEL: PR43609:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,2]
@@ -5643,6 +5643,3 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
   store <2 x double> %t23, ptr %t26, align 8
   ret void
 }
-
-attributes #0 = { "unsafe-fp-math"="true" }
-
diff --git a/llvm/test/CodeGen/X86/vector-sqrt.ll b/llvm/test/CodeGen/X86/vector-sqrt.ll
index b08784a..843f099a 100644
--- a/llvm/test/CodeGen/X86/vector-sqrt.ll
+++ b/llvm/test/CodeGen/X86/vector-sqrt.ll
@@ -63,6 +63,6 @@ entry:
 ; Function Attrs: nounwind readnone
 declare float @sqrtf(float) local_unnamed_addr #1
 
-attributes #0 = { nounwind readonly uwtable "target-features"="+avx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone  "target-features"="+avx2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable "target-features"="+avx" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone  "target-features"="+avx2" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/vector-width-store-merge.ll b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
index 50c7b01..9363348 100644
--- a/llvm/test/CodeGen/X86/vector-width-store-merge.ll
+++ b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
@@ -85,8 +85,8 @@ entry:
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1 immarg) #1
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
 
 !0 = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/CodeGen/X86/win-cleanuppad.ll b/llvm/test/CodeGen/X86/win-cleanuppad.ll
index e9265a1..59dcccc 100644
--- a/llvm/test/CodeGen/X86/win-cleanuppad.ll
+++ b/llvm/test/CodeGen/X86/win-cleanuppad.ll
@@ -194,6 +194,6 @@ cleanup.outer:                                      ; preds = %invoke.cont.1, %c
 ; X64-NEXT: .long   .Ltmp7@IMGREL
 ; X64-NEXT: .long   -1
 
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/win32-seh-catchpad.ll b/llvm/test/CodeGen/X86/win32-seh-catchpad.ll
index 832ddca..0f51866 100644
--- a/llvm/test/CodeGen/X86/win32-seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/win32-seh-catchpad.ll
@@ -220,7 +220,7 @@ declare i32 @_except_handler3(...)
 ; Function Attrs: nounwind
 declare void @llvm.localescape(...) #2
 
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
 attributes #3 = { noinline }
diff --git a/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll b/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll
index 5b1f9b3..5095460 100644
--- a/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll
+++ b/llvm/test/CodeGen/X86/win32-seh-nested-finally.ll
@@ -34,8 +34,8 @@ declare void @f(i32) #0
 
 declare i32 @_except_handler3(...)
 
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { noinline }
 
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
index 785c260..d4d4fe3 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
@@ -272,12 +272,12 @@ declare dso_local void @llvm.seh.try.end() #2
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.eh.exceptioncode(token) #3
 
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #2 = { nounwind willreturn }
 attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #5 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #6 = { nounwind }
 attributes #7 = { noreturn }
 attributes #8 = { noinline }
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
index 6c6e9c3..b0baaac 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
@@ -240,12 +240,12 @@ declare i32 @llvm.eh.exceptioncode(token) #1
 
 declare dso_local void @"?printf@@YAXZZ"(...) #5
 
-attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #3 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #4 = { nounwind willreturn }
-attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #6 = { nounwind }
 attributes #7 = { noinline }
 
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
index 9e44299..d3da5f8 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
@@ -209,10 +209,10 @@ declare i32 @llvm.eh.exceptioncode(token) #4
 ; Function Attrs: nounwind
 declare void @llvm.localescape(...) #5
 
-attributes #0 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #2 = { nounwind willreturn }
-attributes #3 = { noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #4 = { nounwind readnone }
 attributes #5 = { nounwind }
 attributes #6 = { noinline }
diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
index cb12481..9f888f8 100644
--- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -24,7 +24,7 @@ entry:
   ret i64 %or
 }
 
-attributes #0 = { minsize nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { minsize nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 
 ; clang -Os -c test2.cpp -emit-llvm -S
@@ -63,7 +63,7 @@ entry:
   ret i64 %or
 }
 
-attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 ; clang -O2 -c test2.cpp -emit-llvm -S
 ; Verify that we do not generate shld insruction when we are not optimizing
@@ -89,7 +89,7 @@ entry:
   ret i64 %or
 }
 
-attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"ProfileSummary", !1}
diff --git a/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s b/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s
index cf8d21658..15efbee 100644
--- a/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/FP8/fmmla-diagnostics.s
@@ -16,7 +16,7 @@ fmmla v0.4s, v1.4s, v2.4s
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
 fmmla v0.8h, v1.8h, v2.8h
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction requires: f16mm
 // CHECK-NEXT: fmmla v0.8h, v1.8h, v2.8h
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
diff --git a/llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s b/llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s
new file mode 100644
index 0000000..c25ff66
--- /dev/null
+++ b/llvm/test/MC/AArch64/SME2p3/luti6-diagnostics.s
@@ -0,0 +1,191 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+luti6 z0.h, zt0, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 z0.h, zt0, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z0.s, zt0, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z0.s, zt0, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z0.d, zt0, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z0.d, zt0, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 z0.b, zt0, z1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, zt0, z1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 z0.b, zt0, z1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, zt0, z1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid vectors/mis-matched registers/invalid index
+
+luti6 { z0.h - z5.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.h - z5.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: luti6 { z0.b - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Wrong striding/registers/index
+
+luti6 { z0.h, z4.h, z8.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: registers must have the same sequential stride
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z1.h, z2.h, z3.h, z4.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types
+// CHECK-NEXT: luti6 { z1.h, z2.h, z3.h, z4.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: luti6 { z0.b, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid registers
+
+luti6 { z0.b - z5.b }, zt0, { z2 - z4 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.b - z5.b }, zt0, { z2 - z4 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z3.b }, zt0, { z1 - z1 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z1 - z1 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z5.b }, zt0, { z7 - z11 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: luti6 { z0.b - z5.b }, zt0, { z7 - z11 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z0.b - z3.b }, zt1, { z1 - z3 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid lookup table, expected zt0
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt1, { z1 - z3 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z4.b, z8.b, z12.b, z16.b}, zt0, { z2 - z5 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 { z4.b, z8.b, z12.b, z16.b}, zt0, { z2 - z5 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z17.b, z21.b, z25.b, z29.b}, zt0, { z2 - z5 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 { z17.b, z21.b, z25.b, z29.b}, zt0, { z2 - z5 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z1 - z3 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Wrong striding/registers
+
+luti6 { z1.b, z5.b, z9.b, z14.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: registers must have the same sequential stride
+// CHECK-NEXT: luti6 { z1.b, z5.b, z9.b, z14.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z1.b, z2.b, z3.b, z4.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types
+// CHECK-NEXT: luti6 { z1.b, z2.b, z3.b, z4.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z20.b, z24.b, z28.b, z32.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector register expected
+// CHECK-NEXT: luti6 { z20.b, z24.b, z28.b, z32.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 { z1.h, z5.h, z9.h, z13.h }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 { z1.h, z5.h, z9.h, z13.h }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z0 - z2 }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z0 - z2 }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SME2p3/luti6.s b/llvm/test/MC/AArch64/SME2p3/luti6.s
new file mode 100644
index 0000000..7a7872f
--- /dev/null
+++ b/llvm/test/MC/AArch64/SME2p3/luti6.s
@@ -0,0 +1,472 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sme2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p3 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (single)
+
+luti6 z0.b, zt0, z0
+// CHECK-INST: luti6 z0.b, zt0, z0
+// CHECK-ENCODING: encoding: [0x00,0x40,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c84000 <unknown>
+
+luti6 z31.b, zt0, z0
+// CHECK-INST: luti6 z31.b, zt0, z0
+// CHECK-ENCODING: encoding: [0x1f,0x40,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c8401f <unknown>
+
+luti6 z0.b, zt0, z31
+// CHECK-INST: luti6 z0.b, zt0, z31
+// CHECK-ENCODING: encoding: [0xe0,0x43,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c843e0 <unknown>
+
+luti6 z31.b, zt0, z31
+// CHECK-INST: luti6 z31.b, zt0, z31
+// CHECK-ENCODING: encoding: [0xff,0x43,0xc8,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c0c843ff <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (16-bit) - consecutive
+
+luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x00,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f400 <unknown>
+
+luti6 { z8.h - z11.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z8.h - z11.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x08,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f408 <unknown>
+
+luti6 { z20.h - z23.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z20.h - z23.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x14,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f414 <unknown>
+
+luti6 { z28.h - z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z28.h - z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x1c,0xf4,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120f41c <unknown>
+
+luti6 { z0.h - z3.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z0.h - z3.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe0,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7e0 <unknown>
+
+luti6 { z8.h - z11.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z8.h - z11.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe8,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7e8 <unknown>
+
+luti6 { z20.h - z23.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z20.h - z23.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf4,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7f4 <unknown>
+
+luti6 { z28.h - z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z28.h - z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xfc,0xf7,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ff7fc <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (16-bit) - strided
+
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z0.h, z4.h, z8.h, z12.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x00,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc00 <unknown>
+
+luti6 { z1.h, z5.h, z9.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z1.h, z5.h, z9.h, z13.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x01,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc01 <unknown>
+
+luti6 { z2.h, z6.h, z10.h, z14.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z2.h, z6.h, z10.h, z14.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x02,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc02 <unknown>
+
+luti6 { z3.h, z7.h, z11.h, z15.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z3.h, z7.h, z11.h, z15.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x03,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc03 <unknown>
+
+luti6 { z16.h, z20.h, z24.h, z28.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z16.h, z20.h, z24.h, z28.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x10,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc10 <unknown>
+
+luti6 { z17.h, z21.h, z25.h, z29.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z17.h, z21.h, z25.h, z29.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x11,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc11 <unknown>
+
+luti6 { z18.h, z22.h, z26.h, z30.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z18.h, z22.h, z26.h, z30.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x12,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc12 <unknown>
+
+luti6 { z19.h, z23.h, z27.h, z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-INST: luti6 { z19.h, z23.h, z27.h, z31.h }, { z0.h, z1.h }, { z0, z1 }[0]
+// CHECK-ENCODING: encoding: [0x13,0xfc,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c120fc13 <unknown>
+
+luti6 { z0.h, z4.h, z8.h, z12.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z0.h, z4.h, z8.h, z12.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe0,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe0 <unknown>
+
+luti6 { z1.h, z5.h, z9.h, z13.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z1.h, z5.h, z9.h, z13.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe1,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe1 <unknown>
+
+luti6 { z2.h, z6.h, z10.h, z14.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z2.h, z6.h, z10.h, z14.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe2,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe2 <unknown>
+
+luti6 { z3.h, z7.h, z11.h, z15.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z3.h, z7.h, z11.h, z15.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xe3,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17fffe3 <unknown>
+
+luti6 { z16.h, z20.h, z24.h, z28.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z16.h, z20.h, z24.h, z28.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf0,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff0 <unknown>
+
+luti6 { z17.h, z21.h, z25.h, z29.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z17.h, z21.h, z25.h, z29.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf1,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff1 <unknown>
+
+luti6 { z18.h, z22.h, z26.h, z30.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z18.h, z22.h, z26.h, z30.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf2,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff2 <unknown>
+
+luti6 { z19.h, z23.h, z27.h, z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-INST: luti6 { z19.h, z23.h, z27.h, z31.h }, { z31.h, z0.h }, { z31, z0 }[1]
+// CHECK-ENCODING: encoding: [0xf3,0xff,0x7f,0xc1]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c17ffff3 <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (8-bit) - consecutive
+
+luti6 { z8.b - z11.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x08,0x00,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0008 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x14,0x00,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0014 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x1c,0x00,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a001c <unknown>
+
+luti6 { z0.b - z3.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z0.b - z3.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x00,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0100 <unknown>
+
+luti6 { z8.b - z11.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x08,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0108 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x14,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0114 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x1c,0x01,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a011c <unknown>
+
+luti6 { z0.b - z3.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z0.b - z3.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x80,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0280 <unknown>
+
+luti6 { z8.b - z11.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x88,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0288 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x94,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0294 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x9c,0x02,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a029c <unknown>
+
+luti6 { z0.b - z3.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z0.b - z3.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x80,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0380 <unknown>
+
+luti6 { z8.b - z11.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z8.b - z11.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x88,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0388 <unknown>
+
+luti6 { z20.b - z23.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z20.b - z23.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x94,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a0394 <unknown>
+
+luti6 { z28.b - z31.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z28.b - z31.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x9c,0x03,0x8a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c08a039c <unknown>
+
+// ----------------------------------------------------------
+// Lookup table read with 6-bit indices (8-bit) - strided
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x01,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0001 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x02,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0002 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x03,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0003 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x10,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0010 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x11,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0011 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x12,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0012 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z0 - z2 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z0 - z2 }
+// CHECK-ENCODING: encoding: [0x13,0x00,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0013 <unknown>
+
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x00,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0100 <unknown>
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x01,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0101 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x02,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0102 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x03,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0103 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x10,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0110 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x11,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0111 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x12,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0112 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z2 - z4 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z2 - z4 }
+// CHECK-ENCODING: encoding: [0x13,0x01,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0113 <unknown>
+
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x80,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0280 <unknown>
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x81,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0281 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x82,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0282 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x83,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0283 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x90,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0290 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x91,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0291 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x92,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0292 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z5 - z7 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z5 - z7 }
+// CHECK-ENCODING: encoding: [0x93,0x02,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0293 <unknown>
+
+luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z0.b, z4.b, z8.b, z12.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x80,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0380 <unknown>
+
+luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z1.b, z5.b, z9.b, z13.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x81,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0381 <unknown>
+
+luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z2.b, z6.b, z10.b, z14.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x82,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0382 <unknown>
+
+luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z3.b, z7.b, z11.b, z15.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x83,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0383 <unknown>
+
+luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z16.b, z20.b, z24.b, z28.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x90,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0390 <unknown>
+
+luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z17.b, z21.b, z25.b, z29.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x91,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0391 <unknown>
+
+luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z18.b, z22.b, z26.b, z30.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x92,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0392 <unknown>
+
+luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z7 - z9 }
+// CHECK-INST: luti6 { z19.b, z23.b, z27.b, z31.b }, zt0, { z7 - z9 }
+// CHECK-ENCODING: encoding: [0x93,0x03,0x9a,0xc0]
+// CHECK-ERROR: instruction requires: sme2p3
+// CHECK-UNKNOWN: c09a0393 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s b/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
index 409c2c5..4695b05 100644
--- a/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/bfmmla-diagnostics.s
@@ -5,11 +5,6 @@ bfmmla z0.s, z1.s, z2.h
 // CHECK-NEXT: bfmmla z0.s, z1.s, z2.h
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
-bfmmla z0.h, z1.h, z2.h
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
-// CHECK-NEXT: bfmmla z0.h, z1.h, z2.h
-// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-
 bfmmla z0.s, z1.h, z2.s
 // CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
 // CHECK-NEXT: bfmmla z0.s, z1.h, z2.s
diff --git a/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s b/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s
index 344abb1..42c6ae7 100644
--- a/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE2p1/sdot-diagnostics.s
@@ -29,7 +29,7 @@ sdot z0.s, z0.h, z0.h[-1]
 // Invalid vector suffix
 
 sdot z0.h, z0.s, z0.s
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 // CHECK-NEXT: sdot z0.h, z0.s, z0.s
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
diff --git a/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s b/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s
index 0debddf..aa2ec7f 100644
--- a/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE2p1/udot-diagnostics.s
@@ -29,11 +29,11 @@ udot z0.s, z0.h, z0.h[-1]
 // Invalid vector suffix
 
 udot z0.h, z0.s, z0.s
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 // CHECK-NEXT: udot z0.h, z0.s, z0.s
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
 udot z0.h, z0.s, z0.s[1]
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 // CHECK-NEXT: udot z0.h, z0.s, z0.s[1]
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s b/llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s
new file mode 100644
index 0000000..e88fcce
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p2/fmmla-diagnostics.s
@@ -0,0 +1,15 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2,+f16mm 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+fmmla z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: fmmla z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0.d, p0/z, z6.d
+fmmla z0.h, z2.h, z3.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx
+// CHECK-NEXT: fmmla z0.h, z2.h, z3.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p2/fmmla.s b/llvm/test/MC/AArch64/SVE2p2/fmmla.s
new file mode 100644
index 0000000..19929a9
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p2/fmmla.s
@@ -0,0 +1,45 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2,+f16mm < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2,+f16mm < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2p2,+f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2,+f16mm < %s \
+// RUN:        | llvm-objdump -d --mattr=-sve2p2,-f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2,+f16mm < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sve2p2,+f16mm -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmmla z0.h, z0.h, z0.h
+// CHECK-INST: fmmla z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xe0,0xa0,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64a0e000 <unknown>
+
+fmmla z10.h, z10.h, z10.h
+// CHECK-INST: fmmla z10.h, z10.h, z10.h
+// CHECK-ENCODING: encoding: [0x4a,0xe1,0xaa,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64aae14a <unknown>
+
+fmmla z21.h, z21.h, z21.h
+// CHECK-INST: fmmla z21.h, z21.h, z21.h
+// CHECK-ENCODING: encoding: [0xb5,0xe2,0xb5,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64b5e2b5 <unknown>
+
+fmmla z31.h, z31.h, z31.h
+// CHECK-INST: fmmla z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xe3,0xbf,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64bfe3ff <unknown>
+
+movprfx z0, z7
+fmmla z0.h, z1.h, z2.h
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: fmmla z0.h, z1.h, z2.h
+// CHECK-ENCODING: encoding: [0x20,0xe0,0xa2,0x64]
+// CHECK-ERROR: instruction requires: f16mm sve2p2
+// CHECK-UNKNOWN: 64a2e020 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s
new file mode 100644
index 0000000..2c0cf07
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/arithmetic-diagnostics.s
@@ -0,0 +1,147 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Test addqp
+
+addqp z0.h, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.h, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addqp z0.s, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.s, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addqp z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addqp z0.b, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addqp z0.b, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test addsubp
+
+addsubp z0.h, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.h, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addsubp z0.s, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.s, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addsubp z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+addsubp z0.b, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: addsubp z0.b, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test sabal
+
+sabal z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sabal z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sabal z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sabal z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sabal z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test uabal
+
+uabal z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uabal z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uabal z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uabal z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uabal z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Test subp
+
+subp z0.h, p0/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.h, p0/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+subp z0.s, p0/m, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.s, p0/m, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+subp z0.d, p0/m, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.d, p0/m, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+subp z0.b, p0/m, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: subp z0.b, p0/m, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Predicate not in restricted predicate range
+
+subp z0.h, p8/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix)
+// CHECK-NEXT: subp z0.h, p8/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Operand must match destination register
+
+subp z0.b, p0/m, z1.b, z2.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must match destination register
+// CHECK-NEXT: subp z0.b, p0/m, z1.b, z2.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+addqp z0.b, z1.b, z2.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: addqp z0.b, z1.b, z2.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+addsubp z0.b, z1.b, z2.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: addsubp z0.b, z1.b, z2.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/arithmetic.s b/llvm/test/MC/AArch64/SVE2p3/arithmetic.s
new file mode 100644
index 0000000..12df18d
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/arithmetic.s
@@ -0,0 +1,275 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+addqp z0.b, z0.b, z0.b
+// CHECK-INST: addqp z0.b, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x78,0x20,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04207800 <unknown>
+
+addqp z31.b, z31.b, z31.b
+// CHECK-INST: addqp z31.b, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x7b,0x3f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 043f7bff <unknown>
+
+addqp z0.h, z0.h, z0.h
+// CHECK-INST: addqp z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x78,0x60,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04607800 <unknown>
+
+addqp z31.h, z31.h, z31.h
+// CHECK-INST: addqp z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x7b,0x7f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 047f7bff <unknown>
+
+addqp z0.s, z0.s, z0.s
+// CHECK-INST: addqp z0.s, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x78,0xa0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04a07800 <unknown>
+
+addqp z31.s, z31.s, z31.s
+// CHECK-INST: addqp z31.s, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x7b,0xbf,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04bf7bff <unknown>
+
+addqp z0.d, z0.d, z0.d
+// CHECK-INST: addqp z0.d, z0.d, z0.d
+// CHECK-ENCODING: encoding: [0x00,0x78,0xe0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04e07800 <unknown>
+
+addqp z31.d, z31.d, z31.d
+// CHECK-INST: addqp z31.d, z31.d, z31.d
+// CHECK-ENCODING: encoding: [0xff,0x7b,0xff,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04ff7bff <unknown>
+
+// --------------------------------------------------------------------------//
+// Test addsubp
+
+addsubp z0.b, z0.b, z0.b
+// CHECK-INST: addsubp z0.b, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x7c,0x20,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04207c00 <unknown>
+
+addsubp z31.b, z31.b, z31.b
+// CHECK-INST: addsubp z31.b, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x7f,0x3f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 043f7fff <unknown>
+
+addsubp z0.h, z0.h, z0.h
+// CHECK-INST: addsubp z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x7c,0x60,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04607c00 <unknown>
+
+addsubp z31.h, z31.h, z31.h
+// CHECK-INST: addsubp z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x7f,0x7f,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 047f7fff <unknown>
+
+addsubp z0.s, z0.s, z0.s
+// CHECK-INST: addsubp z0.s, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x7c,0xa0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04a07c00 <unknown>
+
+addsubp z31.s, z31.s, z31.s
+// CHECK-INST: addsubp z31.s, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x7f,0xbf,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04bf7fff <unknown>
+
+addsubp z0.d, z0.d, z0.d
+// CHECK-INST: addsubp z0.d, z0.d, z0.d
+// CHECK-ENCODING: encoding: [0x00,0x7c,0xe0,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04e07c00 <unknown>
+
+addsubp z31.d, z31.d, z31.d
+// CHECK-INST: addsubp z31.d, z31.d, z31.d
+// CHECK-ENCODING: encoding: [0xff,0x7f,0xff,0x04]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 04ff7fff <unknown>
+
+// --------------------------------------------------------------------------//
+// Test sabal
+
+sabal z0.h, z0.b, z0.b
+// CHECK-INST: sabal z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0xd4,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4440d400 <unknown>
+
+sabal z31.h, z31.b, z31.b
+// CHECK-INST: sabal z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0xd7,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445fd7ff <unknown>
+
+sabal z0.s, z0.h, z0.h
+// CHECK-INST: sabal z0.s, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xd4,0x80,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4480d400 <unknown>
+
+sabal z31.s, z31.h, z31.h
+// CHECK-INST: sabal z31.s, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xd7,0x9f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 449fd7ff <unknown>
+
+sabal z0.d, z0.s, z0.s
+// CHECK-INST: sabal z0.d, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0xd4,0xc0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44c0d400 <unknown>
+
+sabal z31.d, z31.s, z31.s
+// CHECK-INST: sabal z31.d, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0xd7,0xdf,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44dfd7ff <unknown>
+
+movprfx z0, z7
+sabal	z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: sabal z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0xd4,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4442d420 <unknown>
+
+// --------------------------------------------------------------------------//
+// Test uabal
+
+uabal z0.h, z0.b, z0.b
+// CHECK-INST: uabal z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0xdc,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4440dc00 <unknown>
+
+uabal z31.h, z31.b, z31.b
+// CHECK-INST: uabal z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0xdf,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445fdfff <unknown>
+
+uabal z0.s, z0.h, z0.h
+// CHECK-INST: uabal z0.s, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xdc,0x80,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4480dc00 <unknown>
+
+uabal z31.s, z31.h, z31.h
+// CHECK-INST: uabal z31.s, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xdf,0x9f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 449fdfff <unknown>
+
+uabal z0.d, z0.s, z0.s
+// CHECK-INST: uabal z0.d, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0xdc,0xc0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44c0dc00 <unknown>
+
+uabal z31.d, z31.s, z31.s
+// CHECK-INST: uabal z31.d, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0xdf,0xdf,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44dfdfff <unknown>
+
+movprfx z0, z7
+uabal	z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: uabal z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0xdc,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4442dc20 <unknown>
+
+// --------------------------------------------------------------------------//
+// Test subp
+
+subp z0.b, p0/m, z0.b, z0.b
+// CHECK-INST: subp z0.b, p0/m, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0xa0,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410a000 <unknown>
+
+subp z31.b, p7/m, z31.b, z31.b
+// CHECK-INST: subp z31.b, p7/m, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0xbf,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410bfff <unknown>
+
+subp z0.h, p0/m, z0.h, z0.h
+// CHECK-INST: subp z0.h, p0/m, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xa0,0x50,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4450a000 <unknown>
+
+subp z31.h, p7/m, z31.h, z31.h
+// CHECK-INST: subp z31.h, p7/m, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xbf,0x50,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4450bfff <unknown>
+
+subp z0.s, p0/m, z0.s, z0.s
+// CHECK-INST: subp z0.s, p0/m, z0.s, z0.s
+// CHECK-ENCODING: encoding: [0x00,0xa0,0x90,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4490a000 <unknown>
+
+subp z31.s, p7/m, z31.s, z31.s
+// CHECK-INST: subp z31.s, p7/m, z31.s, z31.s
+// CHECK-ENCODING: encoding: [0xff,0xbf,0x90,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4490bfff <unknown>
+
+subp z0.d, p0/m, z0.d, z0.d
+// CHECK-INST: subp z0.d, p0/m, z0.d, z0.d
+// CHECK-ENCODING: encoding: [0x00,0xa0,0xd0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44d0a000 <unknown>
+
+subp z31.d, p7/m, z31.d, z31.d
+// CHECK-INST: subp z31.d, p7/m, z31.d, z31.d
+// CHECK-ENCODING: encoding: [0xff,0xbf,0xd0,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44d0bfff <unknown>
+
+movprfx z0.b, p0/m, z7.b
+subp	z0.b, p0/m, z0.b, z1.b
+// CHECK-INST: movprfx z0.b, p0/m, z7.b
+// CHECK-INST: subp z0.b, p0/m, z0.b, z1.b
+// CHECK-ENCODING: encoding: [0x20,0xa0,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410a020 <unknown>
+
+movprfx z0, z7
+subp	z0.b, p0/m, z0.b, z1.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: subp z0.b, p0/m, z0.b, z1.b
+// CHECK-ENCODING: encoding: [0x20,0xa0,0x10,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4410a020 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s
new file mode 100644
index 0000000..28ec78d
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/bfmmla-diagnostics.s
@@ -0,0 +1,19 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve-b16mm 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+bfmmla z0.h, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.h, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfmmla z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+bfmmla z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: bfmmla z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/bfmmla.s b/llvm/test/MC/AArch64/SVE2p3/bfmmla.s
new file mode 100644
index 0000000..77440ee
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/bfmmla.s
@@ -0,0 +1,45 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve-b16mm < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve-b16mm < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve-b16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve-b16mm < %s \
+// RUN:        | llvm-objdump -d --mattr=-sve-b16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve-b16mm < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sve-b16mm -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+bfmmla z0.h, z0.h, z0.h
+// CHECK-INST: bfmmla z0.h, z0.h, z0.h
+// CHECK-ENCODING: encoding: [0x00,0xe0,0xe0,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64e0e000 <unknown>
+
+bfmmla z10.h, z10.h, z10.h
+// CHECK-INST: bfmmla z10.h, z10.h, z10.h
+// CHECK-ENCODING: encoding: [0x4a,0xe1,0xea,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64eae14a <unknown>
+
+bfmmla z21.h, z21.h, z21.h
+// CHECK-INST: bfmmla z21.h, z21.h, z21.h
+// CHECK-ENCODING: encoding: [0xb5,0xe2,0xf5,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64f5e2b5 <unknown>
+
+bfmmla z31.h, z31.h, z31.h
+// CHECK-INST: bfmmla z31.h, z31.h, z31.h
+// CHECK-ENCODING: encoding: [0xff,0xe3,0xff,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64ffe3ff <unknown>
+
+movprfx z0, z7
+bfmmla z0.h, z1.h, z2.h
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: bfmmla z0.h, z1.h, z2.h
+// CHECK-ENCODING: encoding: [0x20,0xe0,0xe2,0x64]
+// CHECK-ERROR: instruction requires: sve-b16mm
+// CHECK-UNKNOWN: 64e2e020 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s
new file mode 100644
index 0000000..68a50ab
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/cvt-diagnostics.s
@@ -0,0 +1,193 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+fcvtzsn z0.b, { z0.b, z1.b }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzsn z0.b, { z0.b, z1.b }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzsn z0.h, { z0.h, z1.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzsn z0.h, { z0.h, z1.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzsn z0.s, { z0.s, z1.s }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzsn z0.s, { z0.s, z1.s }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzsn z0.b, { z1.h, z2.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: fcvtzsn z0.b, { z1.h, z2.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+fcvtzsn z0.b, { z2.h, z3.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: fcvtzsn z0.b, { z2.h, z3.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+fcvtzun z0.b, { z0.b, z1.b }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzun z0.b, { z0.b, z1.b }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzun z0.h, { z0.h, z1.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzun z0.h, { z0.h, z1.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzun z0.s, { z0.s, z1.s }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fcvtzun z0.s, { z0.s, z1.s }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fcvtzun z0.b, { z1.h, z2.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: fcvtzun z0.b, { z1.h, z2.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+fcvtzun z0.b, { z2.h, z3.h }
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: fcvtzun z0.b, { z2.h, z3.h }
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+scvtf z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtf z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtf z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtf z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtf z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+scvtf z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: scvtf z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+scvtflt z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtflt z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtflt z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+scvtflt z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: scvtflt z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+scvtflt z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: scvtflt z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+ucvtf z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtf z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtf z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtf z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtf z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+ucvtf z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: ucvtf z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+ucvtflt z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtflt z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtflt z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ucvtflt z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: ucvtflt z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+ucvtflt z0.h, z1.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: ucvtflt z0.h, z1.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/cvt.s b/llvm/test/MC/AArch64/SVE2p3/cvt.s
new file mode 100644
index 0000000..da9c463
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/cvt.s
@@ -0,0 +1,321 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// -------------------------------------------------------------
+// Floating-point convert, narrow and interleave to signed integer, rounding toward zero
+
+fcvtzsn z0.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzsn z0.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x00,0x30,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d3000 <unknown>
+
+fcvtzsn z31.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzsn z31.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x1f,0x30,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d301f <unknown>
+
+fcvtzsn z0.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzsn z0.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xc0,0x33,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d33c0 <unknown>
+
+fcvtzsn z31.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzsn z31.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xdf,0x33,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d33df <unknown>
+
+fcvtzsn z0.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzsn z0.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x00,0x30,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d3000 <unknown>
+
+fcvtzsn z31.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzsn z31.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x1f,0x30,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d301f <unknown>
+
+fcvtzsn z0.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzsn z0.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xc0,0x33,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d33c0 <unknown>
+
+fcvtzsn z31.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzsn z31.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xdf,0x33,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d33df <unknown>
+
+fcvtzsn z0.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzsn z0.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x00,0x30,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd3000 <unknown>
+
+fcvtzsn z31.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzsn z31.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x1f,0x30,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd301f <unknown>
+
+fcvtzsn z0.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzsn z0.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xc0,0x33,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd33c0 <unknown>
+
+fcvtzsn z31.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzsn z31.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xdf,0x33,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd33df <unknown>
+
+// -------------------------------------------------------------
+// Floating-point convert, narrow and interleave to unsigned integer, rounding toward zero
+
+fcvtzun z0.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzun z0.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x00,0x34,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d3400 <unknown>
+
+fcvtzun z31.b, { z0.h, z1.h }
+// CHECK-INST: fcvtzun z31.b, { z0.h, z1.h }
+// CHECK-ENCODING: encoding: [0x1f,0x34,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d341f <unknown>
+
+fcvtzun z0.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzun z0.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xc0,0x37,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d37c0 <unknown>
+
+fcvtzun z31.b, { z30.h, z31.h }
+// CHECK-INST: fcvtzun z31.b, { z30.h, z31.h }
+// CHECK-ENCODING: encoding: [0xdf,0x37,0x4d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654d37df <unknown>
+
+fcvtzun z0.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzun z0.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x00,0x34,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d3400 <unknown>
+
+fcvtzun z31.h, { z0.s, z1.s }
+// CHECK-INST: fcvtzun z31.h, { z0.s, z1.s }
+// CHECK-ENCODING: encoding: [0x1f,0x34,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d341f <unknown>
+
+fcvtzun z0.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzun z0.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xc0,0x37,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d37c0 <unknown>
+
+fcvtzun z31.h, { z30.s, z31.s }
+// CHECK-INST: fcvtzun z31.h, { z30.s, z31.s }
+// CHECK-ENCODING: encoding: [0xdf,0x37,0x8d,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658d37df <unknown>
+
+fcvtzun z0.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzun z0.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x00,0x34,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd3400 <unknown>
+
+fcvtzun z31.s, { z0.d, z1.d }
+// CHECK-INST: fcvtzun z31.s, { z0.d, z1.d }
+// CHECK-ENCODING: encoding: [0x1f,0x34,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd341f <unknown>
+
+fcvtzun z0.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzun z0.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xc0,0x37,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd37c0 <unknown>
+
+fcvtzun z31.s, { z30.d, z31.d }
+// CHECK-INST: fcvtzun z31.s, { z30.d, z31.d }
+// CHECK-ENCODING: encoding: [0xdf,0x37,0xcd,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cd37df <unknown>
+
+// -----------------------------------------------------------------------
+// Signed integer convert to floating-point (bottom, unpredicated)
+
+scvtf z0.h, z0.b
+// CHECK-INST: scvtf z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x30,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3000 <unknown>
+
+scvtf z31.h, z31.b
+// CHECK-INST: scvtf z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x33,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c33ff <unknown>
+
+scvtf z0.s, z0.h
+// CHECK-INST: scvtf z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x30,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3000 <unknown>
+
+scvtf z31.s, z31.h
+// CHECK-INST: scvtf z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x33,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c33ff <unknown>
+
+scvtf z0.d, z0.s
+// CHECK-INST: scvtf z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x30,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3000 <unknown>
+
+scvtf z31.d, z31.s
+// CHECK-INST: scvtf z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x33,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc33ff <unknown>
+
+// -----------------------------------------------------------------------
+// Signed integer convert to floating-point (top, unpredicated)
+
+scvtflt z0.h, z0.b
+// CHECK-INST: scvtflt z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x38,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3800 <unknown>
+
+scvtflt z31.h, z31.b
+// CHECK-INST: scvtflt z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x3b,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3bff <unknown>
+
+scvtflt z0.s, z0.h
+// CHECK-INST: scvtflt z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x38,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3800 <unknown>
+
+scvtflt z31.s, z31.h
+// CHECK-INST: scvtflt z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x3b,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3bff <unknown>
+
+scvtflt z0.d, z0.s
+// CHECK-INST: scvtflt z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x38,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3800 <unknown>
+
+scvtflt z31.d, z31.s
+// CHECK-INST: scvtflt z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x3b,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3bff <unknown>
+
+// -----------------------------------------------------------------------
+// Unsigned integer convert to floating-point (bottom, unpredicated)
+
+ucvtf z0.h, z0.b
+// CHECK-INST: ucvtf z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x34,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3400 <unknown>
+
+ucvtf z31.h, z31.b
+// CHECK-INST: ucvtf z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x37,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c37ff <unknown>
+
+ucvtf z0.s, z0.h
+// CHECK-INST: ucvtf z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x34,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3400 <unknown>
+
+ucvtf z31.s, z31.h
+// CHECK-INST: ucvtf z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x37,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c37ff <unknown>
+
+ucvtf z0.d, z0.s
+// CHECK-INST: ucvtf z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x34,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3400 <unknown>
+
+ucvtf z31.d, z31.s
+// CHECK-INST: ucvtf z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x37,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc37ff <unknown>
+
+// -----------------------------------------------------------------------
+// Unsigned integer convert to floating-point (top, unpredicated)
+
+ucvtflt z0.h, z0.b
+// CHECK-INST: ucvtflt z0.h, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x3c,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3c00 <unknown>
+
+ucvtflt z31.h, z31.b
+// CHECK-INST: ucvtflt z31.h, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x3f,0x4c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 654c3fff <unknown>
+
+ucvtflt z0.s, z0.h
+// CHECK-INST: ucvtflt z0.s, z0.h
+// CHECK-ENCODING: encoding: [0x00,0x3c,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3c00 <unknown>
+
+ucvtflt z31.s, z31.h
+// CHECK-INST: ucvtflt z31.s, z31.h
+// CHECK-ENCODING: encoding: [0xff,0x3f,0x8c,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 658c3fff <unknown>
+
+ucvtflt z0.d, z0.s
+// CHECK-INST: ucvtflt z0.d, z0.s
+// CHECK-ENCODING: encoding: [0x00,0x3c,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3c00 <unknown>
+
+ucvtflt z31.d, z31.s
+// CHECK-INST: ucvtflt z31.d, z31.s
+// CHECK-ENCODING: encoding: [0xff,0x3f,0xcc,0x65]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 65cc3fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s
new file mode 100644
index 0000000..0a12cf8
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/directive-arch-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.arch armv9-a+sve2p3
+.arch armv9-a+nosve2p3
+addqp z0.b, z0.b, z0.b
+// CHECK: error: instruction requires: sme2p3 or sve2p3
+// CHECK-NEXT: addqp z0.b, z0.b, z0.b
diff --git a/llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s
new file mode 100644
index 0000000..1af6245
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/directive-arch_extension-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.arch_extension sve2p3
+.arch_extension nosve2p3
+addqp z0.b, z0.b, z0.b
+// CHECK: error: instruction requires: sme2p3 or sve2p3
+// CHECK-NEXT: addqp z0.b, z0.b, z0.b
diff --git a/llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s
new file mode 100644
index 0000000..f3dac04
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/directive-cpu-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.cpu generic+sve2p3
+.cpu generic+nosve2p3
+addqp z0.b, z0.b, z0.b
+// CHECK: error: instruction requires: sme2p3 or sve2p3
+// CHECK-NEXT: addqp z0.b, z0.b, z0.b
diff --git a/llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s
new file mode 100644
index 0000000..3eb3792
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/dot-diagnostics.s
@@ -0,0 +1,137 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sdot z0.b, z0.b, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.b, z0.b, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.h, z0.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.h, z0.h, z0.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.s, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.s, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.d, z0.d[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.d, z0.d[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sdot z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.b, z0.b, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.b, z0.b, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.h, z0.h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.h, z0.h, z0.h[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.s, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.s, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.d, z0.d[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.d, z0.d[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.s, z0.s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.s, z0.s[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.b, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.b, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.h, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.h, z0.h, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.s, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.s, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.d, z0.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.d, z0.d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.d, z0.s, z0.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: udot z0.d, z0.s, z0.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid register range and index
+
+sdot z0.h, z0.b, z8.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: sdot z0.h, z0.b, z8.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.b, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: sdot z0.h, z0.b, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sdot z0.h, z0.b, z0.b[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: sdot z0.h, z0.b, z0.b[8]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.b, z8.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: udot z0.h, z0.b, z8.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.b, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: udot z0.h, z0.b, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+udot z0.h, z0.b, z0.b[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: udot z0.h, z0.b, z0.b[8]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/dot.s b/llvm/test/MC/AArch64/SVE2p3/dot.s
new file mode 100644
index 0000000..01021b38
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/dot.s
@@ -0,0 +1,173 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+sdot z0.h, z0.b, z0.b
+// CHECK-INST: sdot z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x00,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44400000 <unknown>
+
+sdot z10.h, z10.b, z10.b
+// CHECK-INST: sdot z10.h, z10.b, z10.b
+// CHECK-ENCODING: encoding: [0x4a,0x01,0x4a,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 444a014a <unknown>
+
+sdot z21.h, z21.b, z21.b
+// CHECK-INST: sdot z21.h, z21.b, z21.b
+// CHECK-ENCODING: encoding: [0xb5,0x02,0x55,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445502b5 <unknown>
+
+sdot z31.h, z31.b, z31.b
+// CHECK-INST: sdot z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x03,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445f03ff <unknown>
+
+movprfx z0, z7
+sdot z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: sdot z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0x00,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44420020 <unknown>
+
+// sdot indexed
+
+sdot z0.h, z0.b, z0.b[0]
+// CHECK-INST: sdot z0.h, z0.b, z0.b[0]
+// CHECK-ENCODING: encoding: [0x00,0x00,0x20,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44200000 <unknown>
+
+sdot z31.h, z31.b, z7.b[0]
+// CHECK-INST: sdot z31.h, z31.b, z7.b[0]
+// CHECK-ENCODING: encoding: [0xff,0x03,0x27,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442703ff <unknown>
+
+sdot z0.h, z0.b, z0.b[1]
+// CHECK-INST: sdot z0.h, z0.b, z0.b[1]
+// CHECK-ENCODING: encoding: [0x00,0x00,0x28,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44280000 <unknown>
+
+sdot z31.h, z31.b, z7.b[1]
+// CHECK-INST: sdot z31.h, z31.b, z7.b[1]
+// CHECK-ENCODING: encoding: [0xff,0x03,0x2f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442f03ff <unknown>
+
+sdot z0.h, z0.b, z0.b[7]
+// CHECK-INST: sdot z0.h, z0.b, z0.b[7]
+// CHECK-ENCODING: encoding: [0x00,0x00,0x78,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44780000 <unknown>
+
+sdot z31.h, z31.b, z7.b[7]
+// CHECK-INST: sdot z31.h, z31.b, z7.b[7]
+// CHECK-ENCODING: encoding: [0xff,0x03,0x7f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 447f03ff <unknown>
+
+movprfx z0, z7
+sdot z0.h, z1.b, z2.b[0]
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: sdot z0.h, z1.b, z2.b[0]
+// CHECK-ENCODING: encoding: [0x20,0x00,0x22,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44220020 <unknown>
+
+// udot
+
+udot z0.h, z0.b, z0.b
+// CHECK-INST: udot z0.h, z0.b, z0.b
+// CHECK-ENCODING: encoding: [0x00,0x04,0x40,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44400400 <unknown>
+
+udot z10.h, z10.b, z10.b
+// CHECK-INST: udot z10.h, z10.b, z10.b
+// CHECK-ENCODING: encoding: [0x4a,0x05,0x4a,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 444a054a <unknown>
+
+udot z21.h, z21.b, z21.b
+// CHECK-INST: udot z21.h, z21.b, z21.b
+// CHECK-ENCODING: encoding: [0xb5,0x06,0x55,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445506b5 <unknown>
+
+udot z31.h, z31.b, z31.b
+// CHECK-INST: udot z31.h, z31.b, z31.b
+// CHECK-ENCODING: encoding: [0xff,0x07,0x5f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 445f07ff <unknown>
+
+movprfx z0, z7
+udot z0.h, z1.b, z2.b
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: udot z0.h, z1.b, z2.b
+// CHECK-ENCODING: encoding: [0x20,0x04,0x42,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44420420 <unknown>
+
+// udot indexed
+
+udot z0.h, z0.b, z0.b[0]
+// CHECK-INST: udot z0.h, z0.b, z0.b[0]
+// CHECK-ENCODING: encoding: [0x00,0x04,0x20,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44200400 <unknown>
+
+udot z31.h, z31.b, z7.b[0]
+// CHECK-INST: udot z31.h, z31.b, z7.b[0]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x27,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442707ff <unknown>
+
+udot z0.h, z0.b, z0.b[1]
+// CHECK-INST: udot z0.h, z0.b, z0.b[1]
+// CHECK-ENCODING: encoding: [0x00,0x04,0x28,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44280400 <unknown>
+
+udot z31.h, z31.b, z7.b[1]
+// CHECK-INST: udot z31.h, z31.b, z7.b[1]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x2f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 442f07ff <unknown>
+
+udot z0.h, z0.b, z0.b[7]
+// CHECK-INST: udot z0.h, z0.b, z0.b[7]
+// CHECK-ENCODING: encoding: [0x00,0x04,0x78,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44780400 <unknown>
+
+udot z31.h, z31.b, z7.b[7]
+// CHECK-INST: udot z31.h, z31.b, z7.b[7]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x7f,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 447f07ff <unknown>
+
+movprfx z0, z7
+udot z0.h, z1.b, z2.b[0]
+// CHECK-INST: movprfx z0, z7
+// CHECK-INST: udot z0.h, z1.b, z2.b[0]
+// CHECK-ENCODING: encoding: [0x20,0x04,0x22,0x44]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 44220420 <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s
new file mode 100644
index 0000000..21b4df9
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/luti6-diagnostics.s
@@ -0,0 +1,70 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+luti6 z10.h, { z0.b, z1.b }, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 z10.h, { z0.b, z1.b }, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z10.s, { z0.b, z1.b }, z0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z10.s, { z0.b, z1.b }, z0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.b, { z2.b, z3.b }, z4
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+luti6 z10.s, { z0.h, z1.h }, z0[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: luti6 z10.s, { z0.h, z1.h }, z0[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z10.b, { z0.h, z1.h }, z0[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: luti6 z10.b, { z0.h, z1.h }, z0[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid immediate range
+
+luti6 z10.h, { z0.h, z1.h }, z0[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 z10.h, { z0.h, z1.h }, z0[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+luti6 z10.h, { z0.h, z1.h }, z0[2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 1].
+// CHECK-NEXT: luti6 z10.h, { z0.h, z1.h }, z0[2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0.h, p0/m, z7.h
+luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+movprfx z0, z7
+luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z4[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/luti6.s b/llvm/test/MC/AArch64/SVE2p3/luti6.s
new file mode 100644
index 0000000..848091c
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/luti6.s
@@ -0,0 +1,115 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// ---------------------------------------------------------------
+// Lookup table read with 6-bit indices (8-bit)
+
+luti6 z0.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z0.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x00,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac00 <unknown>
+
+luti6 z10.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z10.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x0a,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac0a <unknown>
+
+luti6 z21.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z21.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x15,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac15 <unknown>
+
+luti6 z31.b, { z0.b, z1.b }, z0
+// CHECK-INST: luti6 z31.b, { z0.b, z1.b }, z0
+// CHECK-ENCODING: encoding: [0x1f,0xac,0x20,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 4520ac1f <unknown>
+
+luti6 z0.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z0.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xe0,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453fafe0 <unknown>
+
+luti6 z10.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z10.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xea,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453fafea <unknown>
+
+luti6 z21.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z21.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xf5,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453faff5 <unknown>
+
+luti6 z31.b, { z31.b, z0.b }, z31
+// CHECK-INST: luti6 z31.b, { z31.b, z0.b }, z31
+// CHECK-ENCODING: encoding: [0xff,0xaf,0x3f,0x45]
+// CHECK-ERROR: instruction requires: sve2p3
+// CHECK-UNKNOWN: 453fafff <unknown>
+
+// ---------------------------------------------------------------
+// Lookup table read with 6-bit indices (16-bit)
+
+luti6 z0.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z0.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x00,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac00 <unknown>
+
+luti6 z10.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z10.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x0a,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac0a <unknown>
+
+luti6 z21.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z21.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x15,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac15 <unknown>
+
+luti6 z31.h, { z0.h, z1.h }, z0[0]
+// CHECK-INST: luti6 z31.h, { z0.h, z1.h }, z0[0]
+// CHECK-ENCODING: encoding: [0x1f,0xac,0x60,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 4560ac1f <unknown>
+
+luti6 z0.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z0.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xe0,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffafe0 <unknown>
+
+luti6 z10.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z10.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xea,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffafea <unknown>
+
+luti6 z21.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z21.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xf5,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffaff5 <unknown>
+
+luti6 z31.h, { z31.h, z0.h }, z31[1]
+// CHECK-INST: luti6 z31.h, { z31.h, z0.h }, z31[1]
+// CHECK-ENCODING: encoding: [0xff,0xaf,0xff,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ffafff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s b/llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s
new file mode 100644
index 0000000..a4dbae2
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/qshrn-diagnostics.s
@@ -0,0 +1,343 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+sqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: sqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Intermediate out of range
+
+sqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqrshrn z0.b, { z2.h, z3.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqrshrn z0.b, { z2.h, z3.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqrshrun z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrun z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrun z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqrshrun z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+sqrshrun z10.h, { z0.b, z1.b }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: sqrshrun z10.h, { z0.b, z1.b }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Intermediate out of range
+
+sqrshrun z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrun z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrun z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqrshrun z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqrshrun z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqrshrun z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqrshrun z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqrshrun z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Intermediate out of range
+
+sqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+sqshrun z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrun z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: sqshrun z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+sqshrun z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: sqshrun z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Intermediate out of range
+
+sqshrun z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrun z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: sqshrun z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.h, { z0.s, z1.s }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrun z10.h, { z0.s, z1.s }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.h, { z0.s, z1.s }, #17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: sqshrun z10.h, { z0.s, z1.s }, #17
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+sqshrun z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: sqshrun z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+uqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqrshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqrshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+uqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: uqrshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid operand for instruction
+
+uqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: uqrshrn z10.h, { z0.b, z1.b }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Intermediate out of range
+
+uqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqrshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqrshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: uqrshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid element width
+
+uqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqshrn z10.s, { z0.s, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: uqshrn z10.d, { z0.d, z1.d }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Mismatched register size suffix
+
+uqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+// CHECK-NEXT: uqshrn z0.b, { z0.h, z1.s }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Intermediate out of range
+
+uqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqshrn z10.b, { z0.h, z1.h }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 8].
+// CHECK-NEXT: uqshrn z10.b, { z0.h, z1.h }, #9
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: uqshrn z0.h, { z0.s, z1.s }, #0
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [1, 16].
+// CHECK-NEXT: uqshrn z0.h, { z0.s, z1.s }, #17
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+uqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: uqshrn z10.b, { z1.h, z2.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z0, z7
+uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE2p3/qshrn.s b/llvm/test/MC/AArch64/SVE2p3/qshrn.s
new file mode 100644
index 0000000..31c87cf
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE2p3/qshrn.s
@@ -0,0 +1,255 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p3 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sve2p3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p3 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sve2p3 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// -----------------------------------------------------------------
+// Signed saturating rounding shift right narrow by immediate and interleave
+
+sqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x28,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af2800 <unknown>
+
+sqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x2b,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af2bdf <unknown>
+
+sqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x28,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a82800 <unknown>
+
+sqrshrn z31.b, { z30.h, z31.h }, #3
+// CHECK-INST: sqrshrn z31.b, { z30.h, z31.h }, #3
+// CHECK-ENCODING: encoding: [0xdf,0x2b,0xad,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45ad2bdf <unknown>
+
+// -----------------------------------------------------------------
+// Signed saturating rounding shift right unsigned narrow by immediate and interleave
+
+sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqrshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x08,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af0800 <unknown>
+
+sqrshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqrshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x0b,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af0bdf <unknown>
+
+sqrshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqrshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x08,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a80800 <unknown>
+
+sqrshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: sqrshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x0b,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a80bdf <unknown>
+
+// -----------------------------------------------------------------
+// Signed saturating shift right narrow by immediate and interleave
+
+sqshrn z21.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqshrn z21.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xd5,0x03,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af03d5 <unknown>
+
+sqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af03df <unknown>
+
+sqshrn z10.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqshrn z10.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x0a,0x00,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a8000a <unknown>
+
+sqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: sqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a803df <unknown>
+
+sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x00,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af0000 <unknown>
+
+sqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-INST: sqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-ENCODING: encoding: [0x00,0x00,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf0000 <unknown>
+
+sqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-INST: sqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf03df <unknown>
+
+sqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-INST: sqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-ENCODING: encoding: [0x00,0x00,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b00000 <unknown>
+
+sqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-INST: sqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-ENCODING: encoding: [0xdf,0x03,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b003df <unknown>
+
+// -----------------------------------------------------------------
+// Signed saturating shift right unsigned narrow by immediate and interleave
+
+sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: sqshrun z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x20,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af2000 <unknown>
+
+sqshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: sqshrun z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af23df <unknown>
+
+sqshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: sqshrun z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x20,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a82000 <unknown>
+
+sqshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: sqshrun z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a823df <unknown>
+
+sqshrun z0.h, { z0.s, z1.s }, #1
+// CHECK-INST: sqshrun z0.h, { z0.s, z1.s }, #1
+// CHECK-ENCODING: encoding: [0x00,0x20,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf2000 <unknown>
+
+sqshrun z31.h, { z30.s, z31.s }, #1
+// CHECK-INST: sqshrun z31.h, { z30.s, z31.s }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf23df <unknown>
+
+sqshrun z0.h, { z0.s, z1.s }, #16
+// CHECK-INST: sqshrun z0.h, { z0.s, z1.s }, #16
+// CHECK-ENCODING: encoding: [0x00,0x20,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b02000 <unknown>
+
+sqshrun z31.h, { z30.s, z31.s }, #16
+// CHECK-INST: sqshrun z31.h, { z30.s, z31.s }, #16
+// CHECK-ENCODING: encoding: [0xdf,0x23,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b023df <unknown>
+
+// -----------------------------------------------------------------
+// Unsigned saturating rounding shift right narrow by immediate and interleave
+
+uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: uqrshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x38,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af3800 <unknown>
+
+uqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: uqrshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x3b,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af3bdf <unknown>
+
+uqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: uqrshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x38,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a83800 <unknown>
+
+uqrshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: uqrshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x3b,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a83bdf <unknown>
+
+// -----------------------------------------------------------------
+// Unsigned saturating shift right narrow by immediate and interleave
+
+uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-INST: uqshrn z0.b, { z0.h, z1.h }, #1
+// CHECK-ENCODING: encoding: [0x00,0x10,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af1000 <unknown>
+
+uqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-INST: uqshrn z31.b, { z30.h, z31.h }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xaf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45af13df <unknown>
+
+uqshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-INST: uqshrn z0.b, { z0.h, z1.h }, #8
+// CHECK-ENCODING: encoding: [0x00,0x10,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a81000 <unknown>
+
+uqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-INST: uqshrn z31.b, { z30.h, z31.h }, #8
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xa8,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45a813df <unknown>
+
+uqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-INST: uqshrn z0.h, { z0.s, z1.s }, #1
+// CHECK-ENCODING: encoding: [0x00,0x10,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf1000 <unknown>
+
+uqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-INST: uqshrn z31.h, { z30.s, z31.s }, #1
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xbf,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45bf13df <unknown>
+
+uqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-INST: uqshrn z0.h, { z0.s, z1.s }, #16
+// CHECK-ENCODING: encoding: [0x00,0x10,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b01000 <unknown>
+
+uqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-INST: uqshrn z31.h, { z30.s, z31.s }, #16
+// CHECK-ENCODING: encoding: [0xdf,0x13,0xb0,0x45]
+// CHECK-ERROR: instruction requires: sme2p3 or sve2p3
+// CHECK-UNKNOWN: 45b013df <unknown>
diff --git a/llvm/test/MC/AArch64/armv8.4a-mpam.s b/llvm/test/MC/AArch64/armv8.4a-mpam.s
index 14787e6..7469227 100644
--- a/llvm/test/MC/AArch64/armv8.4a-mpam.s
+++ b/llvm/test/MC/AArch64/armv8.4a-mpam.s
@@ -1,6 +1,4 @@
 // RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.4a < %s 2> %t | FileCheck %s --check-prefix=CHECK
-// RUN: FileCheck --check-prefix=CHECK-RO < %t %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-v8.4a < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
 
 //------------------------------------------------------------------------------
 // ARMV8.4-A MPAM Extensions
@@ -56,9 +54,6 @@ mrs x0, MPAMIDR_EL1
 //CHECK:  msr MPAMVPM6_EL2, x0        // encoding: [0xc0,0xa6,0x1c,0xd5]
 //CHECK:  msr MPAMVPM7_EL2, x0        // encoding: [0xe0,0xa6,0x1c,0xd5]
 
-//CHECK-RO: error: expected writable system register or pstate
-//CHECK-RO: msr MPAMIDR_EL1, x0
-//CHECK-RO:     ^
 
 //CHECK:  mrs x0, MPAM0_EL1           // encoding: [0x20,0xa5,0x38,0xd5]
 //CHECK:  mrs x0, MPAM1_EL1           // encoding: [0x00,0xa5,0x38,0xd5]
@@ -77,100 +72,4 @@ mrs x0, MPAMIDR_EL1
 //CHECK:  mrs x0, MPAMVPM7_EL2        // encoding: [0xe0,0xa6,0x3c,0xd5]
 //CHECK:  mrs x0, MPAMIDR_EL1         // encoding: [0x80,0xa4,0x38,0xd5]
 
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM0_EL1, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM1_EL1, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM2_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM3_EL3, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAM1_EL12, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMHCR_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPMV_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM0_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM1_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM2_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM3_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM4_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM5_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM6_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMVPM7_EL2, x0
-//CHECK-ERROR:     ^
-//CHECK-ERROR: error: expected writable system register or pstate
-//CHECK-ERROR: msr MPAMIDR_EL1, x0
-//CHECK-ERROR:     ^
 
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM0_EL1
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM1_EL1
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM2_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM3_EL3
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAM1_EL12
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMHCR_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPMV_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM0_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM1_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM2_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM3_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM4_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM5_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM6_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMVPM7_EL2
-//CHECK-ERROR:         ^
-//CHECK-ERROR: error: expected readable system register
-//CHECK-ERROR: mrs x0, MPAMIDR_EL1
-//CHECK-ERROR:         ^
diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s
new file mode 100644
index 0000000..cffee7d
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s
@@ -0,0 +1,18 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+gcie -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+
+//------------------------------------------------------------------------------
+// FEAT_GCIE instructions
+//------------------------------------------------------------------------------
+
+gsb
+// CHECK-ERROR: error: invalid operand for GSB instruction
+
+gicr
+// CHECK-ERROR: error: expected register operand
+
+gicr x3, foo
+// CHECK-ERROR: error: invalid operand for GICR instruction
+
+gic cdaff
+// CHECK-ERROR: error: specified gic op requires a register
diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie.s b/llvm/test/MC/AArch64/armv9.7a-gcie.s
new file mode 100644
index 0000000..4fd5d25
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-gcie.s
@@ -0,0 +1,985 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+gcie < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+gcie < %s \
+// RUN:        | llvm-objdump -d --mattr=+gcie --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+gcie < %s \
+// RUN:        | llvm-objdump -d --mattr=-gcie --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+gcie < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+gcie -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_GCIE Extensions
+//------------------------------------------------------------------------------
+
+// CPU Interface Registers MRS Instruction - Encodings Checked
+MRS x3, ICC_APR_EL1
+// CHECK-INST:    mrs x3, ICC_APR_EL1
+// CHECK-ENCODING: [0x03,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c003
+
+MRS x3, ICC_APR_EL3
+// CHECK-INST:    mrs x3, ICC_APR_EL3
+// CHECK-ENCODING: [0x03,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec803
+
+MRS x3, ICC_CR0_EL1
+// CHECK-INST:    mrs x3, ICC_CR0_EL1
+// CHECK-ENCODING: [0x23,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c023
+
+MRS x3, ICC_CR0_EL3
+// CHECK-INST:    mrs x3, ICC_CR0_EL3
+// CHECK-ENCODING: [0x03,0xc9,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec903
+
+MRS x3, ICC_DOMHPPIR_EL3
+// CHECK-INST:    mrs x3, ICC_DOMHPPIR_EL3
+// CHECK-ENCODING: [0x43,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec843
+
+MRS x3, ICC_HAPR_EL1
+// CHECK-INST:    mrs x3, ICC_HAPR_EL1
+// CHECK-ENCODING: [0x63,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c063
+
+MRS x3, ICC_HPPIR_EL1
+// CHECK-INST:    mrs x3, ICC_HPPIR_EL1
+// CHECK-ENCODING: [0x63,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca63
+
+MRS x3, ICC_HPPIR_EL3
+// CHECK-INST:    mrs x3, ICC_HPPIR_EL3
+// CHECK-ENCODING: [0x23,0xc9,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec923
+
+MRS x3, ICC_IAFFIDR_EL1
+// CHECK-INST:    mrs x3, ICC_IAFFIDR_EL1
+// CHECK-ENCODING: [0xa3,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538caa3
+
+MRS x3, ICC_ICSR_EL1
+// CHECK-INST:    mrs x3, ICC_ICSR_EL1
+// CHECK-ENCODING: [0x83,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca83
+
+MRS x3, ICC_IDR0_EL1
+// CHECK-INST:    mrs x3, ICC_IDR0_EL1
+// CHECK-ENCODING: [0x43,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca43
+
+MRS x3, ICC_PCR_EL1
+// CHECK-INST:    mrs x3, ICC_PCR_EL1
+// CHECK-ENCODING: [0x43,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c043
+
+MRS x3, ICC_PCR_EL3
+// CHECK-INST:    mrs x3, ICC_PCR_EL3
+// CHECK-ENCODING: [0x23,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec823
+
+MRS x3, ICC_SRE_EL1
+// CHECK-INST:    mrs x3, ICC_SRE_EL1
+// CHECK-ENCODING: [0xa3,0xcc,0x38,0xd5]
+// CHECK-UNKNOWN: d538cca3
+
+// -----------------------------------------------
+MSR ICC_APR_EL1, x3
+// CHECK-INST:    msr ICC_APR_EL1, x3
+// CHECK-ENCODING: [0x03,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c003
+
+MSR ICC_APR_EL3, x3
+// CHECK-INST:    msr ICC_APR_EL3, x3
+// CHECK-ENCODING: [0x03,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec803
+
+MSR ICC_CR0_EL1, x3
+// CHECK-INST:    msr ICC_CR0_EL1, x3
+// CHECK-ENCODING: [0x23,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c023
+
+MSR ICC_CR0_EL3, x3
+// CHECK-INST:    msr ICC_CR0_EL3, x3
+// CHECK-ENCODING: [0x03,0xc9,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec903
+
+MSR ICC_ICSR_EL1, x3
+// CHECK-INST:    msr ICC_ICSR_EL1, x3
+// CHECK-ENCODING: [0x83,0xca,0x18,0xd5]
+// CHECK-UNKNOWN: d518ca83
+
+MSR ICC_PCR_EL1, x3
+// CHECK-INST:    msr ICC_PCR_EL1, x3
+// CHECK-ENCODING: [0x43,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c043
+
+MSR ICC_PCR_EL3, x3
+// CHECK-INST:    msr ICC_PCR_EL3, x3
+// CHECK-ENCODING: [0x23,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec823
+
+
+// -----------------------------------------------
+// Virtual CPU Registers MRS Instructions
+
+// The specification says:
+//   "Each ICC_system register that is accessible at EL1 and higher and whose state
+//    is specific to the Virtual Interrupt Domain, has a corresponding virtual
+//    ICV_register. The ICV_registers are accessed using the same system register
+//    encodings as their ICC_counterparts."
+//
+// So expect ICC_* encodings here, not ICV_* encodings
+
+MRS x3, ICV_APR_EL1
+// CHECK-INST:    mrs x3, ICC_APR_EL1
+// CHECK-ENCODING: [0x03,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c003
+
+MRS x3, ICV_CR0_EL1
+// CHECK-INST:    mrs x3, ICC_CR0_EL1
+// CHECK-ENCODING: [0x23,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c023
+
+MRS x3, ICV_HAPR_EL1
+// CHECK-INST:    mrs x3, ICC_HAPR_EL1
+// CHECK-ENCODING: [0x63,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c063
+
+MRS x3, ICV_HPPIR_EL1
+// CHECK-INST:    mrs x3, ICC_HPPIR_EL1
+// CHECK-ENCODING: [0x63,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca63
+
+MRS x3, ICV_PCR_EL1
+// CHECK-INST:    mrs x3, ICC_PCR_EL1
+// CHECK-ENCODING: [0x43,0xc0,0x39,0xd5]
+// CHECK-UNKNOWN: d539c043
+
+
+// -----------------------------------------------
+// Likewise here, expect ICC_* encodings here, not ICV_* encodings
+MSR ICV_APR_EL1, x3
+// CHECK-INST:    msr ICC_APR_EL1, x3
+// CHECK-ENCODING: [0x03,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c003
+
+MSR ICV_CR0_EL1, x3
+// CHECK-INST:    msr ICC_CR0_EL1, x3
+// CHECK-ENCODING: [0x23,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c023
+
+MSR ICV_PCR_EL1, x3
+// CHECK-INST:    msr ICC_PCR_EL1, x3
+// CHECK-ENCODING: [0x43,0xc0,0x19,0xd5]
+// CHECK-UNKNOWN: d519c043
+
+// -----------------------------------------------
+// PPI Registers MRS Instructions - Encodings Checked
+MRS x3, ICC_PPI_CACTIVER0_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_CACTIVER0_EL1
+// CHECK-ENCODING: [0x03,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd03
+
+MRS x3, ICC_PPI_CACTIVER1_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_CACTIVER1_EL1
+// CHECK-ENCODING: [0x23,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd23
+
+MRS x3, ICC_PPI_CPENDR0_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_CPENDR0_EL1
+// CHECK-ENCODING: [0x83,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd83
+
+MRS x3, ICC_PPI_CPENDR1_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_CPENDR1_EL1
+// CHECK-ENCODING: [0xa3,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cda3
+
+MRS x3, ICC_PPI_DOMAINR0_EL3
+// CHECK-INST:    mrs x3, ICC_PPI_DOMAINR0_EL3
+// CHECK-ENCODING: [0x83,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec883
+
+MRS x3, ICC_PPI_DOMAINR1_EL3
+// CHECK-INST:    mrs x3, ICC_PPI_DOMAINR1_EL3
+// CHECK-ENCODING: [0xa3,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec8a3
+
+MRS x3, ICC_PPI_DOMAINR2_EL3
+// CHECK-INST:    mrs x3, ICC_PPI_DOMAINR2_EL3
+// CHECK-ENCODING: [0xc3,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec8c3
+
+MRS x3, ICC_PPI_DOMAINR3_EL3
+// CHECK-INST:    mrs x3, ICC_PPI_DOMAINR3_EL3
+// CHECK-ENCODING: [0xe3,0xc8,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ec8e3
+
+MRS x3, ICC_PPI_ENABLER0_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_ENABLER0_EL1
+// CHECK-ENCODING: [0xc3,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538cac3
+
+MRS x3, ICC_PPI_ENABLER1_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_ENABLER1_EL1
+// CHECK-ENCODING: [0xe3,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538cae3
+
+MRS x3, ICC_PPI_PRIORITYR0_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR0_EL1
+// CHECK-ENCODING: [0x03,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce03
+
+MRS x3, ICC_PPI_PRIORITYR1_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR1_EL1
+// CHECK-ENCODING: [0x23,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce23
+
+MRS x3, ICC_PPI_PRIORITYR2_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR2_EL1
+// CHECK-ENCODING: [0x43,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce43
+
+MRS x3, ICC_PPI_PRIORITYR3_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR3_EL1
+// CHECK-ENCODING: [0x63,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce63
+
+MRS x3, ICC_PPI_PRIORITYR4_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR4_EL1
+// CHECK-ENCODING: [0x83,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538ce83
+
+MRS x3, ICC_PPI_PRIORITYR5_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR5_EL1
+// CHECK-ENCODING: [0xa3,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538cea3
+
+MRS x3, ICC_PPI_PRIORITYR6_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR6_EL1
+// CHECK-ENCODING: [0xc3,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538cec3
+
+MRS x3, ICC_PPI_PRIORITYR7_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR7_EL1
+// CHECK-ENCODING: [0xe3,0xce,0x38,0xd5]
+// CHECK-UNKNOWN: d538cee3
+
+MRS x3, ICC_PPI_PRIORITYR8_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR8_EL1
+// CHECK-ENCODING: [0x03,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf03
+
+MRS x3, ICC_PPI_PRIORITYR9_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR9_EL1
+// CHECK-ENCODING: [0x23,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf23
+
+MRS x3, ICC_PPI_PRIORITYR10_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR10_EL1
+// CHECK-ENCODING: [0x43,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf43
+
+MRS x3, ICC_PPI_PRIORITYR11_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR11_EL1
+// CHECK-ENCODING: [0x63,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf63
+
+MRS x3, ICC_PPI_PRIORITYR12_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR12_EL1
+// CHECK-ENCODING: [0x83,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cf83
+
+MRS x3, ICC_PPI_PRIORITYR13_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR13_EL1
+// CHECK-ENCODING: [0xa3,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cfa3
+
+MRS x3, ICC_PPI_PRIORITYR14_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR14_EL1
+// CHECK-ENCODING: [0xc3,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cfc3
+
+MRS x3, ICC_PPI_PRIORITYR15_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_PRIORITYR15_EL1
+// CHECK-ENCODING: [0xe3,0xcf,0x38,0xd5]
+// CHECK-UNKNOWN: d538cfe3
+
+MRS x3, ICC_PPI_SACTIVER0_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_SACTIVER0_EL1
+// CHECK-ENCODING: [0x43,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd43
+
+MRS x3, ICC_PPI_SACTIVER1_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_SACTIVER1_EL1
+// CHECK-ENCODING: [0x63,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cd63
+
+MRS x3, ICC_PPI_SPENDR0_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_SPENDR0_EL1
+// CHECK-ENCODING: [0xc3,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cdc3
+
+MRS x3, ICC_PPI_SPENDR1_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_SPENDR1_EL1
+// CHECK-ENCODING: [0xe3,0xcd,0x38,0xd5]
+// CHECK-UNKNOWN: d538cde3
+
+MRS x3, ICC_PPI_HMR0_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_HMR0_EL1
+// CHECK-ENCODING: [0x03,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca03
+
+MRS x3, ICC_PPI_HMR1_EL1
+// CHECK-INST:    mrs x3, ICC_PPI_HMR1_EL1
+// CHECK-ENCODING: [0x23,0xca,0x38,0xd5]
+// CHECK-UNKNOWN: d538ca23
+
+// -----------------------------------------------
+// MSR PPI Registers Instructions
+MSR ICC_PPI_CACTIVER0_EL1, x3
+// CHECK-INST:    msr ICC_PPI_CACTIVER0_EL1, x3
+// CHECK-ENCODING: [0x03,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd03
+
+MSR ICC_PPI_CACTIVER1_EL1, x3
+// CHECK-INST:    msr ICC_PPI_CACTIVER1_EL1, x3
+// CHECK-ENCODING: [0x23,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd23
+
+MSR ICC_PPI_CPENDR0_EL1, x3
+// CHECK-INST:    msr ICC_PPI_CPENDR0_EL1, x3
+// CHECK-ENCODING: [0x83,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd83
+
+MSR ICC_PPI_CPENDR1_EL1, x3
+// CHECK-INST:    msr ICC_PPI_CPENDR1_EL1, x3
+// CHECK-ENCODING: [0xa3,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cda3
+
+MSR ICC_PPI_DOMAINR0_EL3, x3
+// CHECK-INST:    msr ICC_PPI_DOMAINR0_EL3, x3
+// CHECK-ENCODING: [0x83,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec883
+
+MSR ICC_PPI_DOMAINR1_EL3, x3
+// CHECK-INST:    msr ICC_PPI_DOMAINR1_EL3, x3
+// CHECK-ENCODING: [0xa3,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec8a3
+
+MSR ICC_PPI_DOMAINR2_EL3, x3
+// CHECK-INST:    msr ICC_PPI_DOMAINR2_EL3, x3
+// CHECK-ENCODING: [0xc3,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec8c3
+
+MSR ICC_PPI_DOMAINR3_EL3, x3
+// CHECK-INST:    msr ICC_PPI_DOMAINR3_EL3, x3
+// CHECK-ENCODING: [0xe3,0xc8,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ec8e3
+
+MSR ICC_PPI_ENABLER0_EL1, x3
+// CHECK-INST:    msr ICC_PPI_ENABLER0_EL1, x3
+// CHECK-ENCODING: [0xc3,0xca,0x18,0xd5]
+// CHECK-UNKNOWN: d518cac3
+
+MSR ICC_PPI_ENABLER1_EL1, x3
+// CHECK-INST:    msr ICC_PPI_ENABLER1_EL1, x3
+// CHECK-ENCODING: [0xe3,0xca,0x18,0xd5]
+// CHECK-UNKNOWN: d518cae3
+
+MSR ICC_PPI_PRIORITYR0_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR0_EL1, x3
+// CHECK-ENCODING: [0x03,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce03
+
+MSR ICC_PPI_PRIORITYR1_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR1_EL1, x3
+// CHECK-ENCODING: [0x23,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce23
+
+MSR ICC_PPI_PRIORITYR2_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR2_EL1, x3
+// CHECK-ENCODING: [0x43,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce43
+
+MSR ICC_PPI_PRIORITYR3_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR3_EL1, x3
+// CHECK-ENCODING: [0x63,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce63
+
+MSR ICC_PPI_PRIORITYR4_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR4_EL1, x3
+// CHECK-ENCODING: [0x83,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518ce83
+
+MSR ICC_PPI_PRIORITYR5_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR5_EL1, x3
+// CHECK-ENCODING: [0xa3,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518cea3
+
+MSR ICC_PPI_PRIORITYR6_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR6_EL1, x3
+// CHECK-ENCODING: [0xc3,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518cec3
+
+MSR ICC_PPI_PRIORITYR7_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR7_EL1, x3
+// CHECK-ENCODING: [0xe3,0xce,0x18,0xd5]
+// CHECK-UNKNOWN: d518cee3
+
+MSR ICC_PPI_PRIORITYR8_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR8_EL1, x3
+// CHECK-ENCODING: [0x03,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf03
+
+MSR ICC_PPI_PRIORITYR9_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR9_EL1, x3
+// CHECK-ENCODING: [0x23,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf23
+
+MSR ICC_PPI_PRIORITYR10_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR10_EL1, x3
+// CHECK-ENCODING: [0x43,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf43
+
+MSR ICC_PPI_PRIORITYR11_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR11_EL1, x3
+// CHECK-ENCODING: [0x63,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf63
+
+MSR ICC_PPI_PRIORITYR12_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR12_EL1, x3
+// CHECK-ENCODING: [0x83,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cf83
+
+MSR ICC_PPI_PRIORITYR13_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR13_EL1, x3
+// CHECK-ENCODING: [0xa3,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cfa3
+
+MSR ICC_PPI_PRIORITYR14_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR14_EL1, x3
+// CHECK-ENCODING: [0xc3,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cfc3
+
+MSR ICC_PPI_PRIORITYR15_EL1, x3
+// CHECK-INST:    msr ICC_PPI_PRIORITYR15_EL1, x3
+// CHECK-ENCODING: [0xe3,0xcf,0x18,0xd5]
+// CHECK-UNKNOWN: d518cfe3
+
+MSR ICC_PPI_SACTIVER0_EL1, x3
+// CHECK-INST:    msr ICC_PPI_SACTIVER0_EL1, x3
+// CHECK-ENCODING: [0x43,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd43
+
+MSR ICC_PPI_SACTIVER1_EL1, x3
+// CHECK-INST:    msr ICC_PPI_SACTIVER1_EL1, x3
+// CHECK-ENCODING: [0x63,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cd63
+
+MSR ICC_PPI_SPENDR0_EL1, x3
+// CHECK-INST:    msr ICC_PPI_SPENDR0_EL1, x3
+// CHECK-ENCODING: [0xc3,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cdc3
+
+MSR ICC_PPI_SPENDR1_EL1, x3
+// CHECK-INST:    msr ICC_PPI_SPENDR1_EL1, x3
+// CHECK-ENCODING: [0xe3,0xcd,0x18,0xd5]
+// CHECK-UNKNOWN: d518cde3
+
+// -----------------------------------------------
+// Hypervisor Control Register MRS Instructions
+MRS x3, ICH_APR_EL2
+// CHECK-INST:    mrs x3, ICH_APR_EL2
+// CHECK-ENCODING: [0x83,0xc8,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc883
+
+MRS x3, ICH_CONTEXTR_EL2
+// CHECK-INST:    mrs x3, ICH_CONTEXTR_EL2
+// CHECK-ENCODING: [0xc3,0xcb,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccbc3
+
+MRS x3, ICH_HFGITR_EL2
+// CHECK-INST:    mrs x3, ICH_HFGITR_EL2
+// CHECK-ENCODING: [0xe3,0xc9,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc9e3
+
+MRS x3, ICH_HFGRTR_EL2
+// CHECK-INST:    mrs x3, ICH_HFGRTR_EL2
+// CHECK-ENCODING: [0x83,0xc9,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc983
+
+MRS x3, ICH_HFGWTR_EL2
+// CHECK-INST:    mrs x3, ICH_HFGWTR_EL2
+// CHECK-ENCODING: [0xc3,0xc9,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc9c3
+
+MRS x3, ICH_HPPIR_EL2
+// CHECK-INST:    mrs x3, ICH_HPPIR_EL2
+// CHECK-ENCODING: [0xa3,0xc8,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cc8a3
+
+MRS x3, ICH_PPI_ACTIVER0_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_ACTIVER0_EL2
+// CHECK-ENCODING: [0xc3,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccac3
+
+MRS x3, ICH_PPI_ACTIVER1_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_ACTIVER1_EL2
+// CHECK-ENCODING: [0xe3,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccae3
+
+MRS x3, ICH_PPI_DVIR0_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_DVIR0_EL2
+// CHECK-ENCODING: [0x03,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca03
+
+MRS x3, ICH_PPI_DVIR1_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_DVIR1_EL2
+// CHECK-ENCODING: [0x23,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca23
+
+MRS x3, ICH_PPI_ENABLER0_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_ENABLER0_EL2
+// CHECK-ENCODING: [0x43,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca43
+
+MRS x3, ICH_PPI_ENABLER1_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_ENABLER1_EL2
+// CHECK-ENCODING: [0x63,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca63
+
+MRS x3, ICH_PPI_PENDR0_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PENDR0_EL2
+// CHECK-ENCODING: [0x83,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cca83
+
+MRS x3, ICH_PPI_PENDR1_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PENDR1_EL2
+// CHECK-ENCODING: [0xa3,0xca,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccaa3
+
+MRS x3, ICH_PPI_PRIORITYR0_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR0_EL2
+// CHECK-ENCODING: [0x03,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce03
+
+MRS x3, ICH_PPI_PRIORITYR1_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR1_EL2
+// CHECK-ENCODING: [0x23,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce23
+
+MRS x3, ICH_PPI_PRIORITYR2_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR2_EL2
+// CHECK-ENCODING: [0x43,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce43
+
+MRS x3, ICH_PPI_PRIORITYR3_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR3_EL2
+// CHECK-ENCODING: [0x63,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce63
+
+MRS x3, ICH_PPI_PRIORITYR4_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR4_EL2
+// CHECK-ENCODING: [0x83,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53cce83
+
+MRS x3, ICH_PPI_PRIORITYR5_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR5_EL2
+// CHECK-ENCODING: [0xa3,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccea3
+
+MRS x3, ICH_PPI_PRIORITYR6_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR6_EL2
+// CHECK-ENCODING: [0xc3,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccec3
+
+MRS x3, ICH_PPI_PRIORITYR7_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR7_EL2
+// CHECK-ENCODING: [0xe3,0xce,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccee3
+
+MRS x3, ICH_PPI_PRIORITYR8_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR8_EL2
+// CHECK-ENCODING: [0x03,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf03
+
+MRS x3, ICH_PPI_PRIORITYR9_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR9_EL2
+// CHECK-ENCODING: [0x23,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf23
+
+MRS x3, ICH_PPI_PRIORITYR10_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR10_EL2
+// CHECK-ENCODING: [0x43,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf43
+
+MRS x3, ICH_PPI_PRIORITYR11_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR11_EL2
+// CHECK-ENCODING: [0x63,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf63
+
+MRS x3, ICH_PPI_PRIORITYR12_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR12_EL2
+// CHECK-ENCODING: [0x83,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccf83
+
+MRS x3, ICH_PPI_PRIORITYR13_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR13_EL2
+// CHECK-ENCODING: [0xa3,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccfa3
+
+MRS x3, ICH_PPI_PRIORITYR14_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR14_EL2
+// CHECK-ENCODING: [0xc3,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccfc3
+
+MRS x3, ICH_PPI_PRIORITYR15_EL2
+// CHECK-INST:    mrs x3, ICH_PPI_PRIORITYR15_EL2
+// CHECK-ENCODING: [0xe3,0xcf,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccfe3
+
+MRS x3, ICH_VCTLR_EL2
+// CHECK-INST:    mrs x3, ICH_VCTLR_EL2
+// CHECK-ENCODING: [0x83,0xcb,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ccb83
+
+// -----------------------------------------------
+// Hypervisor Control Register MSR Instructions
+MSR ICH_APR_EL2, x3
+// CHECK-INST:    msr ICH_APR_EL2, x3
+// CHECK-ENCODING: [0x83,0xc8,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc883
+
+MSR ICH_CONTEXTR_EL2, x3
+// CHECK-INST:    msr ICH_CONTEXTR_EL2, x3
+// CHECK-ENCODING: [0xc3,0xcb,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccbc3
+
+MSR ICH_HFGITR_EL2, x3
+// CHECK-INST:    msr ICH_HFGITR_EL2, x3
+// CHECK-ENCODING: [0xe3,0xc9,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc9e3
+
+MSR ICH_HFGRTR_EL2, x3
+// CHECK-INST:    msr ICH_HFGRTR_EL2, x3
+// CHECK-ENCODING: [0x83,0xc9,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc983
+
+MSR ICH_HFGWTR_EL2, x3
+// CHECK-INST:    msr ICH_HFGWTR_EL2, x3
+// CHECK-ENCODING: [0xc3,0xc9,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cc9c3
+
+MSR ICH_PPI_ACTIVER0_EL2, x3
+// CHECK-INST:    msr ICH_PPI_ACTIVER0_EL2, x3
+// CHECK-ENCODING: [0xc3,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccac3
+
+MSR ICH_PPI_ACTIVER1_EL2, x3
+// CHECK-INST:    msr ICH_PPI_ACTIVER1_EL2, x3
+// CHECK-ENCODING: [0xe3,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccae3
+
+MSR ICH_PPI_DVIR0_EL2, x3
+// CHECK-INST:    msr ICH_PPI_DVIR0_EL2, x3
+// CHECK-ENCODING: [0x03,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca03
+
+MSR ICH_PPI_DVIR1_EL2, x3
+// CHECK-INST:    msr ICH_PPI_DVIR1_EL2, x3
+// CHECK-ENCODING: [0x23,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca23
+
+MSR ICH_PPI_ENABLER0_EL2, x3
+// CHECK-INST:    msr ICH_PPI_ENABLER0_EL2, x3
+// CHECK-ENCODING: [0x43,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca43
+
+MSR ICH_PPI_ENABLER1_EL2, x3
+// CHECK-INST:    msr ICH_PPI_ENABLER1_EL2, x3
+// CHECK-ENCODING: [0x63,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca63
+
+MSR ICH_PPI_PENDR0_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PENDR0_EL2, x3
+// CHECK-ENCODING: [0x83,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cca83
+
+MSR ICH_PPI_PENDR1_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PENDR1_EL2, x3
+// CHECK-ENCODING: [0xa3,0xca,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccaa3
+
+MSR ICH_PPI_PRIORITYR0_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR0_EL2, x3
+// CHECK-ENCODING: [0x03,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce03
+
+MSR ICH_PPI_PRIORITYR1_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR1_EL2, x3
+// CHECK-ENCODING: [0x23,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce23
+
+MSR ICH_PPI_PRIORITYR2_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR2_EL2, x3
+// CHECK-ENCODING: [0x43,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce43
+
+MSR ICH_PPI_PRIORITYR3_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR3_EL2, x3
+// CHECK-ENCODING: [0x63,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce63
+
+MSR ICH_PPI_PRIORITYR4_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR4_EL2, x3
+// CHECK-ENCODING: [0x83,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51cce83
+
+MSR ICH_PPI_PRIORITYR5_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR5_EL2, x3
+// CHECK-ENCODING: [0xa3,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccea3
+
+MSR ICH_PPI_PRIORITYR6_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR6_EL2, x3
+// CHECK-ENCODING: [0xc3,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccec3
+
+MSR ICH_PPI_PRIORITYR7_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR7_EL2, x3
+// CHECK-ENCODING: [0xe3,0xce,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccee3
+
+MSR ICH_PPI_PRIORITYR8_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR8_EL2, x3
+// CHECK-ENCODING: [0x03,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf03
+
+MSR ICH_PPI_PRIORITYR9_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR9_EL2, x3
+// CHECK-ENCODING: [0x23,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf23
+
+MSR ICH_PPI_PRIORITYR10_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR10_EL2, x3
+// CHECK-ENCODING: [0x43,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf43
+
+MSR ICH_PPI_PRIORITYR11_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR11_EL2, x3
+// CHECK-ENCODING: [0x63,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf63
+
+MSR ICH_PPI_PRIORITYR12_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR12_EL2, x3
+// CHECK-ENCODING: [0x83,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccf83
+
+MSR ICH_PPI_PRIORITYR13_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR13_EL2, x3
+// CHECK-ENCODING: [0xa3,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccfa3
+
+MSR ICH_PPI_PRIORITYR14_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR14_EL2, x3
+// CHECK-ENCODING: [0xc3,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccfc3
+
+MSR ICH_PPI_PRIORITYR15_EL2, x3
+// CHECK-INST:    msr ICH_PPI_PRIORITYR15_EL2, x3
+// CHECK-ENCODING: [0xe3,0xcf,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccfe3
+
+MSR ICH_VCTLR_EL2, x3
+// CHECK-INST:    msr ICH_VCTLR_EL2, x3
+// CHECK-ENCODING: [0x83,0xcb,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ccb83
+
+// -----------------------------------------------
+// FEAT_GCIE Instructions
+// Current Interrupt Domain
+GIC CDAFF, x3
+// CHECK-INST:    gic cdaff, x3
+// CHECK-ENCODING: [0x63,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c163 sys #0, c12, c1, #3, x3
+// CHECK-ERROR: error: GIC cdaff requires: gcie
+
+GIC CDDI, x3
+// CHECK-INST:    gic cddi, x3
+// CHECK-ENCODING: [0x03,0xc2,0x08,0xd5]
+// CHECK-UNKNOWN: d508c203 sys #0, c12, c2, #0, x3
+// CHECK-ERROR: error: GIC cddi requires: gcie
+
+GIC CDDIS, x3
+// CHECK-INST:    gic cddis, x3
+// CHECK-ENCODING: [0x03,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c103 sys #0, c12, c1, #0, x3
+// CHECK-ERROR: error: GIC cddis requires: gcie
+
+GIC CDEN, x3
+// CHECK-INST:    gic cden, x3
+// CHECK-ENCODING: [0x23,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c123 sys #0, c12, c1, #1, x3
+// CHECK-ERROR: error: GIC cden requires: gcie
+
+GIC CDEOI, x3
+// CHECK-INST:    gic cdeoi, x3
+// CHECK-ENCODING: [0xe3,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c1e3 sys #0, c12, c1, #7, x3
+// CHECK-ERROR: error: GIC cdeoi requires: gcie
+
+GIC CDHM, x3
+// CHECK-INST:    gic cdhm, x3
+// CHECK-ENCODING: [0x23,0xc2,0x08,0xd5]
+// CHECK-UNKNOWN: d508c223 sys #0, c12, c2, #1, x3
+// CHECK-ERROR: error: GIC cdhm requires: gcie
+
+GIC CDPEND, x3
+// CHECK-INST:    gic cdpend, x3
+// CHECK-ENCODING: [0x83,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c183 sys #0, c12, c1, #4, x3
+// CHECK-ERROR: error: GIC cdpend requires: gcie
+
+GIC CDPRI, x3
+// CHECK-INST:    gic cdpri, x3
+// CHECK-ENCODING: [0x43,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c143 sys #0, c12, c1, #2, x3
+// CHECK-ERROR: error: GIC cdpri requires: gcie
+
+GIC CDRCFG, x3
+// CHECK-INST:    gic cdrcfg, x3
+// CHECK-ENCODING: [0xa3,0xc1,0x08,0xd5]
+// CHECK-UNKNOWN: d508c1a3 sys #0, c12, c1, #5, x3
+// CHECK-ERROR: error: GIC cdrcfg requires: gcie
+
+GICR x3, CDIA
+// CHECK-INST:    gicr x3, cdia
+// CHECK-ENCODING: [0x03,0xc3,0x28,0xd5]
+// CHECK-UNKNOWN: d528c303 sysl x3, #0, c12, c3, #0
+// CHECK-ERROR: error: GICR cdia requires: gcie
+
+GICR x3, CDNMIA
+// CHECK-INST:    gicr x3, cdnmia
+// CHECK-ENCODING: [0x23,0xc3,0x28,0xd5]
+// CHECK-UNKNOWN: d528c323 sysl x3, #0, c12, c3, #1
+// CHECK-ERROR: error: GICR cdnmia requires: gcie
+
+// -----------------------------------------------
+// Virtual Interrupt Domain
+GIC VDAFF, x3
+// CHECK-INST:    gic vdaff, x3
+// CHECK-ENCODING: [0x63,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc163 sys #4, c12, c1, #3, x3
+// CHECK-ERROR: error: GIC vdaff requires: gcie
+
+GIC VDDI, x3
+// CHECK-INST:    gic vddi, x3
+// CHECK-ENCODING: [0x03,0xc2,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc203 sys #4, c12, c2, #0, x3
+// CHECK-ERROR: error: GIC vddi requires: gcie
+
+GIC VDDIS, x3
+// CHECK-INST:    gic vddis, x3
+// CHECK-ENCODING: [0x03,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc103 sys #4, c12, c1, #0, x3
+// CHECK-ERROR: error: GIC vddis requires: gcie
+
+GIC VDEN, x3
+// CHECK-INST:    gic vden, x3
+// CHECK-ENCODING: [0x23,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc123 sys #4, c12, c1, #1, x3
+// CHECK-ERROR: error: GIC vden requires: gcie
+
+GIC VDHM, x3
+// CHECK-INST:    gic vdhm, x3
+// CHECK-ENCODING: [0x23,0xc2,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc223 sys #4, c12, c2, #1, x3
+// CHECK-ERROR: error: GIC vdhm requires: gcie
+
+GIC VDPEND, x3
+// CHECK-INST:    gic vdpend, x3
+// CHECK-ENCODING: [0x83,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc183 sys #4, c12, c1, #4, x3
+// CHECK-ERROR: error: GIC vdpend requires: gcie
+
+GIC VDPRI, x3
+// CHECK-INST:    gic vdpri, x3
+// CHECK-ENCODING: [0x43,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc143 sys #4, c12, c1, #2, x3
+// CHECK-ERROR: error: GIC vdpri requires: gcie
+
+GIC VDRCFG, x3
+// CHECK-INST:    gic vdrcfg, x3
+// CHECK-ENCODING: [0xa3,0xc1,0x0c,0xd5]
+// CHECK-UNKNOWN: d50cc1a3 sys #4, c12, c1, #5, x3
+// CHECK-ERROR: error: GIC vdrcfg requires: gcie
+
+// -----------------------------------------------
+// Logical Interrupt Domain
+GIC LDAFF, x3
+// CHECK-INST:    gic ldaff, x3
+// CHECK-ENCODING: [0x63,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec163 sys #6, c12, c1, #3, x3
+// CHECK-ERROR: error: GIC ldaff requires: gcie
+
+GIC LDDI, x3
+// CHECK-INST:    gic lddi, x3
+// CHECK-ENCODING: [0x03,0xc2,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec203 sys #6, c12, c2, #0, x3
+// CHECK-ERROR: error: GIC lddi requires: gcie
+
+GIC LDDIS, x3
+// CHECK-INST:    gic lddis, x3
+// CHECK-ENCODING: [0x03,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec103 sys #6, c12, c1, #0, x3
+// CHECK-ERROR: error: GIC lddis requires: gcie
+
+GIC LDEN, x3
+// CHECK-INST:    gic lden, x3
+// CHECK-ENCODING: [0x23,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec123 sys #6, c12, c1, #1, x3
+// CHECK-ERROR: error: GIC lden requires: gcie
+
+GIC LDHM, x3
+// CHECK-INST:    gic ldhm, x3
+// CHECK-ENCODING: [0x23,0xc2,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec223 sys #6, c12, c2, #1, x3
+// CHECK-ERROR: error: GIC ldhm requires: gcie
+
+GIC LDPEND, x3
+// CHECK-INST:    gic ldpend, x3
+// CHECK-ENCODING: [0x83,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec183 sys #6, c12, c1, #4, x3
+// CHECK-ERROR: error: GIC ldpend requires: gcie
+
+GIC LDPRI, x3
+// CHECK-INST:    gic ldpri, x3
+// CHECK-ENCODING: [0x43,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec143 sys #6, c12, c1, #2, x3
+// CHECK-ERROR: error: GIC ldpri requires: gcie
+
+GIC LDRCFG, x3
+// CHECK-INST:    gic ldrcfg, x3
+// CHECK-ENCODING: [0xa3,0xc1,0x0e,0xd5]
+// CHECK-UNKNOWN: d50ec1a3 sys #6, c12, c1, #5, x3
+// CHECK-ERROR: error: GIC ldrcfg requires: gcie
+
+// -----------------------------------------------
+// GIC Synchronization Barrier Instructions
+GSB SYS
+// CHECK-INST:    gsb sys
+// CHECK-ENCODING: [0x1f,0xc0,0x08,0xd5]
+// CHECK-UNKNOWN: d508c01f sys #0, c12, c0, #0
+// CHECK-ERROR: error: GSB sys requires: gcie
+
+GSB ACK
+// CHECK-INST:    gsb ack
+// CHECK-ENCODING: [0x3f,0xc0,0x08,0xd5]
+// CHECK-UNKNOWN: d508c03f sys #0, c12, c0, #1
+// CHECK-ERROR: error: GSB ack requires: gcie
diff --git a/llvm/test/MC/AArch64/armv9.7a-memsys.s b/llvm/test/MC/AArch64/armv9.7a-memsys.s
new file mode 100644
index 0000000..228c71e
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-memsys.s
@@ -0,0 +1,140 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+cmh,+lscp < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+cmh,+lscp < %s \
+// RUN:        | llvm-objdump -d --mattr=+cmh,+lscp --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+cmh,+lscp < %s \
+// RUN:        | llvm-objdump -d --mattr=-cmh,-lscp --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+cmh,+lscp < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+cmh,+lscp -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// Armv9.7-A Contention Management Hints (FEAT_CMH).
+
+shuh
+// CHECK-INST: shuh
+// CHECK-ENCODING: encoding: [0x5f,0x26,0x03,0xd5]
+// CHECK-ERROR: error: instruction requires: cmh
+// CHECK-UNKNOWN: d503265f hint    #50
+
+shuh ph
+// CHECK-INST: shuh  ph
+// CHECK-ENCODING: encoding: [0x7f,0x26,0x03,0xd5]
+// CHECK-ERROR: error: instruction requires: cmh
+// CHECK-UNKNOWN: d503267f hint    #51
+
+stcph
+// CHECK-INST: stcph
+// CHECK-ENCODING: [0x9f,0x26,0x03,0xd5]
+// CHECK-ERROR: error: instruction requires: cmh
+// CHECK-UNKNOWN: d503269f hint    #52
+
+ldap x0, x1, [x2]
+// CHECK-INST: ldap    x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9415840 <unknown>
+
+ldap x0, x1, [x2, #0]
+// CHECK-INST: ldap    x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9415840 <unknown>
+
+ldapp x0, x1, [x2]
+// CHECK-INST: ldapp   x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x78,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9417840 <unknown>
+
+ldapp x0, x1, [x2, #0]
+// CHECK-INST: ldapp   x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x78,0x41,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9417840 <unknown>
+
+stlp x0, x1, [x2, #0]
+// CHECK-INST: stlp    x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x01,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9015840 <unknown>
+
+stlp x0, x1, [x2]
+// CHECK-INST: stlp    x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x40,0x58,0x01,0xd9]
+// CHECK-ERROR: error: instruction requires: lscp
+// CHECK-UNKNOWN: d9015840 <unknown>
+
+mrs x3, VTLBID0_EL2
+// CHECK-INST: mrs	x3, VTLBID0_EL2
+// CHECK-ENCODING: encoding: [0x03,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2803
+mrs x3, VTLBID1_EL2
+// CHECK-INST: mrs	x3, VTLBID1_EL2
+// CHECK-ENCODING: encoding: [0x23,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2823
+mrs x3, VTLBID2_EL2
+// CHECK-INST: mrs	x3, VTLBID2_EL2
+// CHECK-ENCODING: encoding: [0x43,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2843
+mrs x3, VTLBID3_EL2
+// CHECK-INST: mrs	x3, VTLBID3_EL2
+// CHECK-ENCODING: encoding: [0x63,0x28,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2863
+mrs x3, VTLBIDOS0_EL2
+// CHECK-INST: mrs	x3, VTLBIDOS0_EL2
+// CHECK-ENCODING: encoding: [0x03,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2903
+mrs x3, VTLBIDOS1_EL2
+// CHECK-INST: mrs	x3, VTLBIDOS1_EL2
+// CHECK-ENCODING: encoding: [0x23,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2923
+mrs x3, VTLBIDOS2_EL2
+// CHECK-INST: mrs	x3, VTLBIDOS2_EL2
+// CHECK-ENCODING: encoding: [0x43,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2943
+mrs x3, VTLBIDOS3_EL2
+// CHECK-INST: mrs	x3, VTLBIDOS3_EL2
+// CHECK-ENCODING: encoding: [0x63,0x29,0x3c,0xd5]
+// CHECK-UNKNOWN: d53c2963
+mrs x3, TLBIDIDR_EL1
+// CHECK-INST: mrs	x3, TLBIDIDR_EL1
+// CHECK-ENCODING: encoding: [0xc3,0xa4,0x38,0xd5]
+// CHECK-UNKNOWN: d538a4c3
+
+msr VTLBID0_EL2, x3
+// CHECK-INST: msr	VTLBID0_EL2, x3
+// CHECK-ENCODING: encoding: [0x03,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2803
+msr VTLBID1_EL2, x3
+// CHECK-INST: msr	VTLBID1_EL2, x3
+// CHECK-ENCODING: encoding: [0x23,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2823
+msr VTLBID2_EL2, x3
+// CHECK-INST: msr	VTLBID2_EL2, x3
+// CHECK-ENCODING: encoding: [0x43,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2843
+msr VTLBID3_EL2, x3
+// CHECK-INST: msr	VTLBID3_EL2, x3
+// CHECK-ENCODING: encoding: [0x63,0x28,0x1c,0xd5]
+// CHECK-UNKNOWN: d51c2863
+msr VTLBIDOS0_EL2, x3
+// CHECK-INST: msr	VTLBIDOS0_EL2, x3
+// CHECK-ENCODING: encoding: [0x03,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c2903
+msr VTLBIDOS1_EL2, x3
+// CHECK-INST: msr	VTLBIDOS1_EL2, x3
+// CHECK-ENCODING: encoding: [0x23,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c2923
+msr VTLBIDOS2_EL2, x3
+// CHECK-INST: msr	VTLBIDOS2_EL2, x3
+// CHECK-ENCODING: encoding: [0x43,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c2943
+msr VTLBIDOS3_EL2, x3
+// CHECK-INST: msr	VTLBIDOS3_EL2, x3
+// CHECK-ENCODING: encoding: [0x63,0x29,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c2963
+
diff --git a/llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s
new file mode 100644
index 0000000..54fdc23
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mpamv2-diagnostics.s
@@ -0,0 +1,18 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+mpamv2 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_MPAMV2 Extensions
+//------------------------------------------------------------------------------
+
+mlbi alle1, x30
+// CHECK-ERROR: error: specified mlbi op does not use a register
+
+mlbi vmalle1, x30
+// CHECK-ERROR: error: specified mlbi op does not use a register
+
+mlbi vpide1
+// CHECK-ERROR: error: specified mlbi op requires a register
+
+mlbi vpmge1
+// CHECK-ERROR: error: specified mlbi op requires a register
diff --git a/llvm/test/MC/AArch64/armv9.7a-mpamv2.s b/llvm/test/MC/AArch64/armv9.7a-mpamv2.s
new file mode 100644
index 0000000..b8b21e96
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mpamv2.s
@@ -0,0 +1,126 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mpamv2 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mpamv2 < %s \
+// RUN:        | llvm-objdump -d --mattr=+mpamv2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mpamv2 < %s \
+// RUN:        | llvm-objdump -d --mattr=-mpamv2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mpamv2 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+mpamv2 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_MPAMV2 Extensions
+//------------------------------------------------------------------------------
+
+msr MPAMCTL_EL1, x0
+// CHECK-INST:    msr MPAMCTL_EL1, x0
+// CHECK-ENCODING: [0x40,0xa5,0x18,0xd5]
+// CHECK-UNKNOWN: d518a540
+
+msr MPAMCTL_EL12, x0
+// CHECK-INST:   msr MPAMCTL_EL12, x0
+// CHECK-ENCODING: [0x40,0xa5,0x1d,0xd5]
+// CHECK-UNKNOWN: d51da540
+
+msr MPAMCTL_EL2, x0
+// CHECK-INST:    msr     MPAMCTL_EL2, x0
+// CHECK-ENCODING: [0x40,0xa5,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ca540
+
+msr MPAMCTL_EL3, x0
+// CHECK-INST:    msr     MPAMCTL_EL3, x0
+// CHECK-ENCODING: [0x40,0xa5,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ea540
+
+msr MPAMVIDCR_EL2, x0
+// CHECK-INST:    msr     MPAMVIDCR_EL2, x0
+// CHECK-ENCODING: [0x00,0xa7,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ca700
+
+msr MPAMVIDSR_EL2, x0
+// CHECK-INST:    msr     MPAMVIDSR_EL2, x0
+// CHECK-ENCODING: [0x20,0xa7,0x1c,0xd5]
+// CHECK-UNKNOWN: d51ca720
+
+msr MPAMVIDSR_EL3, x0
+// CHECK-INST:    msr     MPAMVIDSR_EL3, x0
+// CHECK-ENCODING: [0x20,0xa7,0x1e,0xd5]
+// CHECK-UNKNOWN: d51ea720
+
+
+mrs x0, MPAMCTL_EL1
+// CHECK-INST:        mrs     x0, MPAMCTL_EL1
+// CHECK-ENCODING: [0x40,0xa5,0x38,0xd5]
+// CHECK-UNKNOWN: d538a540
+
+mrs x0, MPAMCTL_EL12
+// CHECK-INST:   mrs     x0, MPAMCTL_EL12
+// CHECK-ENCODING: [0x40,0xa5,0x3d,0xd5]
+// CHECK-UNKNOWN: d53da540
+
+mrs x0, MPAMCTL_EL2
+// CHECK-INST:   mrs     x0, MPAMCTL_EL2
+// CHECK-ENCODING: [0x40,0xa5,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ca540
+
+mrs x0, MPAMCTL_EL3
+// CHECK-INST:   mrs     x0, MPAMCTL_EL3
+// CHECK-ENCODING: [0x40,0xa5,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ea540
+
+mrs x0, MPAMVIDCR_EL2
+// CHECK-INST:   mrs     x0, MPAMVIDCR_EL2
+// CHECK-ENCODING: [0x00,0xa7,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ca700
+
+mrs x0, MPAMVIDSR_EL2
+// CHECK-INST:   mrs     x0, MPAMVIDSR_EL2
+// CHECK-ENCODING: [0x20,0xa7,0x3c,0xd5]
+// CHECK-UNKNOWN: d53ca720
+
+mrs x0, MPAMVIDSR_EL3
+// CHECK-INST:   mrs     x0, MPAMVIDSR_EL3
+// CHECK-ENCODING: [0x20,0xa7,0x3e,0xd5]
+// CHECK-UNKNOWN: d53ea720
+
+
+//------------------------------------------------------------------------------
+// Armv9.7-A FEAT_MPAMV2_VID Extensions
+//------------------------------------------------------------------------------
+
+mlbi vmalle1
+// CHECK-INST:    mlbi vmalle1
+// CHECK-ENCODING: [0xbf,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c70bf sys	#4, c7, c0, #5
+// CHECK-ERROR: error: MLBI VMALLE1 requires: mpamv2
+
+mlbi vpide1, x0
+// CHECK-INST:    mlbi vpide1, x0
+// CHECK-ENCODING: [0xc0,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c70c0 sys	#4, c7, c0, #6, x0
+// CHECK-ERROR: error: MLBI VPIDE1 requires: mpamv2
+
+mlbi vpmge1, x0
+// CHECK-INST:    mlbi vpmge1, x0
+// CHECK-ENCODING: [0xe0,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c70e0 sys	#4, c7, c0, #7, x0
+// CHECK-ERROR: error: MLBI VPMGE1 requires: mpamv2
+
+// Check that invalid encodings are rendered as SYS aliases
+// [0x9f,0x70,0x0c,0xd5] -> mlbi alle1
+// [0x9e,0x70,0x0c,0xd5] -> sys #4, c7, c0, #4, x30
+
+mlbi alle1
+// CHECK-INST:    mlbi alle1
+// CHECK-ENCODING: [0x9f,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c709f sys	#4, c7, c0, #4
+// CHECK-ERROR: error: MLBI ALLE1 requires: mpamv2
+
+sys #4, c7, c0, #4, x30
+// CHECK-INST: sys #4, c7, c0, #4, x30
+// CHECK-ENCODING: [0x9e,0x70,0x0c,0xd5]
+// CHECK-UNKNOWN: d50c709e sys #4, c7, c0, #4, x30
diff --git a/llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s
new file mode 100644
index 0000000..dc2a290
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mtetc-diagnostics.s
@@ -0,0 +1,16 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+mtetc -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-REQUIRES-MTETC
+
+//------------------------------------------------------------------------------
+// FEAT_MTETC Extension instructions
+//------------------------------------------------------------------------------
+
+dc zgbva
+// CHECK-ERROR: error: specified dc op requires a register
+// CHECK-REQUIRES-MTETC: DC ZGBVA requires: mtetc
+
+dc gbva
+// CHECK-ERROR: error: specified dc op requires a register
+// CHECK-REQUIRES-MTETC: DC GBVA requires: mtetc
diff --git a/llvm/test/MC/AArch64/armv9.7a-mtetc.s b/llvm/test/MC/AArch64/armv9.7a-mtetc.s
new file mode 100644
index 0000000..087b23b
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-mtetc.s
@@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mtetc < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mtetc < %s \
+// RUN:        | llvm-objdump -d --mattr=+mtetc --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mtetc < %s \
+// RUN:        | llvm-objdump -d --mattr=-mtetc --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mtetc < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+mtetc -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+//------------------------------------------------------------------------------
+// FEAT_MTETC Extension instructions
+//------------------------------------------------------------------------------
+
+dc zgbva, x0
+// CHECK-INST:    dc zgbva, x0
+// CHECK-ENCODING: [0xa0,0x74,0x0b,0xd5]
+// CHECK-UNKNOWN: d50b74a0 sys #3, c7, c4, #5, x0
+// CHECK-ERROR: DC ZGBVA requires: mtetc
+
+dc gbva, x0
+// CHECK-INST:    dc gbva, x0
+// CHECK-ENCODING: [0xe0,0x74,0x0b,0xd5]
+// CHECK-UNKNOWN: d50b74e0 sys #3, c7, c4, #7, x0
+// CHECK-ERROR: DC GBVA requires: mtetc
diff --git a/llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s
new file mode 100644
index 0000000..2440fd3
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-tlbid-diagnostics.s
@@ -0,0 +1,64 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+tlbiw,+rme < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+tlbiw,+tlbid,+rme < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-NO-REGISTER
+
+// Test without using +tlbid - no optional register operand allowed
+
+tlbi vmalle1is, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalle1is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalle1os, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalls12e1os, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi alle1is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi alle2is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi alle3is, x5
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmallws2e1os, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalls12e1is, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmallws2e1is, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+
+tlbi vmalle1, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi alle1, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi alle2, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi alle3, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi vmalls12e1, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi paallos, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
+
+tlbi paall, x1
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-NO-REGISTER: error: specified tlbi op does not use a register
diff --git a/llvm/test/MC/AArch64/armv9.7a-tlbid.s b/llvm/test/MC/AArch64/armv9.7a-tlbid.s
new file mode 100644
index 0000000..1362bd3
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.7a-tlbid.s
@@ -0,0 +1,84 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+tlbiw < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN:        | llvm-objdump -d --mattr=+tlbid,+tlb-rmi,+tlbiw --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN:        | llvm-objdump -d --mattr=-tlbid --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+tlbid,+tlb-rmi,+tlbiw < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+tlbid,+tlb-rmi,+tlbiw -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// Armv9.7-A TLBI Domains (FEAT_TLBID)
+
+tlbi vmalle1is
+// CHECK-INST: tlbi vmalle1is
+// CHECK-ENCODING: encoding: [0x1f,0x83,0x08,0xd5]
+// CHECK-UNKNOWN: d508831f tlbi vmalle1is
+
+tlbi vmalle1is, xzr
+// CHECK-INST: tlbi vmalle1is
+// CHECK-ENCODING: encoding: [0x1f,0x83,0x08,0xd5]
+// CHECK-UNKNOWN: d508831f tlbi vmalle1is
+
+tlbi vmalle1is, x31
+// CHECK-INST: tlbi vmalle1is
+// CHECK-ENCODING: encoding: [0x1f,0x83,0x08,0xd5]
+// CHECK-UNKNOWN: d508831f tlbi vmalle1is
+
+tlbi vmalle1is, x5
+// CHECK-INST: tlbi vmalle1is, x5
+// CHECK-ENCODING: encoding: [0x05,0x83,0x08,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d5088305 sys	#0, c8, c3, #0, x5
+
+tlbi vmalle1os, x5
+// CHECK-INST: tlbi vmalle1os, x5
+// CHECK-ENCODING: encoding: [0x05,0x81,0x08,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d5088105 sys	#0, c8, c1, #0, x5
+
+tlbi alle1is, x5
+// CHECK-INST: tlbi alle1is, x5
+// CHECK-ENCODING: encoding: [0x85,0x83,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8385 sys	#4, c8, c3, #4, x5
+
+tlbi alle2is, x5
+// CHECK-INST: tlbi alle2is, x5
+// CHECK-ENCODING: encoding: [0x05,0x83,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8305 sys	#4, c8, c3, #0, x5
+
+tlbi alle3is, x5
+// CHECK-INST: tlbi alle3is, x5
+// CHECK-ENCODING: encoding: [0x05,0x83,0x0e,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50e8305 sys	#6, c8, c3, #0, x5
+
+tlbi vmalls12e1is, x1
+// CHECK-INST: tlbi vmalls12e1is, x1
+// CHECK-ENCODING: encoding: [0xc1,0x83,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c83c1 sys	#4, c8, c3, #6, x1
+
+tlbi vmalls12e1os, x5
+// CHECK-INST: tlbi vmalls12e1os, x5
+// CHECK-ENCODING: encoding: [0xc5,0x81,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c81c5 sys	#4, c8, c1, #6, x5
+
+tlbi vmallws2e1is, x1
+// CHECK-INST: tlbi vmallws2e1is, x1
+// CHECK-ENCODING: encoding: [0x41,0x82,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8241 sys	#4, c8, c2, #2, x1
+
+tlbi vmallws2e1os, x1
+// CHECK-INST: tlbi vmallws2e1os, x1
+// CHECK-ENCODING: encoding: [0x41,0x85,0x0c,0xd5]
+// CHECK-ERROR: error: specified tlbi op does not use a register
+// CHECK-UNKNOWN: d50c8541 sys	#4, c8, c5, #2, x1
diff --git a/llvm/test/MC/AArch64/neon-fdot-diagnostics.s b/llvm/test/MC/AArch64/neon-fdot-diagnostics.s
new file mode 100644
index 0000000..4f5f557
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fdot-diagnostics.s
@@ -0,0 +1,59 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=f16f32dot 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand
+
+fdot v0.2s, v0.4b, v0.4b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4b, v0.4b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2b, v0.4b, v0.4b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2b, v0.4b, v0.4b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2s, v0.4s, v0.4s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4s, v0.4s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2h, v0.4h, v0.4h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2h, v0.4h, v0.4h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// fdot indexed
+
+fdot v0.2s, v0.4b, v0.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4b, v0.4b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2b, v0.4b, v0.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2b, v0.4b, v0.4b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2s, v0.4s, v0.4s[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2s, v0.4s, v0.4s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2h, v0.4h, v0.4h[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot v0.2h, v0.4h, v0.4h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid immediate range
+
+fdot v0.2s, v0.4h, v0.2h[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fdot v0.2s, v0.4h, v0.2h[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot v0.2s, v0.4h, v0.2h[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fdot v0.2s, v0.4h, v0.2h[4]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/neon-fdot.s b/llvm/test/MC/AArch64/neon-fdot.s
new file mode 100644
index 0000000..c8a8e2f
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fdot.s
@@ -0,0 +1,147 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32dot < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32dot < %s \
+// RUN:        | llvm-objdump -d --mattr=+f16f32dot --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32dot < %s \
+// RUN:        | llvm-objdump -d --mattr=-f16f32dot --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32dot < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+f16f32dot -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fdot v0.2s, v0.4h, v0.4h
+// CHECK-INST: fdot v0.2s, v0.4h, v0.4h
+// CHECK-ENCODING: encoding: [0x00,0xfc,0x80,0x0e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0e80fc00 <unknown>
+
+fdot v10.2s, v10.4h, v10.4h
+// CHECK-INST: fdot v10.2s, v10.4h, v10.4h
+// CHECK-ENCODING: encoding: [0x4a,0xfd,0x8a,0x0e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0e8afd4a <unknown>
+
+fdot v31.2s, v31.4h, v31.4h
+// CHECK-INST: fdot v31.2s, v31.4h, v31.4h
+// CHECK-ENCODING: encoding: [0xff,0xff,0x9f,0x0e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0e9fffff <unknown>
+
+fdot v0.4s, v0.8h, v0.8h
+// CHECK-INST: fdot v0.4s, v0.8h, v0.8h
+// CHECK-ENCODING: encoding: [0x00,0xfc,0x80,0x4e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4e80fc00 <unknown>
+
+fdot v10.4s, v10.8h, v10.8h
+// CHECK-INST: fdot v10.4s, v10.8h, v10.8h
+// CHECK-ENCODING: encoding: [0x4a,0xfd,0x8a,0x4e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4e8afd4a <unknown>
+
+fdot v31.4s, v31.8h, v31.8h
+// CHECK-INST: fdot v31.4s, v31.8h, v31.8h
+// CHECK-ENCODING: encoding: [0xff,0xff,0x9f,0x4e]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4e9fffff <unknown>
+
+// fdot indexed
+
+fdot v0.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v0.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x00,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409000 <unknown>
+
+fdot v10.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v10.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x0a,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40900a <unknown>
+
+fdot v21.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v21.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x15,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409015 <unknown>
+
+fdot v31.2s, v0.4h, v0.2h[0]
+// CHECK-INST: fdot v31.2s, v0.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x1f,0x90,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40901f <unknown>
+
+fdot v0.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v0.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x40,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409140 <unknown>
+
+fdot v10.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v10.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x4a,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40914a <unknown>
+
+fdot v21.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v21.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x55,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f409155 <unknown>
+
+fdot v31.2s, v10.4h, v0.2h[0]
+// CHECK-INST: fdot v31.2s, v10.4h, v0.2h[0]
+// CHECK-ENCODING: encoding: [0x5f,0x91,0x40,0x0f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 0f40915f <unknown>
+
+fdot v0.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v0.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xa0,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9aa0 <unknown>
+
+fdot v10.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v10.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xaa,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9aaa <unknown>
+
+fdot v21.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v21.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xb5,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9ab5 <unknown>
+
+fdot v31.4s, v21.8h, v31.2h[3]
+// CHECK-INST: fdot v31.4s, v21.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xbf,0x9a,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9abf <unknown>
+
+fdot v0.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v0.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xe0,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9be0 <unknown>
+
+fdot v10.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v10.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xea,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9bea <unknown>
+
+fdot v21.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v21.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xf5,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9bf5 <unknown>
+
+fdot v31.4s, v31.8h, v31.2h[3]
+// CHECK-INST: fdot v31.4s, v31.8h, v31.2h[3]
+// CHECK-ENCODING: encoding: [0xff,0x9b,0x7f,0x4f]
+// CHECK-ERROR: instruction requires: f16f32dot
+// CHECK-UNKNOWN: 4f7f9bff <unknown>
diff --git a/llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s b/llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s
new file mode 100644
index 0000000..ccc0742
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla-HtoS-diagnostics.s
@@ -0,0 +1,24 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+f16f32mm 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand/vector
+
+fmmla v0.4b, v0.8b, v0.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.4b, v0.8b, v0.8b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.4h, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.4h, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.4s, v0.8s, v0.8s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.4s, v0.8s, v0.8s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.4d, v0.8d, v0.8d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.4d, v0.8d, v0.8d
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/neon-fmmla-HtoS.s b/llvm/test/MC/AArch64/neon-fmmla-HtoS.s
new file mode 100644
index 0000000..6b3d352
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla-HtoS.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32mm< %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32mm < %s \
+// RUN:        | llvm-objdump -d --mattr=+f16f32mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16f32mm < %s \
+// RUN:        | llvm-objdump -d --mattr=-f16f32mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16f32mm < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+f16f32mm -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmmla v0.4s, v0.8h, v0.8h
+// CHECK-INST: fmmla v0.4s, v0.8h, v0.8h
+// CHECK-ENCODING: encoding: [0x00,0xec,0x40,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e40ec00 <unknown>
+
+fmmla v10.4s, v10.8h, v10.8h
+// CHECK-INST: fmmla v10.4s, v10.8h, v10.8h
+// CHECK-ENCODING: encoding: [0x4a,0xed,0x4a,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e4aed4a <unknown>
+
+fmmla v21.4s, v21.8h, v21.8h
+// CHECK-INST: fmmla v21.4s, v21.8h, v21.8h
+// CHECK-ENCODING: encoding: [0xb5,0xee,0x55,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e55eeb5 <unknown>
+
+fmmla v31.4s, v31.8h, v31.8h
+// CHECK-INST: fmmla v31.4s, v31.8h, v31.8h
+// CHECK-ENCODING: encoding: [0xff,0xef,0x5f,0x4e]
+// CHECK-ERROR: instruction requires: f16f32mm
+// CHECK-UNKNOWN: 4e5fefff <unknown>
diff --git a/llvm/test/MC/AArch64/neon-fmmla-diagnostics.s b/llvm/test/MC/AArch64/neon-fmmla-diagnostics.s
new file mode 100644
index 0000000..7fc5373
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla-diagnostics.s
@@ -0,0 +1,24 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+f16mm 2>&1 < %s| FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid operand/vector
+
+fmmla v0.8b, v0.8b, v0.8b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.8b, v0.8b, v0.8b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.8b, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmmla v0.8b, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.8s, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.8s, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmmla v0.8d, v0.8h, v0.8h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier
+// CHECK-NEXT: fmmla v0.8d, v0.8h, v0.8h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/neon-fmmla.s b/llvm/test/MC/AArch64/neon-fmmla.s
new file mode 100644
index 0000000..f35c2fb
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-fmmla.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16mm< %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16mm < %s \
+// RUN:        | llvm-objdump -d --mattr=+f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+f16mm < %s \
+// RUN:        | llvm-objdump -d --mattr=-f16mm --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+f16mm < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+f16mm -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmmla v0.8h, v0.8h, v0.8h
+// CHECK-INST: fmmla v0.8h, v0.8h, v0.8h
+// CHECK-ENCODING: encoding: [0x00,0xec,0xc0,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4ec0ec00 <unknown>
+
+fmmla v10.8h, v10.8h, v10.8h
+// CHECK-INST: fmmla v10.8h, v10.8h, v10.8h
+// CHECK-ENCODING: encoding: [0x4a,0xed,0xca,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4ecaed4a <unknown>
+
+fmmla v21.8h, v21.8h, v21.8h
+// CHECK-INST: fmmla v21.8h, v21.8h, v21.8h
+// CHECK-ENCODING: encoding: [0xb5,0xee,0xd5,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4ed5eeb5 <unknown>
+
+fmmla v31.8h, v31.8h, v31.8h
+// CHECK-INST: fmmla v31.8h, v31.8h, v31.8h
+// CHECK-ENCODING: encoding: [0xff,0xef,0xdf,0x4e]
+// CHECK-ERROR: instruction requires: f16mm
+// CHECK-UNKNOWN: 4edfefff <unknown>
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
index 4f7ca47..358fe0b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
@@ -45,6 +45,10 @@ s_set_vgpr_msb 255
 // GFX1250: [0xff,0x00,0x86,0xbf]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
+s_set_vgpr_msb 0xffff
+// GFX1250: [0xff,0xff,0x86,0xbf]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
 s_monitor_sleep 1
 // GFX1250: s_monitor_sleep 1                       ; encoding: [0x01,0x00,0x84,0xbf]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_err.s b/llvm/test/MC/AMDGPU/gfx1250_err.s
index 9d1131e..676eb48 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_err.s
@@ -1,15 +1,5 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX1250-ERR --implicit-check-not=error: -strict-whitespace %s
 
-s_set_vgpr_msb -1
-// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: s_set_vgpr_msb accepts values in range [0..255]
-// GFX1250-ERR: s_set_vgpr_msb -1
-// GFX1250-ERR:                ^
-
-s_set_vgpr_msb 256
-// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: s_set_vgpr_msb accepts values in range [0..255]
-// GFX1250-ERR: s_set_vgpr_msb 256
-// GFX1250-ERR:                ^
-
 s_load_b32 s4, s[2:3], 10 th:TH_LOAD_NT th:TH_LOAD_NT
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 // GFX1250-ERR: s_load_b32 s4, s[2:3], 10 th:TH_LOAD_NT th:TH_LOAD_NT
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt b/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt
index ad22000..16eba25 100644
--- a/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt
+++ b/llvm/test/MC/Disassembler/AArch64/armv8.4a-mpam.txt
@@ -65,35 +65,39 @@
 #CHECK:  mrs x0, MPAMVPM7_EL2
 #CHECK:  mrs x0, MPAMIDR_EL1
 
-#CHECK-NOV84:  msr S3_0_C10_C5_1, x0
-#CHECK-NOV84:  msr S3_0_C10_C5_0, x0
-#CHECK-NOV84:  msr S3_4_C10_C5_0, x0
-#CHECK-NOV84:  msr S3_6_C10_C5_0, x0
-#CHECK-NOV84:  msr S3_5_C10_C5_0, x0
-#CHECK-NOV84:  msr S3_4_C10_C4_0, x0
-#CHECK-NOV84:  msr S3_4_C10_C4_1, x0
-#CHECK-NOV84:  msr S3_4_C10_C6_0, x0
-#CHECK-NOV84:  msr S3_4_C10_C6_1, x0
-#CHECK-NOV84:  msr S3_4_C10_C6_2, x0
-#CHECK-NOV84:  msr S3_4_C10_C6_3, x0
-#CHECK-NOV84:  msr S3_4_C10_C6_4, x0
-#CHECK-NOV84:  msr S3_4_C10_C6_5, x0
-#CHECK-NOV84:  msr S3_4_C10_C6_6, x0
-#CHECK-NOV84:  msr S3_4_C10_C6_7, x0
-#CHECK-NOV84:  mrs x0, S3_0_C10_C5_1
-#CHECK-NOV84:  mrs x0, S3_0_C10_C5_0
-#CHECK-NOV84:  mrs x0, S3_4_C10_C5_0
-#CHECK-NOV84:  mrs x0, S3_6_C10_C5_0
-#CHECK-NOV84:  mrs x0, S3_5_C10_C5_0
-#CHECK-NOV84:  mrs x0, S3_4_C10_C4_0
-#CHECK-NOV84:  mrs x0, S3_4_C10_C4_1
-#CHECK-NOV84:  mrs x0, S3_4_C10_C6_0
-#CHECK-NOV84:  mrs x0, S3_4_C10_C6_1
-#CHECK-NOV84:  mrs x0, S3_4_C10_C6_2
-#CHECK-NOV84:  mrs x0, S3_4_C10_C6_3
-#CHECK-NOV84:  mrs x0, S3_4_C10_C6_4
-#CHECK-NOV84:  mrs x0, S3_4_C10_C6_5
-#CHECK-NOV84:  mrs x0, S3_4_C10_C6_6
-#CHECK-NOV84:  mrs x0, S3_4_C10_C6_7
-#CHECK-NOV84:  mrs x0, S3_0_C10_C4_4
+// Available outside MPAM from Armv9.7
+#CHECK-NOV84: msr MPAM0_EL1, x0
+#CHECK-NOV84: msr MPAM1_EL1, x0
+#CHECK-NOV84: msr MPAM2_EL2, x0
+#CHECK-NOV84: msr MPAM3_EL3, x0
+#CHECK-NOV84: msr MPAM1_EL12, x0
+#CHECK-NOV84: msr MPAMHCR_EL2, x0
 
+#CHECK-NOV84:  msr MPAMVPMV_EL2, x0
+#CHECK-NOV84:  msr MPAMVPM0_EL2, x0
+#CHECK-NOV84:  msr MPAMVPM1_EL2, x0
+#CHECK-NOV84:  msr MPAMVPM2_EL2, x0
+#CHECK-NOV84:  msr MPAMVPM3_EL2, x0
+#CHECK-NOV84:  msr MPAMVPM4_EL2, x0
+#CHECK-NOV84:  msr MPAMVPM5_EL2, x0
+#CHECK-NOV84:  msr MPAMVPM6_EL2, x0
+#CHECK-NOV84:  msr MPAMVPM7_EL2, x0
+
+// Available outside MPAM from Armv9.7
+#CHECK-NOV84: mrs x0, MPAM0_EL1
+#CHECK-NOV84: mrs x0, MPAM1_EL1
+#CHECK-NOV84: mrs x0, MPAM2_EL2
+#CHECK-NOV84: mrs x0, MPAM3_EL3
+#CHECK-NOV84: mrs x0, MPAM1_EL12
+#CHECK-NOV84: mrs x0, MPAMHCR_EL2
+
+#CHECK-NOV84:  mrs x0, MPAMVPMV_EL2
+#CHECK-NOV84:  mrs x0, MPAMVPM0_EL2
+#CHECK-NOV84:  mrs x0, MPAMVPM1_EL2
+#CHECK-NOV84:  mrs x0, MPAMVPM2_EL2
+#CHECK-NOV84:  mrs x0, MPAMVPM3_EL2
+#CHECK-NOV84:  mrs x0, MPAMVPM4_EL2
+#CHECK-NOV84:  mrs x0, MPAMVPM5_EL2
+#CHECK-NOV84:  mrs x0, MPAMVPM6_EL2
+#CHECK-NOV84:  mrs x0, MPAMVPM7_EL2
+#CHECK-NOV84:  mrs x0, MPAMIDR_EL1
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
index a8627d6..b84324b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
@@ -33,6 +33,9 @@
 # GFX1250: s_set_vgpr_msb 0xff ; encoding: [0xff,0x00,0x86,0xbf]
 0xff,0x00,0x86,0xbf
 
+# GFX1250: s_set_vgpr_msb 0xffff ; encoding: [0xff,0xff,0x86,0xbf]
+0xff,0xff,0x86,0xbf
+
 # GFX1250: s_monitor_sleep 0                       ; encoding: [0x00,0x00,0x84,0xbf]
 0x00,0x00,0x84,0xbf
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll
new file mode 100644
index 0000000..46b0ebd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-multi-block.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2IC1 %s
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF2IC2 %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx"
+
+define void @load_store_interleave_group_block_invar_cond(ptr noalias %data, ptr noalias %dst.0, ptr noalias %dst.1, i1 %c) {
+; VF2IC1-LABEL: define void @load_store_interleave_group_block_invar_cond(
+; VF2IC1-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], i1 [[C:%.*]]) {
+; VF2IC1-NEXT:  [[ENTRY:.*:]]
+; VF2IC1-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2IC1:       [[VECTOR_PH]]:
+; VF2IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC1:       [[VECTOR_BODY]]:
+; VF2IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
+; VF2IC1-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; VF2IC1-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; VF2IC1-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC1-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC1-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC1-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC1-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF2IC1-NEXT:    br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC1:       [[PRED_STORE_IF]]:
+; VF2IC1-NEXT:    store i8 1, ptr [[DST_0]], align 1
+; VF2IC1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF2IC1:       [[PRED_STORE_CONTINUE]]:
+; VF2IC1-NEXT:    br i1 [[C]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1:       [[PRED_STORE_IF2]]:
+; VF2IC1-NEXT:    store i8 1, ptr [[DST_0]], align 1
+; VF2IC1-NEXT:    br label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1:       [[PRED_STORE_CONTINUE3]]:
+; VF2IC1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[DST_1]], i64 [[INDEX]]
+; VF2IC1-NEXT:    store <2 x i8> zeroinitializer, ptr [[TMP2]], align 1
+; VF2IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2IC1-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC1-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2IC1:       [[MIDDLE_BLOCK]]:
+; VF2IC1-NEXT:    br label %[[EXIT:.*]]
+; VF2IC1:       [[EXIT]]:
+; VF2IC1-NEXT:    ret void
+;
+; VF2IC2-LABEL: define void @load_store_interleave_group_block_invar_cond(
+; VF2IC2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], i1 [[C:%.*]]) {
+; VF2IC2-NEXT:  [[ENTRY:.*:]]
+; VF2IC2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2IC2:       [[VECTOR_PH]]:
+; VF2IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC2:       [[VECTOR_BODY]]:
+; VF2IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE11:.*]] ]
+; VF2IC2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC2-NEXT:    [[TMP2:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2IC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; VF2IC2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]]
+; VF2IC2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; VF2IC2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; VF2IC2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2IC2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP4]], align 8
+; VF2IC2-NEXT:    br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC2:       [[PRED_STORE_IF]]:
+; VF2IC2-NEXT:    store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF2IC2:       [[PRED_STORE_CONTINUE]]:
+; VF2IC2-NEXT:    br i1 [[C]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; VF2IC2:       [[PRED_STORE_IF6]]:
+; VF2IC2-NEXT:    store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
+; VF2IC2:       [[PRED_STORE_CONTINUE7]]:
+; VF2IC2-NEXT:    br i1 [[C]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; VF2IC2:       [[PRED_STORE_IF8]]:
+; VF2IC2-NEXT:    store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE9]]
+; VF2IC2:       [[PRED_STORE_CONTINUE9]]:
+; VF2IC2-NEXT:    br i1 [[C]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]]
+; VF2IC2:       [[PRED_STORE_IF10]]:
+; VF2IC2-NEXT:    store i8 1, ptr [[DST_0]], align 1
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE11]]
+; VF2IC2:       [[PRED_STORE_CONTINUE11]]:
+; VF2IC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST_1]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 2
+; VF2IC2-NEXT:    store <2 x i8> zeroinitializer, ptr [[TMP5]], align 1
+; VF2IC2-NEXT:    store <2 x i8> zeroinitializer, ptr [[TMP6]], align 1
+; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC2-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2IC2:       [[MIDDLE_BLOCK]]:
+; VF2IC2-NEXT:    br label %[[EXIT:.*]]
+; VF2IC2:       [[EXIT]]:
+; VF2IC2-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %mul.2 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2
+  %l.0 = load i64, ptr %data.0, align 8
+  store i64 %l.0, ptr %data.0, align 8
+  %add.1 = or disjoint i64 %mul.2, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1
+  %l.1 = load i64, ptr %data.1, align 8
+  store i64 %l.1, ptr %data.1, align 8
+  br i1 %c, label %then, label %loop.latch
+
+then:
+  store i8 1, ptr %dst.0
+  br label %loop.latch
+
+loop.latch:
+  %gep.dst.1 = getelementptr inbounds i8, ptr %dst.1, i64 %iv
+  store i8 0, ptr %gep.dst.1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @load_store_interleave_group_block_var_cond(ptr noalias %data, ptr %masks, ptr noalias %dst) {
+; VF2IC1-LABEL: define void @load_store_interleave_group_block_var_cond(
+; VF2IC1-SAME: ptr noalias [[DATA:%.*]], ptr [[MASKS:%.*]], ptr noalias [[DST:%.*]]) {
+; VF2IC1-NEXT:  [[ENTRY:.*:]]
+; VF2IC1-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2IC1:       [[VECTOR_PH]]:
+; VF2IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC1:       [[VECTOR_BODY]]:
+; VF2IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
+; VF2IC1-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]]
+; VF2IC1-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; VF2IC1-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC1-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC1-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC1-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC1-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; VF2IC1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[INDEX]]
+; VF2IC1-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; VF2IC1-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD1]], zeroinitializer
+; VF2IC1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; VF2IC1-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC1:       [[PRED_STORE_IF]]:
+; VF2IC1-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; VF2IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP5]]
+; VF2IC1-NEXT:    store i8 1, ptr [[TMP6]], align 1
+; VF2IC1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF2IC1:       [[PRED_STORE_CONTINUE]]:
+; VF2IC1-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; VF2IC1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1:       [[PRED_STORE_IF2]]:
+; VF2IC1-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 1
+; VF2IC1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP8]]
+; VF2IC1-NEXT:    store i8 1, ptr [[TMP9]], align 1
+; VF2IC1-NEXT:    br label %[[PRED_STORE_CONTINUE3]]
+; VF2IC1:       [[PRED_STORE_CONTINUE3]]:
+; VF2IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2IC1-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC1-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2IC1:       [[MIDDLE_BLOCK]]:
+; VF2IC1-NEXT:    br label %[[EXIT:.*]]
+; VF2IC1:       [[EXIT]]:
+; VF2IC1-NEXT:    ret void
+;
+; VF2IC2-LABEL: define void @load_store_interleave_group_block_var_cond(
+; VF2IC2-SAME: ptr noalias [[DATA:%.*]], ptr [[MASKS:%.*]], ptr noalias [[DST:%.*]]) {
+; VF2IC2-NEXT:  [[ENTRY:.*:]]
+; VF2IC2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2IC2:       [[VECTOR_PH]]:
+; VF2IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC2:       [[VECTOR_BODY]]:
+; VF2IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
+; VF2IC2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2IC2-NEXT:    [[TMP2:%.*]] = shl nsw i64 [[TMP0]], 1
+; VF2IC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
+; VF2IC2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]]
+; VF2IC2-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; VF2IC2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; VF2IC2-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; VF2IC2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2IC2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2IC2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2IC2-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2IC2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP4]], align 8
+; VF2IC2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 2
+; VF2IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
+; VF2IC2-NEXT:    [[WIDE_LOAD6:%.*]] = load <2 x i8>, ptr [[TMP8]], align 1
+; VF2IC2-NEXT:    [[TMP9:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], zeroinitializer
+; VF2IC2-NEXT:    [[TMP10:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD6]], zeroinitializer
+; VF2IC2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
+; VF2IC2-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF2IC2:       [[PRED_STORE_IF]]:
+; VF2IC2-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; VF2IC2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP12]]
+; VF2IC2-NEXT:    store i8 1, ptr [[TMP13]], align 1
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF2IC2:       [[PRED_STORE_CONTINUE]]:
+; VF2IC2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
+; VF2IC2-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF2IC2:       [[PRED_STORE_IF7]]:
+; VF2IC2-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 1
+; VF2IC2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP15]]
+; VF2IC2-NEXT:    store i8 1, ptr [[TMP16]], align 1
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF2IC2:       [[PRED_STORE_CONTINUE8]]:
+; VF2IC2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; VF2IC2-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF2IC2:       [[PRED_STORE_IF9]]:
+; VF2IC2-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 2
+; VF2IC2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP18]]
+; VF2IC2-NEXT:    store i8 1, ptr [[TMP19]], align 1
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF2IC2:       [[PRED_STORE_CONTINUE10]]:
+; VF2IC2-NEXT:    [[TMP20:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; VF2IC2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]]
+; VF2IC2:       [[PRED_STORE_IF11]]:
+; VF2IC2-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 3
+; VF2IC2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[MASKS]], i64 [[TMP21]]
+; VF2IC2-NEXT:    store i8 1, ptr [[TMP22]], align 1
+; VF2IC2-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF2IC2:       [[PRED_STORE_CONTINUE12]]:
+; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2IC2-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2IC2:       [[MIDDLE_BLOCK]]:
+; VF2IC2-NEXT:    br label %[[EXIT:.*]]
+; VF2IC2:       [[EXIT]]:
+; VF2IC2-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %mul.2 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2
+  %l.0 = load i64, ptr %data.0, align 8
+  store i64 %l.0, ptr %data.0, align 8
+  %add.1 = or disjoint i64 %mul.2, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1
+  %l.1 = load i64, ptr %data.1, align 8
+  store i64 %l.1, ptr %data.1, align 8
+  %gep.mask = getelementptr inbounds i8, ptr %masks, i64 %iv
+  %l.mask = load i8, ptr %gep.mask
+  %c = icmp eq i8 %l.mask, 0
+  br i1 %c, label %then, label %loop.latch
+
+then:
+  store i8 1, ptr %gep.mask
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
index d1c0f64..113b052 100644
--- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
+++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
@@ -638,7 +638,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
 !0 = !{!1, !3, !5, !7, !9, !11}
 !1 = !{!2, !"cold"}
 !2 = !{i64 1, i64 2, i64 3}
-!3 = !{!4, !"cold"}
+!3 = !{!4, !"cold", !13}
 !4 = !{i64 1, i64 2, i64 4}
 !5 = !{!6, !"notcold"}
 !6 = !{i64 1, i64 5, i64 6}
@@ -648,6 +648,7 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
 !10 = !{i64 1, i64 8, i64 9}
 !11 = !{!12, !"hot"}
 !12 = !{i64 1, i64 8, i64 10}  
+!13 = !{i64 123, i64 456}
 )IR");
 
   Function *Func = M->getFunction("test");
@@ -683,10 +684,25 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
     auto *StackId = mdconst::dyn_extract<ConstantInt>(StackMD->getOperand(0));
     EXPECT_EQ(StackId->getZExtValue(), 1u);
     StackId = mdconst::dyn_extract<ConstantInt>(StackMD->getOperand(1));
-    if (StackId->getZExtValue() == 2u)
+    if (StackId->getZExtValue() == 2u) {
       EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Cold);
-    else if (StackId->getZExtValue() == 5u)
+      // We should propagate the single context size info from the second cold
+      // context above onto the new merged/trimmed context.
+      ASSERT_EQ(MIB->getNumOperands(), 3u);
+      MDNode *ContextSizePair = dyn_cast<MDNode>(MIB->getOperand(2));
+      assert(ContextSizePair->getNumOperands() == 2);
+      EXPECT_EQ(
+          mdconst::dyn_extract<ConstantInt>(ContextSizePair->getOperand(0))
+              ->getZExtValue(),
+          123u);
+      EXPECT_EQ(
+          mdconst::dyn_extract<ConstantInt>(ContextSizePair->getOperand(1))
+              ->getZExtValue(),
+          456u);
+    } else if (StackId->getZExtValue() == 5u) {
       EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold);
+      ASSERT_EQ(MIB->getNumOperands(), 2u);
+    }
   }
 }
 
diff --git a/llvm/unittests/IR/ModuleTest.cpp b/llvm/unittests/IR/ModuleTest.cpp
index 36c35673..30eda73 100644
--- a/llvm/unittests/IR/ModuleTest.cpp
+++ b/llvm/unittests/IR/ModuleTest.cpp
@@ -103,6 +103,36 @@ TEST(ModuleTest, setModuleFlagInt) {
   EXPECT_EQ(Val2, A2->getZExtValue());
 }
 
+TEST(ModuleTest, setModuleFlagTwoMod) {
+  LLVMContext Context;
+  Module MA("MA", Context);
+  Module MB("MB", Context);
+  StringRef Key = "Key";
+  uint32_t Val1 = 1;
+  uint32_t Val2 = 2;
+
+  // Set a flag to MA
+  EXPECT_EQ(nullptr, MA.getModuleFlag(Key));
+  MA.setModuleFlag(Module::ModFlagBehavior::Error, Key, Val1);
+  auto A1 = mdconst::extract_or_null<ConstantInt>(MA.getModuleFlag(Key));
+  EXPECT_EQ(Val1, A1->getZExtValue());
+
+  // Set a flag to MB
+  EXPECT_EQ(nullptr, MB.getModuleFlag(Key));
+  MB.setModuleFlag(Module::ModFlagBehavior::Error, Key, Val1);
+  auto B1 = mdconst::extract_or_null<ConstantInt>(MB.getModuleFlag(Key));
+  EXPECT_EQ(Val1, B1->getZExtValue());
+
+  // Change the flag of MA
+  MA.setModuleFlag(Module::ModFlagBehavior::Error, Key, Val2);
+  auto A2 = mdconst::extract_or_null<ConstantInt>(MA.getModuleFlag(Key));
+  EXPECT_EQ(Val2, A2->getZExtValue());
+
+  // MB should keep the original flag value
+  auto B2 = mdconst::extract_or_null<ConstantInt>(MB.getModuleFlag(Key));
+  EXPECT_EQ(Val1, B2->getZExtValue());
+}
+
 const char *IRString = R"IR(
   !llvm.module.flags = !{!0}
 
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 53a64b6..ef6aeae 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -47,6 +47,7 @@ const char *ARMArch[] = {
     "armv9-a",     "armv9",     "armv9a",       "armv9.1-a",   "armv9.1a",
     "armv9.2-a",   "armv9.2a",  "armv9.3-a",    "armv9.3a",    "armv9.4-a",
     "armv9.4a",    "armv9.5-a", "armv9.5a",     "armv9.6a",    "armv9.6-a",
+    "armv9.7a",    "armv9.7-a",
 };
 
 std::string FormatExtensionFlags(int64_t Flags) {
@@ -622,6 +623,8 @@ TEST(TargetParserTest, testARMArch) {
                           ARMBuildAttrs::CPUArch::v9_A));
   EXPECT_TRUE(testARMArch("armv9.6-a", "generic", "v9.6a",
                           ARMBuildAttrs::CPUArch::v9_A));
+  EXPECT_TRUE(testARMArch("armv9.7-a", "generic", "v9.7a",
+                          ARMBuildAttrs::CPUArch::v9_A));
   EXPECT_TRUE(
       testARMArch("armv8-r", "generic", "v8r", ARMBuildAttrs::CPUArch::v8_R));
   EXPECT_TRUE(testARMArch("armv8-m.base", "generic", "v8m.base",
@@ -937,6 +940,7 @@ TEST(TargetParserTest, ARMparseArchProfile) {
     case ARM::ArchKind::ARMV9_4A:
     case ARM::ArchKind::ARMV9_5A:
     case ARM::ArchKind::ARMV9_6A:
+    case ARM::ArchKind::ARMV9_7A:
       EXPECT_EQ(ARM::ProfileKind::A, ARM::parseArchProfile(ARMArch[i]));
       break;
     default:
@@ -1294,6 +1298,7 @@ TEST(TargetParserTest, testAArch64Arch) {
   EXPECT_TRUE(testAArch64Arch("armv9.4-a"));
   EXPECT_TRUE(testAArch64Arch("armv9.5-a"));
   EXPECT_TRUE(testAArch64Arch("armv9.6-a"));
+  EXPECT_TRUE(testAArch64Arch("armv9.7-a"));
 }
 
 bool testAArch64Extension(StringRef CPUName, StringRef ArchExt) {
@@ -1438,7 +1443,13 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
       AArch64::AEK_SVEAES,       AArch64::AEK_SME_MOP4,
       AArch64::AEK_SME_TMOP,     AArch64::AEK_SVEBITPERM,
       AArch64::AEK_SSVE_BITPERM, AArch64::AEK_SVESHA3,
-      AArch64::AEK_SVESM4,
+      AArch64::AEK_SVESM4,       AArch64::AEK_CMH,
+      AArch64::AEK_LSCP,         AArch64::AEK_TLBID,
+      AArch64::AEK_MPAMV2,       AArch64::AEK_MTETC,
+      AArch64::AEK_GCIE,         AArch64::AEK_SME2P3,
+      AArch64::AEK_SVE2P3,       AArch64::AEK_SVE_B16MM,
+      AArch64::AEK_F16MM,        AArch64::AEK_F16F32DOT,
+      AArch64::AEK_F16F32MM,
   };
 
   std::vector<StringRef> Features;
@@ -1550,6 +1561,18 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
   EXPECT_TRUE(llvm::is_contained(Features, "+pops"));
   EXPECT_TRUE(llvm::is_contained(Features, "+sme-mop4"));
   EXPECT_TRUE(llvm::is_contained(Features, "+sme-tmop"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+cmh"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+lscp"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+tlbid"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+mpamv2"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+mtetc"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+gcie"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+sme2p3"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+sve2p3"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+sve-b16mm"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+f16mm"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+f16f32dot"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+f16f32mm"));
 
   // Assuming we listed every extension above, this should produce the same
   // result.
@@ -1576,6 +1599,7 @@ TEST(TargetParserTest, AArch64ArchFeatures) {
   EXPECT_EQ(AArch64::ARMV9_4A.ArchFeature, "+v9.4a");
   EXPECT_EQ(AArch64::ARMV9_5A.ArchFeature, "+v9.5a");
   EXPECT_EQ(AArch64::ARMV9_6A.ArchFeature, "+v9.6a");
+  EXPECT_EQ(AArch64::ARMV9_7A.ArchFeature, "+v9.7a");
   EXPECT_EQ(AArch64::ARMV8R.ArchFeature, "+v8r");
 }
 
@@ -1605,7 +1629,8 @@ TEST(TargetParserTest, AArch64ArchPartialOrder) {
 
   for (const auto *A :
        {&AArch64::ARMV9_1A, &AArch64::ARMV9_2A, &AArch64::ARMV9_3A,
-        &AArch64::ARMV9_4A, &AArch64::ARMV9_5A, &AArch64::ARMV9_6A})
+        &AArch64::ARMV9_4A, &AArch64::ARMV9_5A, &AArch64::ARMV9_6A,
+        &AArch64::ARMV9_7A})
     EXPECT_TRUE(A->implies(AArch64::ARMV9A));
 
   EXPECT_TRUE(AArch64::ARMV8_1A.implies(AArch64::ARMV8A));
@@ -1624,6 +1649,7 @@ TEST(TargetParserTest, AArch64ArchPartialOrder) {
   EXPECT_TRUE(AArch64::ARMV9_4A.implies(AArch64::ARMV9_3A));
   EXPECT_TRUE(AArch64::ARMV9_5A.implies(AArch64::ARMV9_4A));
   EXPECT_TRUE(AArch64::ARMV9_6A.implies(AArch64::ARMV9_5A));
+  EXPECT_TRUE(AArch64::ARMV9_7A.implies(AArch64::ARMV9_6A));
 
   EXPECT_TRUE(AArch64::ARMV9A.implies(AArch64::ARMV8_5A));
   EXPECT_TRUE(AArch64::ARMV9_1A.implies(AArch64::ARMV8_6A));
@@ -1713,6 +1739,18 @@ TEST(TargetParserTest, AArch64ArchExtFeature) {
       {"pops", "nopops", "+pops", "-pops"},
       {"sme-mop4", "nosme-mop4", "+sme-mop4", "-sme-mop4"},
       {"sme-tmop", "nosme-tmop", "+sme-tmop", "-sme-tmop"},
+      {"cmh", "nocmh", "+cmh", "-cmh"},
+      {"lscp", "nolscp", "+lscp", "-lscp"},
+      {"tlbid", "notlbid", "+tlbid", "-tlbid"},
+      {"mpamv2", "nompamv2", "+mpamv2", "-mpamv2"},
+      {"mtetc", "nomtetc", "+mtetc", "-mtetc"},
+      {"gcie", "nogcie", "+gcie", "-gcie"},
+      {"sme2p3", "nosme2p3", "+sme2p3", "-sme2p3"},
+      {"sve2p3", "nosve2p3", "+sve2p3", "-sve2p3"},
+      {"sve-b16mm", "nosve-b16mm", "+sve-b16mm", "-sve-b16mm"},
+      {"f16mm", "nof16mm", "+f16mm", "-f16mm"},
+      {"f16f32dot", "nof16f32dot", "+f16f32dot", "-f16f32dot"},
+      {"f16f32mm", "nof16f32mm", "+f16f32mm", "-f16f32mm"},
   };
 
   for (unsigned i = 0; i < std::size(ArchExt); i++) {
@@ -1927,7 +1965,8 @@ AArch64ExtensionDependenciesBaseArchTestParams
         {AArch64::ARMV9_6A, {"nofp", "fprcvt"}, {"fp-armv8", "fprcvt"}, {}},
         {AArch64::ARMV9_6A, {"fprcvt", "nofp"}, {}, {"fp-armv8", "fprcvt"}},
 
-        // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm, faminmax, lut, fp8}
+        // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm, faminmax, lut, fp8,
+        // f16f32dot, f16f32mm}
         {AArch64::ARMV8A, {"nosimd", "aes"}, {"neon", "aes"}, {}},
         {AArch64::ARMV8A, {"aes", "nosimd"}, {}, {"neon", "aes"}},
         {AArch64::ARMV8A, {"nosimd", "sha2"}, {"neon", "sha2"}, {}},
@@ -1946,6 +1985,10 @@ AArch64ExtensionDependenciesBaseArchTestParams
         {AArch64::ARMV9_6A, {"nosimd", "lut"}, {"neon", "lut"}, {}},
         {AArch64::ARMV9_6A, {"fp8", "nosimd"}, {}, {"neon", "fp8"}},
         {AArch64::ARMV9_6A, {"nosimd", "fp8"}, {"neon", "fp8"}, {}},
+        {AArch64::ARMV9_7A, {"nosimd", "f16f32mm"}, {"neon", "f16f32mm"}, {}},
+        {AArch64::ARMV9_7A, {"f16f32mm", "nosimd"}, {}, {"neon", "f16f32mm"}},
+        {AArch64::ARMV9_7A, {"nosimd", "f16f32dot"}, {"neon", "f16f32dot"}, {}},
+        {AArch64::ARMV9_7A, {"f16f32dot", "nosimd"}, {}, {"neon", "f16f32dot"}},
 
         // fp8 -> {fp8dot4, fp8dot2}
         {AArch64::ARMV9_6A, {"nofp8", "fp8dot4"}, {"fp8", "fp8dot4"}, {}},
@@ -1961,17 +2004,35 @@ AArch64ExtensionDependenciesBaseArchTestParams
         {AArch64::ARMV8A, {"nosimd", "fcma"}, {"neon", "complxnum"}, {}},
         {AArch64::ARMV8A, {"fcma", "nosimd"}, {}, {"neon", "complxnum"}},
 
-        // fp16 -> {fp16fml, sve}
+        // fp16 -> {fp16fml, sve, f16f32dot, f16f32mm, f16mm}
         {AArch64::ARMV8A, {"nofp16", "fp16fml"}, {"fullfp16", "fp16fml"}, {}},
         {AArch64::ARMV8A, {"fp16fml", "nofp16"}, {}, {"fullfp16", "fp16fml"}},
         {AArch64::ARMV8A, {"nofp16", "sve"}, {"fullfp16", "sve"}, {}},
         {AArch64::ARMV8A, {"sve", "nofp16"}, {}, {"fullfp16", "sve"}},
+        {AArch64::ARMV9_7A, {"nofp16", "f16mm"}, {"fullfp16", "f16mm"}, {}},
+        {AArch64::ARMV9_7A, {"f16mm", "nofp16"}, {}, {"fullfp16", "f16mm"}},
+        {AArch64::ARMV9_7A,
+         {"nofp16", "f16f32mm"},
+         {"fullfp16", "f16f32mm"},
+         {}},
+        {AArch64::ARMV9_7A,
+         {"f16f32mm", "nofp16"},
+         {},
+         {"fullfp16", "f16f32mm"}},
+        {AArch64::ARMV9_7A,
+         {"nofp16", "f16f32dot"},
+         {"fullfp16", "f16f32dot"},
+         {}},
+        {AArch64::ARMV9_7A,
+         {"f16f32dot", "nofp16"},
+         {},
+         {"fullfp16", "f16f32dot"}},
 
         // bf16 -> {sme}
         {AArch64::ARMV8A, {"nobf16", "sme"}, {"bf16", "sme"}, {}},
         {AArch64::ARMV8A, {"sme", "nobf16"}, {}, {"bf16", "sme"}},
 
-        // sve -> {sve2, f32mm, f64mm, sve-f16f32mm}
+        // sve -> {sve2, f32mm, f64mm, sve-f16f32mm, sve-b16mm}
         {AArch64::ARMV8A, {"nosve", "sve2"}, {"sve", "sve2"}, {}},
         {AArch64::ARMV8A, {"sve2", "nosve"}, {}, {"sve", "sve2"}},
         {AArch64::ARMV8A, {"nosve", "f32mm"}, {"sve", "f32mm"}, {}},
@@ -1986,6 +2047,8 @@ AArch64ExtensionDependenciesBaseArchTestParams
          {"sve-f16f32mm", "nosve"},
          {},
          {"sve", "sve-f16f32mm"}},
+        {AArch64::ARMV9_7A, {"nosve", "sve-b16mm"}, {"sve", "sve-b16mm"}, {}},
+        {AArch64::ARMV9_7A, {"sve-b16mm", "nosve"}, {}, {"sve", "sve-b16mm"}},
 
         // aes -> {sve-aes}
         {AArch64::ARMV8A, {"noaes", "sve-aes"}, {"aes", "sve-aes"}, {}},
@@ -2031,6 +2094,10 @@ AArch64ExtensionDependenciesBaseArchTestParams
         {AArch64::ARMV9_6A, {"nosve2p1", "sve2p2"}, {"sve2p1", "sve2p2"}, {}},
         {AArch64::ARMV9_6A, {"sve2p2", "nosve2p1"}, {}, {"sve2p1", "sve2p2"}},
 
+        // sve2p2 -> {sve2p3}
+        {AArch64::ARMV9_7A, {"nosve2p2", "sve2p3"}, {"sve2p2", "sve2p3"}, {}},
+        {AArch64::ARMV9_7A, {"sve2p3", "nosve2p2"}, {}, {"sve2p2", "sve2p3"}},
+
         // sme -> {sme2, sme-f16f16, sme-f64f64, sme-i16i64, sme-fa64}
         {AArch64::ARMV8A, {"nosme", "sme2"}, {"sme", "sme2"}, {}},
         {AArch64::ARMV8A, {"sme2", "nosme"}, {}, {"sme", "sme2"}},
@@ -2084,6 +2151,10 @@ AArch64ExtensionDependenciesBaseArchTestParams
         {AArch64::ARMV9_6A, {"nosme2p1", "sme2p2"}, {"sme2p2", "sme2p1"}, {}},
         {AArch64::ARMV9_6A, {"sme2p2", "nosme2p1"}, {}, {"sme2p1", "sme2p2"}},
 
+        // sme2p2 -> {sme2p3}
+        {AArch64::ARMV9_7A, {"nosme2p2", "sme2p3"}, {"sme2p3", "sme2p2"}, {}},
+        {AArch64::ARMV9_7A, {"sme2p3", "nosme2p2"}, {}, {"sme2p2", "sme2p3"}},
+
         // fp8 -> {sme-f8f16, sme-f8f32, f8f16mm, f8f32mm, fp8dot4, fp8dot2,
         // ssve-fp8dot4, ssve-fp8dot2}
         {AArch64::ARMV8A, {"nofp8", "sme-f8f16"}, {"fp8", "sme-f8f16"}, {}},
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
index 39dacf7..4446702 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
@@ -122,6 +122,7 @@ static_library("CodeGen") {
     "MLRegAllocPriorityAdvisor.cpp",
     "MachineBasicBlock.cpp",
     "MachineBlockFrequencyInfo.cpp",
+    "MachineBlockHashInfo.cpp",
     "MachineBlockPlacement.cpp",
     "MachineBranchProbabilityInfo.cpp",
     "MachineCFGPrinter.cpp",
diff --git a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
index 80ac4c6..dba9e2c 100644
--- a/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
+++ b/llvm/utils/mlgo-utils/IR2Vec/generateTriplets.py
@@ -1,14 +1,19 @@
 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""IR2Vec Triplet Generator
+"""IR2Vec/MIR2Vec Triplet Generator
 
-Generates IR2Vec triplets by applying random optimization levels to LLVM IR files
-and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
-files: entity2id.txt, relation2id.txt, and train2id.txt.
+Generates IR2Vec or MIR2Vec triplets by applying random optimization levels to
+LLVM IR files (or processing MIR files) and extracting triplets using llvm-ir2vec.
+Automatically generates preprocessed files (entity2id.txt, relation2id.txt, and
+train2id.txt) necessary for training IR2Vec or MIR2Vec vocabularies.
 
 Usage:
-    python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+    For LLVM IR:
+        python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+
+    For Machine IR:
+        python generateTriplets.py --mode=mir <llvm_build_dir> <mir_file_list> <output_dir>
 """
 
 import argparse
@@ -41,7 +46,7 @@ class TripletResult:
 
 
 class IR2VecTripletGenerator:
-    """Main class for generating IR2Vec triplets"""
+    """Main class for generating IR2Vec or MIR2Vec triplets"""
 
     def __init__(
         self,
@@ -49,11 +54,13 @@ class IR2VecTripletGenerator:
         num_optimizations: int,
         output_dir: Path,
         max_workers: int = DEFAULT_MAX_WORKERS,
+        mode: str = "llvm",
     ):
         self.llvm_build_dir = llvm_build_dir
         self.num_optimizations = num_optimizations
         self.output_dir = output_dir
         self.max_workers = max_workers
+        self.mode = mode  # "llvm" or "mir"
 
         # Tool paths
         self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
@@ -85,7 +92,11 @@ class IR2VecTripletGenerator:
                 f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}"
             )
 
-        if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+        if self.mode not in ["llvm", "mir"]:
+            raise ValueError(f"Mode must be 'llvm' or 'mir', got: {self.mode}")
+
+        # For LLVM IR mode, validate optimization count
+        if self.mode == "llvm" and not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
             raise ValueError(
                 f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
             )
@@ -95,19 +106,28 @@ class IR2VecTripletGenerator:
         return random.sample(OPT_LEVELS, self.num_optimizations)
 
     def _process_single_file(self, input_file: Path) -> TripletResult:
-        """Process a single LLVM IR file with multiple optimization levels"""
+        """Process a single LLVM IR or MIR file"""
         all_triplets = set()
         max_relation = 1
-        opt_levels = self._select_optimization_levels()
 
-        for opt_level in opt_levels:
-            triplets, file_max_relation = self._run_pipeline(input_file, opt_level)
+        if self.mode == "mir":
+            # For MIR files, process directly without optimization
+            triplets, file_max_relation = self._run_mir_pipeline(input_file)
             if triplets:
                 all_triplets.update(triplets)
                 max_relation = max(max_relation, file_max_relation)
-                logger.debug(
-                    f"Generated {len(triplets)} triplets for {input_file} with {opt_level}"
-                )
+                logger.debug(f"Generated {len(triplets)} triplets for {input_file}")
+        else:
+            # For LLVM IR files, apply multiple optimization levels
+            opt_levels = self._select_optimization_levels()
+            for opt_level in opt_levels:
+                triplets, file_max_relation = self._run_pipeline(input_file, opt_level)
+                if triplets:
+                    all_triplets.update(triplets)
+                    max_relation = max(max_relation, file_max_relation)
+                    logger.debug(
+                        f"Generated {len(triplets)} triplets for {input_file} with {opt_level}"
+                    )
 
         return TripletResult(all_triplets, max_relation)
 
@@ -124,7 +144,7 @@ class IR2VecTripletGenerator:
 
             # Run llvm-ir2vec with opt's output as input
             ir2vec_proc = subprocess.Popen(
-                [self.ir2vec_binary, "triplets", "-", "-o", "-"],
+                [self.ir2vec_binary, "triplets", "--mode=llvm", "-", "-o", "-"],
                 stdin=opt_proc.stdout,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
@@ -143,6 +163,32 @@ class IR2VecTripletGenerator:
         except (subprocess.SubprocessError, OSError):
             return set(), 1
 
+    def _run_mir_pipeline(self, input_file: Path) -> Tuple[Set[str], int]:
+        """Run llvm-ir2vec pipeline for MIR files."""
+        try:
+            # Run llvm-ir2vec directly on MIR file
+            result = subprocess.run(
+                [
+                    self.ir2vec_binary,
+                    "triplets",
+                    "--mode=mir",
+                    str(input_file),
+                    "-o",
+                    "-",
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=False,
+            )
+
+            if result.returncode != 0:
+                return set(), 1
+
+            return self._parse_triplet_output(result.stdout)
+        except (subprocess.SubprocessError, OSError):
+            return set(), 1
+
     def _parse_triplet_output(self, output: str) -> Tuple[Set[str], int]:
         """Parse triplet output and extract max relation"""
         if not output.strip():
@@ -160,12 +206,21 @@ class IR2VecTripletGenerator:
         return set(lines), max_relation
 
     def generate_triplets(self, file_list: Path) -> None:
-        """Main method to generate triplets from a list of LLVM IR files"""
+        """Main method to generate triplets from a list of LLVM IR or MIR files"""
+        # Store file_list_path for later use in entity generation
+        self.file_list_path = file_list
+
         input_files = self._read_file_list(file_list)
-        logger.info(
-            f"Processing {len(input_files)} files with {self.num_optimizations} "
-            f"optimization levels using {self.max_workers} workers"
-        )
+
+        if self.mode == "mir":
+            logger.info(
+                f"Processing {len(input_files)} MIR files using {self.max_workers} workers"
+            )
+        else:
+            logger.info(
+                f"Processing {len(input_files)} files with {self.num_optimizations} "
+                f"optimization levels using {self.max_workers} workers"
+            )
 
         all_triplets = set()
         global_max_relation = 1
@@ -222,28 +277,60 @@ class IR2VecTripletGenerator:
 
     def _generate_entity2id(self, output_file: Path) -> None:
         """Generate entity2id.txt using llvm-ir2vec"""
-        subprocess.run(
-            [str(self.ir2vec_binary), "entities", "-o", str(output_file)],
-            check=True,
-            capture_output=True,
-        )
+        if self.mode == "mir":
+            # For MIR mode, we need to provide a sample MIR file to determine target
+            # Use the first file from the processed list
+            input_files = self._read_file_list(self.file_list_path)
+            if not input_files:
+                raise ValueError("No input files available for entity generation")
+
+            subprocess.run(
+                [
+                    str(self.ir2vec_binary),
+                    "entities",
+                    "--mode=mir",
+                    str(input_files[0]),
+                    "-o",
+                    str(output_file),
+                ],
+                check=True,
+                capture_output=True,
+            )
+        else:
+            subprocess.run(
+                [
+                    str(self.ir2vec_binary),
+                    "entities",
+                    "--mode=llvm",
+                    "-o",
+                    str(output_file),
+                ],
+                check=True,
+                capture_output=True,
+            )
 
     def _generate_relation2id(self, output_file: Path, max_relation: int) -> None:
         """Generate relation2id.txt from max relation"""
-        max_relation = max(max_relation, 1)  # At least Type and Next relations
+        max_relation = max(max_relation, 1)  # At least Next relation
         num_relations = max_relation + 1
 
         with open(output_file, "w") as f:
             f.write(f"{num_relations}\n")
-            f.write("Type\t0\n")
-            f.write("Next\t1\n")
-            f.writelines(f"Arg{i-2}\t{i}\n" for i in range(2, num_relations))
+            if self.mode == "llvm":
+                # LLVM IR has Type relation at 0
+                f.write("Type\t0\n")
+                f.write("Next\t1\n")
+                f.writelines(f"Arg{i-2}\t{i}\n" for i in range(2, num_relations))
+            else:
+                # MIR doesn't have Type relation, starts with Next at 0
+                f.write("Next\t0\n")
+                f.writelines(f"Arg{i-1}\t{i}\n" for i in range(1, num_relations))
 
 
 def main():
     """Main entry point"""
     parser = argparse.ArgumentParser(
-        description="Generate IR2Vec triplets from LLVM IR files",
+        description="Generate IR2Vec or MIR2Vec triplets from LLVM IR or Machine IR files",
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
 
@@ -253,17 +340,26 @@ def main():
     parser.add_argument(
         "num_optimizations",
         type=int,
-        help="Number of optimization levels to apply (1-6)",
+        nargs="?",
+        default=1,
+        help="Number of optimization levels to apply (1-6) for LLVM IR mode",
     )
     parser.add_argument(
-        "ll_file_list",
+        "input_file_list",
         type=Path,
-        help="File containing list of LLVM IR files to process",
+        help="File containing list of LLVM IR or MIR files to process",
     )
     parser.add_argument(
         "output_dir", type=Path, help="Output directory for generated files"
     )
     parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["llvm", "mir"],
+        default="llvm",
+        help="Operation mode: 'llvm' for LLVM IR (default) or 'mir' for Machine IR",
+    )
+    parser.add_argument(
         "-j",
         "--max-workers",
         type=int,
@@ -296,8 +392,9 @@ def main():
         args.num_optimizations,
         args.output_dir,
         args.max_workers,
+        args.mode,
     )
-    generator.generate_triplets(args.ll_file_list)
+    generator.generate_triplets(args.input_file_list)
 
 
 if __name__ == "__main__":
diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
index b56e778..b88fbaa 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
@@ -260,9 +260,9 @@ static std::string getDecorationName(StringRef attrName) {
 }
 
 template <typename AttrTy, typename EmitF>
-LogicalResult processDecorationList(Location loc, Decoration decoration,
-                                    Attribute attrList, StringRef attrName,
-                                    EmitF emitter) {
+static LogicalResult processDecorationList(Location loc, Decoration decoration,
+                                           Attribute attrList,
+                                           StringRef attrName, EmitF emitter) {
   auto arrayAttr = dyn_cast<ArrayAttr>(attrList);
   if (!arrayAttr) {
     return emitError(loc, "expecting array attribute of ")